From 10742fee98eb3b3e8453ef27a33dee314b15f7bd Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Jul 2019 14:07:54 +0200 Subject: [PATCH 0001/2880] eeprom: at24: remove unneeded include We used to have a call to ilog2() in AT24_DEVICE_MAGIC(). That's long gone so this header is no longer needed. Signed-off-by: Bartosz Golaszewski Acked-by: Wolfram Sang --- drivers/misc/eeprom/at24.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c index 35bf2477693d..fff5b340580c 100644 --- a/drivers/misc/eeprom/at24.c +++ b/drivers/misc/eeprom/at24.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include From 4f77e0956bd99901af2bfc6641437ff8bff8d4a4 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Mon, 15 Jul 2019 14:34:16 -0600 Subject: [PATCH 0002/2880] PCI: Remove pci_block_cfg_access() et al (unused) Remove the following unused functions from include/linux/pci.h: pci_block_cfg_access() pci_block_cfg_access_in_atomic() pci_unblock_cfg_access() These were added by fb51ccbf217c ("PCI: Rework config space blocking services"), though no callers were added. Link: https://lore.kernel.org/r/20190715203416.37547-1-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Bulwahn --- include/linux/pci.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..61b64fa1b0b1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1747,11 +1747,6 @@ static inline void pci_release_regions(struct pci_dev *dev) { } static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; } -static inline void pci_block_cfg_access(struct pci_dev *dev) { } -static inline int pci_block_cfg_access_in_atomic(struct pci_dev *dev) -{ return 0; } -static inline void pci_unblock_cfg_access(struct pci_dev *dev) { } - static inline struct pci_bus *pci_find_next_bus(const struct pci_bus *from) { return NULL; } static inline struct pci_dev *pci_get_slot(struct pci_bus *bus, From fae6b93b19b42fcae0ef71e669b643f3366f6824 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 17 Jul 2019 12:23:53 -0600 Subject: [PATCH 0003/2880] PCI: Unexport pci_bus_get() and pci_bus_put() pci_bus_get() and pci_bus_put() are not used by a loadable kernel module and do not need to be exported. These were exported by fe830ef62ac6 ("PCI: Introduce pci_bus_{get|put}() to manage PCI bus reference count"), but there are no loadable modules in the tree that use them. Link: https://lore.kernel.org/r/20190717182353.45557-1-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Acked-by: Greg Kroah-Hartman --- drivers/pci/bus.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 495059d923f7..8e40b3e6da77 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -417,11 +417,9 @@ struct pci_bus *pci_bus_get(struct pci_bus *bus) get_device(&bus->dev); return bus; } -EXPORT_SYMBOL(pci_bus_get); void pci_bus_put(struct pci_bus *bus) { if (bus) put_device(&bus->dev); } -EXPORT_SYMBOL(pci_bus_put); From 70a658073726ae79cf9747b651f577d1fdf45c3a Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 17 Jul 2019 21:29:52 -0600 Subject: [PATCH 0004/2880] PCI: Unexport pci_bus_sem pci_bus_sem is not used by a loadable kernel module and does not need to be exported. It was exported by ce29ca3ea407 ("PCI: acpiphp: remove all functions in slot, even without ACPI _EJx"), which added a use of pci_bus_sem in acpiphp, which could be built as a module at that time. But since 6037a803b05e ("PCI: acpiphp: Convert acpiphp to be builtin only, not modular"), it can no longer be built as a module. Link: https://lore.kernel.org/r/20190718032951.40188-1-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/search.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 7f4e65872b8d..bade14002fd8 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -15,7 +15,6 @@ #include "pci.h" DECLARE_RWSEM(pci_bus_sem); -EXPORT_SYMBOL_GPL(pci_bus_sem); /* * pci_for_each_dma_alias - Iterate over DMA aliases for a device From a18670f4617d41829ce88ab3e47bbc406e4dc5e8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 21 Jul 2019 14:55:47 +0200 Subject: [PATCH 0005/2880] watchdog: ath79_wdt: fix a typo in the name of a function It is likely that 'ath97_wdt_shutdown()' should be 'ath79_wdt_shutdown()' Signed-off-by: Christophe JAILLET Reviewed-by: Guenter Roeck Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ath79_wdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/ath79_wdt.c b/drivers/watchdog/ath79_wdt.c index 2e09981fe978..75de664ef4b0 100644 --- a/drivers/watchdog/ath79_wdt.c +++ b/drivers/watchdog/ath79_wdt.c @@ -302,7 +302,7 @@ static int ath79_wdt_remove(struct platform_device *pdev) return 0; } -static void ath97_wdt_shutdown(struct platform_device *pdev) +static void ath79_wdt_shutdown(struct platform_device *pdev) { ath79_wdt_disable(); } @@ -318,7 +318,7 @@ MODULE_DEVICE_TABLE(of, ath79_wdt_match); static struct platform_driver ath79_wdt_driver = { .probe = ath79_wdt_probe, .remove = ath79_wdt_remove, - .shutdown = ath97_wdt_shutdown, + .shutdown = ath79_wdt_shutdown, .driver = { .name = DRIVER_NAME, .of_match_table = of_match_ptr(ath79_wdt_match), From befa45fb5bdd514a40a19e659491193f7fd022f2 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Mon, 8 Jul 2019 20:33:54 +0800 Subject: [PATCH 0006/2880] PCI: Use devm_add_action_or_reset() devm_add_action_or_reset() is a helper function which internally calls devm_add_action(). If the devm_add_action() fails, it will execute the action mentioned and return the error code. Use devm_add_action_or_reset() to reduce source code size (avoid writing the action twice) and reduce the likelihood of bugs. Link: https://lore.kernel.org/r/20190708123354.12127-1-huangfq.daxian@gmail.com Signed-off-by: Fuqian Huang Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-host-common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index c742881b5061..c8cb9c5188a4 100644 --- a/drivers/pci/controller/pci-host-common.c +++ b/drivers/pci/controller/pci-host-common.c @@ -43,9 +43,8 @@ static struct pci_config_window *gen_pci_init(struct device *dev, goto err_out; } - err = devm_add_action(dev, gen_pci_unmap_cfg, cfg); + err = devm_add_action_or_reset(dev, gen_pci_unmap_cfg, cfg); if (err) { - gen_pci_unmap_cfg(cfg); goto err_out; } return cfg; From 197df18f7038178ef9c5806db1eff8e494381fa6 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Tue, 9 Jul 2019 23:01:32 +0530 Subject: [PATCH 0007/2880] mfd: max77620: Add of_node_put() before return Each iteration of for_each_child_of_node puts the previous node, but in the case of a return from the middle of the loop, there is no put, thus causing a memory leak. Hence add an of_node_put before the return. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Signed-off-by: Lee Jones --- drivers/mfd/max77620.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/max77620.c b/drivers/mfd/max77620.c index 0c28965fcc6a..a851ff473a44 100644 --- a/drivers/mfd/max77620.c +++ b/drivers/mfd/max77620.c @@ -416,8 +416,10 @@ static int max77620_initialise_fps(struct max77620_chip *chip) for_each_child_of_node(fps_np, fps_child) { ret = max77620_config_fps(chip, fps_child); - if (ret < 0) + if (ret < 0) { + of_node_put(fps_child); return ret; + } } config = chip->enable_global_lpm ? MAX77620_ONOFFCNFG2_SLP_LPM_MSK : 0; From b5e29aa880be84c271be8d0726cec4018bfbfd74 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Jul 2019 13:47:13 +0200 Subject: [PATCH 0008/2880] mfd: davinci_voicecodec: Remove pointless #include When building davinci as multiplatform, we get a build error in this file: drivers/mfd/davinci_voicecodec.c:22:10: fatal error: 'mach/hardware.h' file not found The header is only used to access the io_v2p() macro, but the result is already known because that comes from the resource a little bit earlier. Signed-off-by: Arnd Bergmann Acked-by: Sekhar Nori Signed-off-by: Lee Jones --- drivers/mfd/davinci_voicecodec.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/davinci_voicecodec.c b/drivers/mfd/davinci_voicecodec.c index 13ca7203e193..e5c8bc998eb4 100644 --- a/drivers/mfd/davinci_voicecodec.c +++ b/drivers/mfd/davinci_voicecodec.c @@ -19,7 +19,6 @@ #include #include -#include static const struct regmap_config davinci_vc_regmap = { .reg_bits = 32, @@ -31,6 +30,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) struct davinci_vc *davinci_vc; struct resource *res; struct mfd_cell *cell = NULL; + dma_addr_t fifo_base; int ret; davinci_vc = devm_kzalloc(&pdev->dev, @@ -48,6 +48,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + fifo_base = (dma_addr_t)res->start; davinci_vc->base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(davinci_vc->base)) { ret = PTR_ERR(davinci_vc->base); @@ -70,8 +71,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) } davinci_vc->davinci_vcif.dma_tx_channel = res->start; - davinci_vc->davinci_vcif.dma_tx_addr = - (dma_addr_t)(io_v2p(davinci_vc->base) + DAVINCI_VC_WFIFO); + davinci_vc->davinci_vcif.dma_tx_addr = fifo_base + DAVINCI_VC_WFIFO; res = platform_get_resource(pdev, IORESOURCE_DMA, 1); if (!res) { @@ -81,8 +81,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) } davinci_vc->davinci_vcif.dma_rx_channel = res->start; - davinci_vc->davinci_vcif.dma_rx_addr = - (dma_addr_t)(io_v2p(davinci_vc->base) + DAVINCI_VC_RFIFO); + davinci_vc->davinci_vcif.dma_rx_addr = fifo_base + DAVINCI_VC_RFIFO; davinci_vc->dev = &pdev->dev; davinci_vc->pdev = pdev; From c776dd50196a07a0ef943fcb74e8c86f9a5a20a8 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:38 -0600 Subject: [PATCH 0009/2880] PCI: Make PCI_PM_* delay times private These delay time definitions: #define PCI_PM_D2_DELAY 200 #define PCI_PM_D3_WAIT 10 #define PCI_PM_D3COLD_WAIT 100 #define PCI_PM_BUS_WAIT 50 are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-2-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 5 +++++ include/linux/pci.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1be03a97cb92..708413632429 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -39,6 +39,11 @@ int pci_probe_reset_function(struct pci_dev *dev); int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); +#define PCI_PM_D2_DELAY 200 +#define PCI_PM_D3_WAIT 10 +#define PCI_PM_D3COLD_WAIT 100 +#define PCI_PM_BUS_WAIT 50 + /** * struct pci_platform_pm_ops - Firmware PM callbacks * diff --git a/include/linux/pci.h b/include/linux/pci.h index 61b64fa1b0b1..9c2c6e161cfd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -145,11 +145,6 @@ static inline const char *pci_power_name(pci_power_t state) return pci_power_names[1 + (__force int) state]; } -#define PCI_PM_D2_DELAY 200 -#define PCI_PM_D3_WAIT 10 -#define PCI_PM_D3COLD_WAIT 100 -#define PCI_PM_BUS_WAIT 50 - /** * typedef pci_channel_state_t * From 669696ebbccc40e8096a2ee9809c028fc1538001 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:39 -0600 Subject: [PATCH 0010/2880] PCI: Make pci_check_pme_status(), pci_pme_wakeup_bus() private These interfaces: bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-3-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 ++ include/linux/pci.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 708413632429..ad1fe54ab8ee 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -89,6 +89,8 @@ void pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); +bool pci_check_pme_status(struct pci_dev *dev); +void pci_pme_wakeup_bus(struct pci_bus *bus); int __pci_pme_wakeup(struct pci_dev *dev, void *ign); void pci_pme_restore(struct pci_dev *dev); bool pci_dev_need_resume(struct pci_dev *dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 9c2c6e161cfd..b13fb829171e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1236,8 +1236,6 @@ int pci_wake_from_d3(struct pci_dev *dev, bool enable); int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); -bool pci_check_pme_status(struct pci_dev *dev); -void pci_pme_wakeup_bus(struct pci_bus *bus); void pci_d3cold_enable(struct pci_dev *dev); void pci_d3cold_disable(struct pci_dev *dev); bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); From 975e1ac173058b8710e5979e97fc1397233301f3 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:40 -0600 Subject: [PATCH 0011/2880] PCI: Make pci_get_host_bridge_device(), pci_put_host_bridge_device() private These interfaces: struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-4-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 3 +++ include/linux/pci.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ad1fe54ab8ee..b06c750e08d7 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -243,6 +243,9 @@ enum pci_bar_type { pci_bar_mem64, /* A 64-bit memory BAR */ }; +struct device *pci_get_host_bridge_device(struct pci_dev *dev); +void pci_put_host_bridge_device(struct device *dev); + int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); diff --git a/include/linux/pci.h b/include/linux/pci.h index b13fb829171e..ffb904e7895d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -644,9 +644,6 @@ static inline struct pci_dev *pci_upstream_bridge(struct pci_dev *dev) return dev->bus->self; } -struct device *pci_get_host_bridge_device(struct pci_dev *dev); -void pci_put_host_bridge_device(struct device *dev); - #ifdef CONFIG_PCI_MSI static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev) { From 440589dd1068fb36b2fe10ccddf97e43267e3b8d Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:41 -0600 Subject: [PATCH 0012/2880] PCI: Make pci_save_vc_state(), pci_restore_vc_state(), etc private These Virtual Channel interfaces: int pci_save_vc_state(struct pci_dev *dev); void pci_restore_vc_state(struct pci_dev *dev); void pci_allocate_vc_save_buffers(struct pci_dev *dev); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-5-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 5 +++++ include/linux/pci.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b06c750e08d7..19cda6e6fccd 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -130,6 +130,11 @@ void pci_vpd_release(struct pci_dev *dev); void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev); void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev); +/* PCI Virtual Channel */ +int pci_save_vc_state(struct pci_dev *dev); +void pci_restore_vc_state(struct pci_dev *dev); +void pci_allocate_vc_save_buffers(struct pci_dev *dev); + /* PCI /proc functions */ #ifdef CONFIG_PROC_FS int pci_proc_attach_device(struct pci_dev *dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index ffb904e7895d..3ff458f89190 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1239,11 +1239,6 @@ bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); void pci_wakeup_bus(struct pci_bus *bus); void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state); -/* PCI Virtual Channel */ -int pci_save_vc_state(struct pci_dev *dev); -void pci_restore_vc_state(struct pci_dev *dev); -void pci_allocate_vc_save_buffers(struct pci_dev *dev); - /* For use by arch with custom probe code */ void set_pcie_port_type(struct pci_dev *pdev); void set_pcie_hotplug_bridge(struct pci_dev *pdev); From 003d3b2c5f835d897b3b10a13a9e1340630e93f1 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:42 -0600 Subject: [PATCH 0013/2880] PCI: Make pci_hotplug_io_size, mem_size, and bus_size private These symbols: extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mem_size; extern unsigned long pci_hotplug_bus_size; are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-6-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 3 +++ include/linux/pci.h | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 19cda6e6fccd..9698fa805e28 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -208,6 +208,9 @@ extern const struct attribute_group *pcibus_groups[]; extern const struct device_type pci_dev_type; extern const struct attribute_group *pci_bus_groups[]; +extern unsigned long pci_hotplug_io_size; +extern unsigned long pci_hotplug_mem_size; +extern unsigned long pci_hotplug_bus_size; /** * pci_match_one_device - Tell if a PCI device structure has a matching diff --git a/include/linux/pci.h b/include/linux/pci.h index 3ff458f89190..b5592b9b7f38 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2010,10 +2010,6 @@ extern unsigned long pci_cardbus_mem_size; extern u8 pci_dfl_cache_line_size; extern u8 pci_cache_line_size; -extern unsigned long pci_hotplug_io_size; -extern unsigned long pci_hotplug_mem_size; -extern unsigned long pci_hotplug_bus_size; - /* Architecture-specific versions may override these (weak) */ void pcibios_disable_device(struct pci_dev *dev); void pcibios_set_master(struct pci_dev *dev); From ecd29c1a38af4ba6e61382591dfe93f06f14ba25 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:43 -0600 Subject: [PATCH 0014/2880] PCI: Make pci_bus_get(), pci_bus_put() private These interfaces: struct pci_bus *pci_bus_get(struct pci_bus *bus); void pci_bus_put(struct pci_bus *bus); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-7-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 ++ include/linux/pci.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9698fa805e28..81e94c9f1c12 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -274,6 +274,8 @@ bool pci_bus_clip_resource(struct pci_dev *dev, int idx); void pci_reassigndev_resource_alignment(struct pci_dev *dev); void pci_disable_bridge_window(struct pci_dev *dev); +struct pci_bus *pci_bus_get(struct pci_bus *bus); +void pci_bus_put(struct pci_bus *bus); /* PCIe link information */ #define PCIE_SPEED2STR(speed) \ diff --git a/include/linux/pci.h b/include/linux/pci.h index b5592b9b7f38..4d1b6f07f8f1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1282,8 +1282,6 @@ int pci_request_selected_regions_exclusive(struct pci_dev *, int, const char *); void pci_release_selected_regions(struct pci_dev *, int); /* drivers/pci/bus.c */ -struct pci_bus *pci_bus_get(struct pci_bus *bus); -void pci_bus_put(struct pci_bus *bus); void pci_add_resource(struct list_head *resources, struct resource *res); void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset); From 5da78d95785daa9454336a88b2f86a8998f6c739 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:44 -0600 Subject: [PATCH 0015/2880] PCI: Make pcie_update_link_speed() private This interface: void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); is only used in drivers/pci/ and does not need to be seen by the rest of the kernel. Move it to drivers/pci/pci.h so it's private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-8-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 1 + include/linux/pci.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 81e94c9f1c12..8459663358c5 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -299,6 +299,7 @@ u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void __pcie_print_link_status(struct pci_dev *dev, bool verbose); void pcie_report_downtraining(struct pci_dev *dev); +void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); /* Single Root I/O Virtualization */ struct pci_sriov { diff --git a/include/linux/pci.h b/include/linux/pci.h index 4d1b6f07f8f1..8301352a937f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -987,7 +987,6 @@ struct pci_bus *pci_scan_root_bus(struct device *parent, int bus, int pci_scan_root_bus_bridge(struct pci_host_bridge *bridge); struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr); -void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr, const char *name, struct hotplug_slot *hotplug); From b92b512a435da01b52de07e3dcc2f07a4ad404de Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:45 -0600 Subject: [PATCH 0016/2880] PCI: Make pci_ats_init() private This interface: void pci_ats_init(struct pci_dev *dev); is only used in drivers/pci/ and does not need to be seen by the rest of the kernel. Move it to drivers/pci/pci.h so it's private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-9-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 7 ++++--- include/linux/pci.h | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8459663358c5..2bb59afb540e 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -439,11 +439,12 @@ static inline void pci_restore_dpc_state(struct pci_dev *dev) {} #endif #ifdef CONFIG_PCI_ATS +/* Address Translation Service */ +void pci_ats_init(struct pci_dev *dev); void pci_restore_ats_state(struct pci_dev *dev); #else -static inline void pci_restore_ats_state(struct pci_dev *dev) -{ -} +static inline void pci_ats_init(struct pci_dev *d) { } +static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ #ifdef CONFIG_PCI_IOV diff --git a/include/linux/pci.h b/include/linux/pci.h index 8301352a937f..9b8d0c2a1c9a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1761,13 +1761,11 @@ static inline bool pci_ats_disabled(void) { return true; } #ifdef CONFIG_PCI_ATS /* Address Translation Service */ -void pci_ats_init(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); #else -static inline void pci_ats_init(struct pci_dev *d) { } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; } From 72bde9ced373ca1e27332fd020d248151319a3d6 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:46 -0600 Subject: [PATCH 0017/2880] PCI: Make pcie_set_ecrc_checking(), pcie_ecrc_get_policy() private These interfaces: void pcie_set_ecrc_checking(struct pci_dev *dev); void pcie_ecrc_get_policy(char *str); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-10-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 8 ++++++++ include/linux/pci.h | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2bb59afb540e..d3d006c95041 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -541,6 +541,14 @@ static inline void pcie_aspm_create_sysfs_dev_files(struct pci_dev *pdev) { } static inline void pcie_aspm_remove_sysfs_dev_files(struct pci_dev *pdev) { } #endif +#ifdef CONFIG_PCIE_ECRC +void pcie_set_ecrc_checking(struct pci_dev *dev); +void pcie_ecrc_get_policy(char *str); +#else +static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } +static inline void pcie_ecrc_get_policy(char *str) { } +#endif + #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); #else diff --git a/include/linux/pci.h b/include/linux/pci.h index 9b8d0c2a1c9a..65502c564661 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1559,14 +1559,6 @@ bool pci_aer_available(void); static inline bool pci_aer_available(void) { return false; } #endif -#ifdef CONFIG_PCIE_ECRC -void pcie_set_ecrc_checking(struct pci_dev *dev); -void pcie_ecrc_get_policy(char *str); -#else -static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } -static inline void pcie_ecrc_get_policy(char *str) { } -#endif - bool pci_ats_disabled(void); #ifdef CONFIG_PCIE_PTM From ac6c26da29c12fa511c877c273ed5c939dc9e96c Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:47 -0600 Subject: [PATCH 0018/2880] PCI: Make pci_enable_ptm() private This interface: int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); is only used in drivers/pci/ and does not need to be seen by the rest of the kernel. Move it to drivers/pci/pci.h so it's private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-11-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 3 +++ include/linux/pci.h | 7 ------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d3d006c95041..8935fc1ff446 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -551,8 +551,11 @@ static inline void pcie_ecrc_get_policy(char *str) { } #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); +int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); #else static inline void pci_ptm_init(struct pci_dev *dev) { } +static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +{ return -EINVAL; } #endif struct pci_dev_reset_methods { diff --git a/include/linux/pci.h b/include/linux/pci.h index 65502c564661..ef73b0c87f01 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1561,13 +1561,6 @@ static inline bool pci_aer_available(void) { return false; } bool pci_ats_disabled(void); -#ifdef CONFIG_PCIE_PTM -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); -#else -static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) -{ return -EINVAL; } -#endif - void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); From 621f7e354fd8f99db0e3f7a0e8cda238caee9f3a Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:48 -0600 Subject: [PATCH 0019/2880] PCI: Make pci_set_of_node(), etc private These interfaces: void pci_set_of_node(struct pci_dev *dev); void pci_release_of_node(struct pci_dev *dev); void pci_set_bus_of_node(struct pci_bus *bus); void pci_release_bus_of_node(struct pci_bus *bus); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-12-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 9 +++++++++ include/linux/pci.h | 8 -------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8935fc1ff446..61bbfd611140 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -592,6 +592,10 @@ struct device_node; int of_pci_parse_bus_range(struct device_node *node, struct resource *res); int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); +void pci_set_of_node(struct pci_dev *dev); +void pci_release_of_node(struct pci_dev *dev); +void pci_set_bus_of_node(struct pci_bus *bus); +void pci_release_bus_of_node(struct pci_bus *bus); #else static inline int @@ -611,6 +615,11 @@ of_pci_get_max_link_speed(struct device_node *node) { return -EINVAL; } + +static inline void pci_set_of_node(struct pci_dev *dev) { } +static inline void pci_release_of_node(struct pci_dev *dev) { } +static inline void pci_set_bus_of_node(struct pci_bus *bus) { } +static inline void pci_release_bus_of_node(struct pci_bus *bus) { } #endif /* CONFIG_OF */ #if defined(CONFIG_OF_ADDRESS) diff --git a/include/linux/pci.h b/include/linux/pci.h index ef73b0c87f01..5a26b7db71b3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2259,10 +2259,6 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, #ifdef CONFIG_OF struct device_node; struct irq_domain; -void pci_set_of_node(struct pci_dev *dev); -void pci_release_of_node(struct pci_dev *dev); -void pci_set_bus_of_node(struct pci_bus *bus); -void pci_release_bus_of_node(struct pci_bus *bus); struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus); int pci_parse_request_of_pci_ranges(struct device *dev, struct list_head *resources, @@ -2272,10 +2268,6 @@ int pci_parse_request_of_pci_ranges(struct device *dev, struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus); #else /* CONFIG_OF */ -static inline void pci_set_of_node(struct pci_dev *dev) { } -static inline void pci_release_of_node(struct pci_dev *dev) { } -static inline void pci_set_bus_of_node(struct pci_bus *bus) { } -static inline void pci_release_bus_of_node(struct pci_bus *bus) { } static inline struct irq_domain * pci_host_bridge_of_msi_domain(struct pci_bus *bus) { return NULL; } static inline int pci_parse_request_of_pci_ranges(struct device *dev, From 64a38e840ce5940253208eaba40265c73decc4ee Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Fri, 26 Jul 2019 18:33:01 -0400 Subject: [PATCH 0020/2880] SUNRPC: Track writers of the 'channel' file to improve cache_listeners_exist The sunrpc cache interface is susceptible to being fooled by a rogue process just reading a 'channel' file. If this happens the kernel may think a valid daemon exists to service the cache when it does not. For example, the following may fool the kernel: cat /proc/net/rpc/auth.unix.gid/channel Change the tracking of readers to writers when considering whether a listener exists as all valid daemon processes either open a channel file O_RDWR or O_WRONLY. While this does not prevent a rogue process from "stealing" a message from the kernel, it does at least improve the kernels perception of whether a valid process servicing the cache exists. Signed-off-by: Dave Wysochanski Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/cache.h | 6 +++--- net/sunrpc/cache.c | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index c7f38e897174..f7d086b77a21 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -107,9 +107,9 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; - atomic_t readers; /* how many time is /chennel open */ - time_t last_close; /* if no readers, when did last close */ - time_t last_warn; /* when we last warned about no readers */ + atomic_t writers; /* how many time is /channel open */ + time_t last_close; /* if no writers, when did last close */ + time_t last_warn; /* when we last warned about no writers */ union { struct proc_dir_entry *procfs; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 6f1528f271ee..a6a6190ad37a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -373,7 +373,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; - atomic_set(&cd->readers, 0); + atomic_set(&cd->writers, 0); cd->last_close = 0; cd->last_warn = -1; list_add(&cd->others, &cache_list); @@ -1029,11 +1029,13 @@ static int cache_open(struct inode *inode, struct file *filp, } rp->offset = 0; rp->q.reader = 1; - atomic_inc(&cd->readers); + spin_lock(&queue_lock); list_add(&rp->q.list, &cd->queue); spin_unlock(&queue_lock); } + if (filp->f_mode & FMODE_WRITE) + atomic_inc(&cd->writers); filp->private_data = rp; return 0; } @@ -1062,8 +1064,10 @@ static int cache_release(struct inode *inode, struct file *filp, filp->private_data = NULL; kfree(rp); + } + if (filp->f_mode & FMODE_WRITE) { + atomic_dec(&cd->writers); cd->last_close = seconds_since_boot(); - atomic_dec(&cd->readers); } module_put(cd->owner); return 0; @@ -1171,7 +1175,7 @@ static void warn_no_listener(struct cache_detail *detail) static bool cache_listeners_exist(struct cache_detail *detail) { - if (atomic_read(&detail->readers)) + if (atomic_read(&detail->writers)) return true; if (detail->last_close == 0) /* This cache was never opened */ From 1fed17df7e501b170f876b87eec11f930e3f1df5 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 14 Jun 2019 18:42:17 +0000 Subject: [PATCH 0021/2880] hv_balloon: Use a static page for the balloon_up send buffer It's unnecessary to dynamically allocate the buffer. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/hv_balloon.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 6fb4ea5f0304..4b06f076374a 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -494,7 +494,7 @@ enum hv_dm_state { static __u8 recv_buffer[PAGE_SIZE]; -static __u8 *send_buffer; +static __u8 balloon_up_send_buffer[PAGE_SIZE]; #define PAGES_IN_2M 512 #define HA_CHUNK (32 * 1024) @@ -1292,8 +1292,8 @@ static void balloon_up(struct work_struct *dummy) } while (!done) { - bl_resp = (struct dm_balloon_response *)send_buffer; - memset(send_buffer, 0, PAGE_SIZE); + memset(balloon_up_send_buffer, 0, PAGE_SIZE); + bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer; bl_resp->hdr.type = DM_BALLOON_RESPONSE; bl_resp->hdr.size = sizeof(struct dm_balloon_response); bl_resp->more_pages = 1; @@ -1578,19 +1578,11 @@ static int balloon_probe(struct hv_device *dev, do_hot_add = false; #endif - /* - * First allocate a send buffer. - */ - - send_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!send_buffer) - return -ENOMEM; - ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, balloon_onchannelcallback, dev); if (ret) - goto probe_error0; + return ret; dm_device.dev = dev; dm_device.state = DM_INITIALIZING; @@ -1716,8 +1708,6 @@ probe_error2: probe_error1: vmbus_close(dev->channel); -probe_error0: - kfree(send_buffer); return ret; } @@ -1736,7 +1726,6 @@ static int balloon_remove(struct hv_device *dev) vmbus_close(dev->channel); kthread_stop(dm->thread); - kfree(send_buffer); #ifdef CONFIG_MEMORY_HOTPLUG restore_online_page_callback(&hv_online_page); unregister_memory_notifier(&hv_memory_nb); From 221f6df008ab39151600d4b09c85fcc730716bcb Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 14 Jun 2019 18:42:30 +0000 Subject: [PATCH 0022/2880] hv_balloon: Reorganize the probe function Move the code that negotiates with the host to a new function balloon_connect_vsp() and improve the error handling. This makes the code more readable and paves the way for the support of hibernation in future. Makes no real logic change here. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/hv_balloon.c | 124 +++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 4b06f076374a..34bd73526afd 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1564,50 +1564,18 @@ static void balloon_onchannelcallback(void *context) } -static int balloon_probe(struct hv_device *dev, - const struct hv_vmbus_device_id *dev_id) +static int balloon_connect_vsp(struct hv_device *dev) { - int ret; - unsigned long t; struct dm_version_request version_req; struct dm_capabilities cap_msg; - -#ifdef CONFIG_MEMORY_HOTPLUG - do_hot_add = hot_add; -#else - do_hot_add = false; -#endif + unsigned long t; + int ret; ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, - balloon_onchannelcallback, dev); - + balloon_onchannelcallback, dev); if (ret) return ret; - dm_device.dev = dev; - dm_device.state = DM_INITIALIZING; - dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; - init_completion(&dm_device.host_event); - init_completion(&dm_device.config_event); - INIT_LIST_HEAD(&dm_device.ha_region_list); - spin_lock_init(&dm_device.ha_lock); - INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); - INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); - dm_device.host_specified_ha_region = false; - - dm_device.thread = - kthread_run(dm_thread_func, &dm_device, "hv_balloon"); - if (IS_ERR(dm_device.thread)) { - ret = PTR_ERR(dm_device.thread); - goto probe_error1; - } - -#ifdef CONFIG_MEMORY_HOTPLUG - set_online_page_callback(&hv_online_page); - register_memory_notifier(&hv_memory_nb); -#endif - - hv_set_drvdata(dev, &dm_device); /* * Initiate the hand shake with the host and negotiate * a version that the host can support. We start with the @@ -1623,16 +1591,15 @@ static int balloon_probe(struct hv_device *dev, dm_device.version = version_req.version.version; ret = vmbus_sendpacket(dev->channel, &version_req, - sizeof(struct dm_version_request), - (unsigned long)NULL, - VM_PKT_DATA_INBAND, 0); + sizeof(struct dm_version_request), + (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); if (ret) - goto probe_error2; + goto out; t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); if (t == 0) { ret = -ETIMEDOUT; - goto probe_error2; + goto out; } /* @@ -1640,8 +1607,8 @@ static int balloon_probe(struct hv_device *dev, * fail the probe function. */ if (dm_device.state == DM_INIT_ERROR) { - ret = -ETIMEDOUT; - goto probe_error2; + ret = -EPROTO; + goto out; } pr_info("Using Dynamic Memory protocol version %u.%u\n", @@ -1674,16 +1641,15 @@ static int balloon_probe(struct hv_device *dev, cap_msg.max_page_number = -1; ret = vmbus_sendpacket(dev->channel, &cap_msg, - sizeof(struct dm_capabilities), - (unsigned long)NULL, - VM_PKT_DATA_INBAND, 0); + sizeof(struct dm_capabilities), + (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); if (ret) - goto probe_error2; + goto out; t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); if (t == 0) { ret = -ETIMEDOUT; - goto probe_error2; + goto out; } /* @@ -1691,23 +1657,65 @@ static int balloon_probe(struct hv_device *dev, * fail the probe function. */ if (dm_device.state == DM_INIT_ERROR) { - ret = -ETIMEDOUT; - goto probe_error2; + ret = -EPROTO; + goto out; } + return 0; +out: + vmbus_close(dev->channel); + return ret; +} + +static int balloon_probe(struct hv_device *dev, + const struct hv_vmbus_device_id *dev_id) +{ + int ret; + +#ifdef CONFIG_MEMORY_HOTPLUG + do_hot_add = hot_add; +#else + do_hot_add = false; +#endif + dm_device.dev = dev; + dm_device.state = DM_INITIALIZING; + dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; + init_completion(&dm_device.host_event); + init_completion(&dm_device.config_event); + INIT_LIST_HEAD(&dm_device.ha_region_list); + spin_lock_init(&dm_device.ha_lock); + INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); + INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); + dm_device.host_specified_ha_region = false; + +#ifdef CONFIG_MEMORY_HOTPLUG + set_online_page_callback(&hv_online_page); + register_memory_notifier(&hv_memory_nb); +#endif + + hv_set_drvdata(dev, &dm_device); + + ret = balloon_connect_vsp(dev); + if (ret != 0) + return ret; + dm_device.state = DM_INITIALIZED; - last_post_time = jiffies; + + dm_device.thread = + kthread_run(dm_thread_func, &dm_device, "hv_balloon"); + if (IS_ERR(dm_device.thread)) { + ret = PTR_ERR(dm_device.thread); + goto probe_error; + } return 0; -probe_error2: +probe_error: + vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG + unregister_memory_notifier(&hv_memory_nb); restore_online_page_callback(&hv_online_page); #endif - kthread_stop(dm_device.thread); - -probe_error1: - vmbus_close(dev->channel); return ret; } @@ -1724,11 +1732,11 @@ static int balloon_remove(struct hv_device *dev) cancel_work_sync(&dm->balloon_wrk.wrk); cancel_work_sync(&dm->ha_wrk.wrk); - vmbus_close(dev->channel); kthread_stop(dm->thread); + vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG - restore_online_page_callback(&hv_online_page); unregister_memory_notifier(&hv_memory_nb); + restore_online_page_callback(&hv_online_page); #endif spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { From 81b23ba645e6b2b446093b2d927c261a17f7dee3 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 14:08:07 +0800 Subject: [PATCH 0023/2880] csky: Fixup mb() synchronization problem The mb() is the superset of dma and smp. Using bar.xxx to implement mb() will cause problem when sync data with dma device, becasue bar.xxx couldn't guarantee bus transactions finished at outside bus level. We must use sync.s instead of bar.xxx for dma data synchronization and it will guarantee retirement after getting the bus bresponse. Changes for V2: - Use sync.s for all mb, rmb, wmb, dma_wmb, dma_rmb. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/include/asm/barrier.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h index 476eb786f22d..a430e7fddf35 100644 --- a/arch/csky/include/asm/barrier.h +++ b/arch/csky/include/asm/barrier.h @@ -9,11 +9,12 @@ #define nop() asm volatile ("nop\n":::"memory") /* - * sync: completion barrier - * sync.s: completion barrier and shareable to other cores - * sync.i: completion barrier with flush cpu pipeline - * sync.is: completion barrier with flush cpu pipeline and shareable to - * other cores + * sync: completion barrier, all sync.xx instructions + * guarantee the last response recieved by bus transaction + * made by ld/st instructions before sync.s + * sync.s: inherit from sync, but also shareable to other cores + * sync.i: inherit from sync, but also flush cpu pipeline + * sync.is: the same with sync.i + sync.s * * bar.brwarw: ordering barrier for all load/store instructions before it * bar.brwarws: ordering barrier for all load/store instructions before it @@ -27,9 +28,7 @@ */ #ifdef CONFIG_CPU_HAS_CACHEV2 -#define mb() asm volatile ("bar.brwarw\n":::"memory") -#define rmb() asm volatile ("bar.brar\n":::"memory") -#define wmb() asm volatile ("bar.bwaw\n":::"memory") +#define mb() asm volatile ("sync.s\n":::"memory") #ifdef CONFIG_SMP #define __smp_mb() asm volatile ("bar.brwarws\n":::"memory") From 7f80fe207de9602aaff028c79345caa68c90cd31 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 14:43:22 +0800 Subject: [PATCH 0024/2880] csky: Fixup dma_alloc_coherent with PAGE_SO attribute This bug is from commit: 2b070ccdf8c0 (fixup abiv2 mmap(... O_SYNC) failed). In that patch we remove the _PAGE_SO for memory noncache mapping and this will cause problem when drivers use dma descriptors to control the transcations without dma_w/rmb(). After referencing other archs' implementation, pgprot_writecombine is introduced for mmap(... O_SYNC). Signed-off-by: Guo Ren --- arch/csky/include/asm/pgtable.h | 10 ++++++++++ arch/csky/mm/ioremap.c | 6 ++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index c429a6f347de..fc19ba446d62 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -258,6 +258,16 @@ static inline pgprot_t pgprot_noncached(pgprot_t _prot) { unsigned long prot = pgprot_val(_prot); + prot = (prot & ~_CACHE_MASK) | _CACHE_UNCACHED | _PAGE_SO; + + return __pgprot(prot); +} + +#define pgprot_writecombine pgprot_writecombine +static inline pgprot_t pgprot_writecombine(pgprot_t _prot) +{ + unsigned long prot = pgprot_val(_prot); + prot = (prot & ~_CACHE_MASK) | _CACHE_UNCACHED; return __pgprot(prot); diff --git a/arch/csky/mm/ioremap.c b/arch/csky/mm/ioremap.c index 8473b6bdf512..48531115fd9d 100644 --- a/arch/csky/mm/ioremap.c +++ b/arch/csky/mm/ioremap.c @@ -29,8 +29,7 @@ void __iomem *ioremap(phys_addr_t addr, size_t size) vaddr = (unsigned long)area->addr; - prot = __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | - _PAGE_GLOBAL | _CACHE_UNCACHED | _PAGE_SO); + prot = pgprot_noncached(PAGE_KERNEL); if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { free_vm_area(area); @@ -51,10 +50,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { if (!pfn_valid(pfn)) { - vma_prot.pgprot |= _PAGE_SO; return pgprot_noncached(vma_prot); } else if (file->f_flags & O_SYNC) { - return pgprot_noncached(vma_prot); + return pgprot_writecombine(vma_prot); } return vma_prot; From b36f281f4a314de4be0a51d6511b794691f8a244 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Fri, 19 Jul 2019 07:16:57 -0400 Subject: [PATCH 0025/2880] ima: initialize the "template" field with the default template IMA policy rules are walked sequentially. Depending on the ordering of the policy rules, the "template" field might be defined in one rule, but will be replaced by subsequent, applicable rules, even if the rule does not explicitly define the "template" field. This patch initializes the "template" once and only replaces the "template", when explicitly defined. Fixes: 19453ce0bcfb ("IMA: support for per policy rule template formats") Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_policy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 6df7f641ff66..36a0727f1d7a 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -491,6 +491,9 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid, struct ima_rule_entry *entry; int action = 0, actmask = flags | (flags << 1); + if (template_desc) + *template_desc = ima_template_desc_current(); + rcu_read_lock(); list_for_each_entry_rcu(entry, ima_rules, list) { @@ -510,6 +513,7 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid, action |= IMA_FAIL_UNVERIFIABLE_SIGS; } + if (entry->action & IMA_DO_MASK) actmask &= ~(entry->action | entry->action << 1); else @@ -520,8 +524,6 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid, if (template_desc && entry->template) *template_desc = entry->template; - else if (template_desc) - *template_desc = ima_template_desc_current(); if (!actmask) break; From e5738bc46d49642dc4804bad3656b23016f8e1f1 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 8 Jul 2019 02:12:34 +0300 Subject: [PATCH 0026/2880] i2c: tegra: Compile PM functions unconditionally The I2C driver fails to probe if CONFIG_PM_SLEEP=n because runtime PM doesn't depend on the PM sleep and in this case the runtime PM ops are not included in the driver, resulting in I2C clock not being enabled. It's much cleaner to simply allow compiler to remove the dead code instead of messing with the #ifdefs. This patch fixes such errors when CONFIG_PM_SLEEP=n: tegra-i2c 7000c400.i2c: timeout waiting for fifo flush tegra-i2c 7000c400.i2c: Failed to initialize i2c controller Signed-off-by: Dmitry Osipenko Acked-by: Jon Hunter Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-tegra.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 9fcb13beeb8f..18f0ceed9f1b 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -636,7 +636,7 @@ static void tegra_dvc_init(struct tegra_i2c_dev *i2c_dev) dvc_writel(i2c_dev, val, DVC_CTRL_REG1); } -static int tegra_i2c_runtime_resume(struct device *dev) +static int __maybe_unused tegra_i2c_runtime_resume(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); int ret; @@ -665,7 +665,7 @@ static int tegra_i2c_runtime_resume(struct device *dev) return 0; } -static int tegra_i2c_runtime_suspend(struct device *dev) +static int __maybe_unused tegra_i2c_runtime_suspend(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); @@ -1711,8 +1711,7 @@ static int tegra_i2c_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP -static int tegra_i2c_suspend(struct device *dev) +static int __maybe_unused tegra_i2c_suspend(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); @@ -1721,7 +1720,7 @@ static int tegra_i2c_suspend(struct device *dev) return 0; } -static int tegra_i2c_resume(struct device *dev) +static int __maybe_unused tegra_i2c_resume(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); int err; @@ -1741,18 +1740,13 @@ static const struct dev_pm_ops tegra_i2c_pm = { NULL) }; -#define TEGRA_I2C_PM (&tegra_i2c_pm) -#else -#define TEGRA_I2C_PM NULL -#endif - static struct platform_driver tegra_i2c_driver = { .probe = tegra_i2c_probe, .remove = tegra_i2c_remove, .driver = { .name = "tegra-i2c", .of_match_table = tegra_i2c_of_match, - .pm = TEGRA_I2C_PM, + .pm = &tegra_i2c_pm, }, }; From 34de3513e668d5ad46bd32ebee63f9e3dc57e1db Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Mon, 15 Jul 2019 11:17:39 +0800 Subject: [PATCH 0027/2880] i2c: ismt: Remove call to memset after dmam_alloc_coherent In commit 518a2f1925c3 ("dma-mapping: zero memory returned from dma_alloc_*"), dma_alloc_coherent has already zeroed the memory. So memset is not needed. Acked-by: Neil Horman Signed-off-by: Fuqian Huang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ismt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index 02d23edb2fb1..2f95e25a10f7 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -781,8 +781,6 @@ static int ismt_dev_init(struct ismt_priv *priv) if (!priv->hw) return -ENOMEM; - memset(priv->hw, 0, (ISMT_DESC_ENTRIES * sizeof(struct ismt_desc))); - priv->head = 0; init_completion(&priv->cmp); From b17e6d19dcd3fcb2b93166034e739e82b49d2a51 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Wed, 17 Jul 2019 16:40:16 +0800 Subject: [PATCH 0028/2880] i2c: mxs: use devm_platform_ioremap_resource() to simplify code Use the new helper devm_platform_ioremap_resource() which wraps the platform_get_resource() and devm_ioremap_resource() together, to simplify the code. Signed-off-by: Anson Huang Reviewed-by: Dong Aisheng Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mxs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c index 7d79317a1046..89224913f578 100644 --- a/drivers/i2c/busses/i2c-mxs.c +++ b/drivers/i2c/busses/i2c-mxs.c @@ -802,7 +802,6 @@ static int mxs_i2c_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct mxs_i2c_dev *i2c; struct i2c_adapter *adap; - struct resource *res; int err, irq; i2c = devm_kzalloc(dev, sizeof(*i2c), GFP_KERNEL); @@ -814,8 +813,7 @@ static int mxs_i2c_probe(struct platform_device *pdev) i2c->dev_type = device_id->driver_data; } - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - i2c->regs = devm_ioremap_resource(&pdev->dev, res); + i2c->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(i2c->regs)) return PTR_ERR(i2c->regs); From 5667b5b59f45bd0bebadfae9ed55745b4cff2198 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Wed, 17 Jul 2019 16:40:17 +0800 Subject: [PATCH 0029/2880] i2c: imx-lpi2c: use devm_platform_ioremap_resource() to simplify code Use the new helper devm_platform_ioremap_resource() which wraps the platform_get_resource() and devm_ioremap_resource() together, to simplify the code. Signed-off-by: Anson Huang Acked-by: Dong Aisheng Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-imx-lpi2c.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx-lpi2c.c b/drivers/i2c/busses/i2c-imx-lpi2c.c index dc00fabc919a..c92b56485fa6 100644 --- a/drivers/i2c/busses/i2c-imx-lpi2c.c +++ b/drivers/i2c/busses/i2c-imx-lpi2c.c @@ -545,7 +545,6 @@ MODULE_DEVICE_TABLE(of, lpi2c_imx_of_match); static int lpi2c_imx_probe(struct platform_device *pdev) { struct lpi2c_imx_struct *lpi2c_imx; - struct resource *res; unsigned int temp; int irq, ret; @@ -553,8 +552,7 @@ static int lpi2c_imx_probe(struct platform_device *pdev) if (!lpi2c_imx) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - lpi2c_imx->base = devm_ioremap_resource(&pdev->dev, res); + lpi2c_imx->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(lpi2c_imx->base)) return PTR_ERR(lpi2c_imx->base); From 7735eeebd2be26d6d3fa151c85d839d62d710d93 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Tue, 23 Jul 2019 19:11:10 +0800 Subject: [PATCH 0030/2880] i2c: busses: Use dev_get_drvdata where possible Instead of using to_pci_dev + pci_get_drvdata, use dev_get_drvdata to make code simpler. Signed-off-by: Chuhong Yuan Reviewed-by: Jean Delvare Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-pcidrv.c | 6 ++---- drivers/i2c/busses/i2c-i801.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index 76810deb2de6..7d2e6959679c 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -173,8 +173,7 @@ static struct dw_pci_controller dw_pci_controllers[] = { #ifdef CONFIG_PM static int i2c_dw_pci_suspend(struct device *dev) { - struct pci_dev *pdev = to_pci_dev(dev); - struct dw_i2c_dev *i_dev = pci_get_drvdata(pdev); + struct dw_i2c_dev *i_dev = dev_get_drvdata(dev); i_dev->suspended = true; i_dev->disable(i_dev); @@ -184,8 +183,7 @@ static int i2c_dw_pci_suspend(struct device *dev) static int i2c_dw_pci_resume(struct device *dev) { - struct pci_dev *pdev = to_pci_dev(dev); - struct dw_i2c_dev *i_dev = pci_get_drvdata(pdev); + struct dw_i2c_dev *i_dev = dev_get_drvdata(dev); int ret; ret = i_dev->init(i_dev); diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index f2956936c3f2..a6469978e735 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1912,8 +1912,7 @@ static int i801_suspend(struct device *dev) static int i801_resume(struct device *dev) { - struct pci_dev *pci_dev = to_pci_dev(dev); - struct i801_priv *priv = pci_get_drvdata(pci_dev); + struct i801_priv *priv = dev_get_drvdata(dev); i801_enable_host_notify(&priv->adapter); From 23c2556d8fbec57e72e2c8fd8b7be2715c192b25 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 24 Jul 2019 14:15:56 +0200 Subject: [PATCH 0031/2880] dt-bindings: i2c: sh_mobile: Rename bindings documentation file Rename the bindings documentation file for sh_mobile I2C controller from i2c-sh_mobile.txt to renesas,iic.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../bindings/i2c/{i2c-sh_mobile.txt => renesas,iic.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-sh_mobile.txt => renesas,iic.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt b/Documentation/devicetree/bindings/i2c/renesas,iic.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt rename to Documentation/devicetree/bindings/i2c/renesas,iic.txt diff --git a/MAINTAINERS b/MAINTAINERS index 6426db5198f0..c8a78239447a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13651,7 +13651,7 @@ RENESAS R-CAR I2C DRIVERS M: Wolfram Sang S: Supported F: Documentation/devicetree/bindings/i2c/i2c-rcar.txt -F: Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt +F: Documentation/devicetree/bindings/i2c/renesas,iic.txt F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c From d13ed84b195cc6e5789b446f07aede357939f7ad Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 24 Jul 2019 14:15:57 +0200 Subject: [PATCH 0032/2880] dt-bindings: i2c: rcar: Rename bindings documentation file Rename the bindings documentation file for R-Car I2C controller from i2c-rcar.txt to renesas,rcar.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/{i2c-rcar.txt => renesas,i2c.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-rcar.txt => renesas,i2c.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt b/Documentation/devicetree/bindings/i2c/renesas,i2c.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-rcar.txt rename to Documentation/devicetree/bindings/i2c/renesas,i2c.txt diff --git a/MAINTAINERS b/MAINTAINERS index c8a78239447a..f92a88594dfc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13650,7 +13650,7 @@ F: drivers/iio/adc/rcar-gyroadc.c RENESAS R-CAR I2C DRIVERS M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-rcar.txt +F: Documentation/devicetree/bindings/i2c/renesas,i2c.txt F: Documentation/devicetree/bindings/i2c/renesas,iic.txt F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c From 1d583590514a4f1c9aa582d1234fca7aa3d627e9 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 24 Jul 2019 14:15:58 +0200 Subject: [PATCH 0033/2880] dt-bindings: i2c: riic: Rename bindings documentation file Rename the bindings documentation file for RIIC controller from i2c-riic.txt to renesas,riic.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/{i2c-riic.txt => renesas,riic.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-riic.txt => renesas,riic.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-riic.txt b/Documentation/devicetree/bindings/i2c/renesas,riic.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-riic.txt rename to Documentation/devicetree/bindings/i2c/renesas,riic.txt diff --git a/MAINTAINERS b/MAINTAINERS index f92a88594dfc..a14ce44c33b3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13658,7 +13658,7 @@ F: drivers/i2c/busses/i2c-sh_mobile.c RENESAS RIIC DRIVER M: Chris Brandt S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-riic.txt +F: Documentation/devicetree/bindings/i2c/renesas,riic.txt F: drivers/i2c/busses/i2c-riic.c RENESAS USB PHY DRIVER From 684ca71259a69c5a3019da72fe718bf983841926 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 24 Jul 2019 14:15:59 +0200 Subject: [PATCH 0034/2880] dt-bindings: i2c: riic: Rename bindings documentation file Rename the bindings documentation file for Renesas EMEV2 IIC controller from i2c-emev2.txt to renesas,iic-emev2.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../bindings/i2c/{i2c-emev2.txt => renesas,iic-emev2.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-emev2.txt => renesas,iic-emev2.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-emev2.txt b/Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-emev2.txt rename to Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt diff --git a/MAINTAINERS b/MAINTAINERS index a14ce44c33b3..f89f27a2d09a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13628,7 +13628,7 @@ F: drivers/clk/renesas/ RENESAS EMEV2 I2C DRIVER M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-emev2.txt +F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt F: drivers/i2c/busses/i2c-emev2.c RENESAS ETHERNET DRIVERS From 33eb09a02e8d8fad976019694196eec2cc210d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 25 Jul 2019 22:21:36 +0200 Subject: [PATCH 0035/2880] i2c: designware: make use of devm_gpiod_get_optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a semantical change: if devm_gpiod_get_optional returns -ENOSYS this is passed as error to the caller. This effectively reverts commit d1fa74520dcd ("i2c: designware: Consider SCL GPIO optional") which shouldn't be necessary any more since gpiod_get_optional doesn't return -ENOSYS any more with GPIOLIB=n. Signed-off-by: Uwe Kleine-König Reviewed-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-master.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index d464799e40a3..867787dade43 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -657,13 +657,10 @@ static int i2c_dw_init_recovery_info(struct dw_i2c_dev *dev) struct gpio_desc *gpio; int r; - gpio = devm_gpiod_get(dev->dev, "scl", GPIOD_OUT_HIGH); - if (IS_ERR(gpio)) { - r = PTR_ERR(gpio); - if (r == -ENOENT || r == -ENOSYS) - return 0; - return r; - } + gpio = devm_gpiod_get_optional(dev->dev, "scl", GPIOD_OUT_HIGH); + if (IS_ERR_OR_NULL(gpio)) + return PTR_ERR_OR_ZERO(gpio); + rinfo->scl_gpiod = gpio; gpio = devm_gpiod_get_optional(dev->dev, "sda", GPIOD_IN); From c8424e776b093280d3fdd104d850706b3b229ac8 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 4 Jul 2019 15:57:34 -0300 Subject: [PATCH 0036/2880] MODSIGN: Export module signature definitions IMA will use the module_signature format for append signatures, so export the relevant definitions and factor out the code which verifies that the appended signature trailer is valid. Also, create a CONFIG_MODULE_SIG_FORMAT option so that IMA can select it and be able to use mod_check_sig() without having to depend on either CONFIG_MODULE_SIG or CONFIG_MODULES. s390 duplicated the definition of struct module_signature so now they can use the new header instead. Signed-off-by: Thiago Jung Bauermann Acked-by: Jessica Yu Reviewed-by: Philipp Rudo Cc: Heiko Carstens Signed-off-by: Mimi Zohar --- arch/s390/Kconfig | 2 +- arch/s390/kernel/machine_kexec_file.c | 24 +----------- include/linux/module.h | 3 -- include/linux/module_signature.h | 44 +++++++++++++++++++++ init/Kconfig | 6 ++- kernel/Makefile | 1 + kernel/module.c | 1 + kernel/module_signature.c | 46 ++++++++++++++++++++++ kernel/module_signing.c | 56 ++++----------------------- scripts/Makefile | 2 +- 10 files changed, 108 insertions(+), 77 deletions(-) create mode 100644 include/linux/module_signature.h create mode 100644 kernel/module_signature.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a4ad2733eedf..e0ae0d51f985 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -538,7 +538,7 @@ config ARCH_HAS_KEXEC_PURGATORY config KEXEC_VERIFY_SIG bool "Verify kernel signature during kexec_file_load() syscall" - depends on KEXEC_FILE && SYSTEM_DATA_VERIFICATION + depends on KEXEC_FILE && MODULE_SIG_FORMAT help This option makes kernel signature verification mandatory for the kexec_file_load() syscall. diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index fbdd3ea73667..1ac9fbc6e01e 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -23,28 +23,6 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { }; #ifdef CONFIG_KEXEC_VERIFY_SIG -/* - * Module signature information block. - * - * The constituents of the signature section are, in order: - * - * - Signer's name - * - Key identifier - * - Signature data - * - Information block - */ -struct module_signature { - u8 algo; /* Public-key crypto algorithm [0] */ - u8 hash; /* Digest algorithm [0] */ - u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ - u8 signer_len; /* Length of signer's name [0] */ - u8 key_id_len; /* Length of key identifier [0] */ - u8 __pad[3]; - __be32 sig_len; /* Length of signature data */ -}; - -#define PKEY_ID_PKCS7 2 - int s390_verify_sig(const char *kernel, unsigned long kernel_len) { const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; diff --git a/include/linux/module.h b/include/linux/module.h index 1455812dd325..f6fc1dae74f4 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -26,9 +26,6 @@ #include #include -/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ -#define MODULE_SIG_STRING "~Module signature appended~\n" - /* Not Yet Implemented */ #define MODULE_SUPPORTED_DEVICE(name) diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h new file mode 100644 index 000000000000..523617fc5b6a --- /dev/null +++ b/include/linux/module_signature.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Module signature handling. + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef _LINUX_MODULE_SIGNATURE_H +#define _LINUX_MODULE_SIGNATURE_H + +/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ +#define MODULE_SIG_STRING "~Module signature appended~\n" + +enum pkey_id_type { + PKEY_ID_PGP, /* OpenPGP generated key ID */ + PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ + PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ +}; + +/* + * Module signature information block. + * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ +}; + +int mod_check_sig(const struct module_signature *ms, size_t file_len, + const char *name); + +#endif /* _LINUX_MODULE_SIGNATURE_H */ diff --git a/init/Kconfig b/init/Kconfig index bd7d650d4a99..2dca877c9ed7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1930,6 +1930,10 @@ config BASE_SMALL default 0 if BASE_FULL default 1 if !BASE_FULL +config MODULE_SIG_FORMAT + def_bool n + select SYSTEM_DATA_VERIFICATION + menuconfig MODULES bool "Enable loadable module support" option modules @@ -2007,7 +2011,7 @@ config MODULE_SRCVERSION_ALL config MODULE_SIG bool "Module signature verification" depends on MODULES - select SYSTEM_DATA_VERIFICATION + select MODULE_SIG_FORMAT help Check modules for valid signatures upon load: the signature is simply appended to the module. For more information see diff --git a/kernel/Makefile b/kernel/Makefile index a8d923b5481b..179b44ab8b46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,7 @@ endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o +obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o diff --git a/kernel/module.c b/kernel/module.c index 5933395af9a0..5ac22efc3685 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/module_signature.c b/kernel/module_signature.c new file mode 100644 index 000000000000..4224a1086b7d --- /dev/null +++ b/kernel/module_signature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include + +/** + * mod_check_sig - check that the given signature is sane + * + * @ms: Signature to check. + * @file_len: Size of the file to which @ms is appended. + * @name: What is being checked. Used for error messages. + */ +int mod_check_sig(const struct module_signature *ms, size_t file_len, + const char *name) +{ + if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms)) + return -EBADMSG; + + if (ms->id_type != PKEY_ID_PKCS7) { + pr_err("%s: Module is not signed with expected PKCS#7 message\n", + name); + return -ENOPKG; + } + + if (ms->algo != 0 || + ms->hash != 0 || + ms->signer_len != 0 || + ms->key_id_len != 0 || + ms->__pad[0] != 0 || + ms->__pad[1] != 0 || + ms->__pad[2] != 0) { + pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", + name); + return -EBADMSG; + } + + return 0; +} diff --git a/kernel/module_signing.c b/kernel/module_signing.c index b10fb1986ca9..9d9fc678c91d 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -7,37 +7,13 @@ #include #include +#include +#include #include #include #include #include "module-internal.h" -enum pkey_id_type { - PKEY_ID_PGP, /* OpenPGP generated key ID */ - PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ - PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ -}; - -/* - * Module signature information block. - * - * The constituents of the signature section are, in order: - * - * - Signer's name - * - Key identifier - * - Signature data - * - Information block - */ -struct module_signature { - u8 algo; /* Public-key crypto algorithm [0] */ - u8 hash; /* Digest algorithm [0] */ - u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ - u8 signer_len; /* Length of signer's name [0] */ - u8 key_id_len; /* Length of key identifier [0] */ - u8 __pad[3]; - __be32 sig_len; /* Length of signature data */ -}; - /* * Verify the signature on a module. */ @@ -45,6 +21,7 @@ int mod_verify_sig(const void *mod, struct load_info *info) { struct module_signature ms; size_t sig_len, modlen = info->len; + int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -52,32 +29,15 @@ int mod_verify_sig(const void *mod, struct load_info *info) return -EBADMSG; memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); - modlen -= sizeof(ms); + + ret = mod_check_sig(&ms, modlen, info->name); + if (ret) + return ret; sig_len = be32_to_cpu(ms.sig_len); - if (sig_len >= modlen) - return -EBADMSG; - modlen -= sig_len; + modlen -= sig_len + sizeof(ms); info->len = modlen; - if (ms.id_type != PKEY_ID_PKCS7) { - pr_err("%s: Module is not signed with expected PKCS#7 message\n", - info->name); - return -ENOPKG; - } - - if (ms.algo != 0 || - ms.hash != 0 || - ms.signer_len != 0 || - ms.key_id_len != 0 || - ms.__pad[0] != 0 || - ms.__pad[1] != 0 || - ms.__pad[2] != 0) { - pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", - info->name); - return -EBADMSG; - } - return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, VERIFY_USE_SECONDARY_KEYRING, VERIFYING_MODULE_SIGNATURE, diff --git a/scripts/Makefile b/scripts/Makefile index 16bcb8087899..532f7e0915c3 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -17,7 +17,7 @@ hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable hostprogs-$(CONFIG_ASN1) += asn1_compiler -hostprogs-$(CONFIG_MODULE_SIG) += sign-file +hostprogs-$(CONFIG_MODULE_SIG_FORMAT) += sign-file hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert From 2a7bf671186eb5d1a47ef192aefbdf788f5b38fe Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:25 -0300 Subject: [PATCH 0037/2880] PKCS#7: Refactor verify_pkcs7_signature() IMA will need to verify a PKCS#7 signature which has already been parsed. For this reason, factor out the code which does that from verify_pkcs7_signature() into a new function which takes a struct pkcs7_message instead of a data buffer. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Mimi Zohar Cc: David Howells Cc: David Woodhouse Cc: Herbert Xu Cc: "David S. Miller" Signed-off-by: Mimi Zohar --- certs/system_keyring.c | 61 ++++++++++++++++++++++++++---------- include/linux/verification.h | 10 ++++++ 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/certs/system_keyring.c b/certs/system_keyring.c index 1eba08a1af82..798291177186 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -190,33 +190,27 @@ late_initcall(load_system_certificate_list); #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** - * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. + * verify_pkcs7_message_sig - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. - * @raw_pkcs7: The PKCS#7 message that is the signature. - * @pkcs7_len: The size of @raw_pkcs7. + * @pkcs7: The PKCS#7 message that is the signature. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. */ -int verify_pkcs7_signature(const void *data, size_t len, - const void *raw_pkcs7, size_t pkcs7_len, - struct key *trusted_keys, - enum key_being_used_for usage, - int (*view_content)(void *ctx, - const void *data, size_t len, - size_t asn1hdrlen), - void *ctx) +int verify_pkcs7_message_sig(const void *data, size_t len, + struct pkcs7_message *pkcs7, + struct key *trusted_keys, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, size_t len, + size_t asn1hdrlen), + void *ctx) { - struct pkcs7_message *pkcs7; int ret; - pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); - if (IS_ERR(pkcs7)) - return PTR_ERR(pkcs7); - /* The data should be detached - so we need to supply it. */ if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) { pr_err("PKCS#7 signature with non-detached data\n"); @@ -269,6 +263,41 @@ int verify_pkcs7_signature(const void *data, size_t len, } error: + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; +} + +/** + * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. + * @data: The data to be verified (NULL if expecting internal data). + * @len: Size of @data. + * @raw_pkcs7: The PKCS#7 message that is the signature. + * @pkcs7_len: The size of @raw_pkcs7. + * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, + * (void *)1UL for all trusted keys). + * @usage: The use to which the key is being put. + * @view_content: Callback to gain access to content. + * @ctx: Context for callback. + */ +int verify_pkcs7_signature(const void *data, size_t len, + const void *raw_pkcs7, size_t pkcs7_len, + struct key *trusted_keys, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, size_t len, + size_t asn1hdrlen), + void *ctx) +{ + struct pkcs7_message *pkcs7; + int ret; + + pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); + if (IS_ERR(pkcs7)) + return PTR_ERR(pkcs7); + + ret = verify_pkcs7_message_sig(data, len, pkcs7, trusted_keys, usage, + view_content, ctx); + pkcs7_free_message(pkcs7); pr_devel("<==%s() = %d\n", __func__, ret); return ret; diff --git a/include/linux/verification.h b/include/linux/verification.h index 32d990d163c4..911ab7c2b1ab 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -32,6 +32,7 @@ extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR]; #ifdef CONFIG_SYSTEM_DATA_VERIFICATION struct key; +struct pkcs7_message; extern int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, @@ -41,6 +42,15 @@ extern int verify_pkcs7_signature(const void *data, size_t len, const void *data, size_t len, size_t asn1hdrlen), void *ctx); +extern int verify_pkcs7_message_sig(const void *data, size_t len, + struct pkcs7_message *pkcs7, + struct key *trusted_keys, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, + size_t len, + size_t asn1hdrlen), + void *ctx); #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION extern int verify_pefile_signature(const void *pebuf, unsigned pelen, From e201af16d1ec76ccd19b90484d767984ff451f18 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:26 -0300 Subject: [PATCH 0038/2880] PKCS#7: Introduce pkcs7_get_digest() IMA will need to access the digest of the PKCS7 message (as calculated by the kernel) before the signature is verified, so introduce pkcs7_get_digest() for that purpose. Also, modify pkcs7_digest() to detect when the digest was already calculated so that it doesn't have to do redundant work. Verifying that sinfo->sig->digest isn't NULL is sufficient because both places which allocate sinfo->sig (pkcs7_parse_message() and pkcs7_note_signed_info()) use kzalloc() so sig->digest is always initialized to zero. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Mimi Zohar Cc: David Howells Cc: David Woodhouse Cc: Herbert Xu Cc: "David S. Miller" Signed-off-by: Mimi Zohar --- crypto/asymmetric_keys/pkcs7_verify.c | 33 +++++++++++++++++++++++++++ include/crypto/pkcs7.h | 4 ++++ 2 files changed, 37 insertions(+) diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index 11bee67fa9cc..ce49820caa97 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "pkcs7_parser.h" @@ -29,6 +30,10 @@ static int pkcs7_digest(struct pkcs7_message *pkcs7, kenter(",%u,%s", sinfo->index, sinfo->sig->hash_algo); + /* The digest was calculated already. */ + if (sig->digest) + return 0; + if (!sinfo->sig->hash_algo) return -ENOPKG; @@ -117,6 +122,34 @@ error_no_desc: return ret; } +int pkcs7_get_digest(struct pkcs7_message *pkcs7, const u8 **buf, u32 *len, + enum hash_algo *hash_algo) +{ + struct pkcs7_signed_info *sinfo = pkcs7->signed_infos; + int i, ret; + + /* + * This function doesn't support messages with more than one signature. + */ + if (sinfo == NULL || sinfo->next != NULL) + return -EBADMSG; + + ret = pkcs7_digest(pkcs7, sinfo); + if (ret) + return ret; + + *buf = sinfo->sig->digest; + *len = sinfo->sig->digest_size; + + for (i = 0; i < HASH_ALGO__LAST; i++) + if (!strcmp(hash_algo_name[i], sinfo->sig->hash_algo)) { + *hash_algo = i; + break; + } + + return 0; +} + /* * Find the key (X.509 certificate) to use to verify a PKCS#7 message. PKCS#7 * uses the issuer's name and the issuing certificate serial number for diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h index 96071bee03ac..38ec7f5f9041 100644 --- a/include/crypto/pkcs7.h +++ b/include/crypto/pkcs7.h @@ -9,6 +9,7 @@ #define _CRYPTO_PKCS7_H #include +#include #include struct key; @@ -40,4 +41,7 @@ extern int pkcs7_verify(struct pkcs7_message *pkcs7, extern int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7, const void *data, size_t datalen); +extern int pkcs7_get_digest(struct pkcs7_message *pkcs7, const u8 **buf, + u32 *len, enum hash_algo *hash_algo); + #endif /* _CRYPTO_PKCS7_H */ From cf38fed1e183dd2410f62d49ae635fe593082f0c Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:27 -0300 Subject: [PATCH 0039/2880] integrity: Select CONFIG_KEYS instead of depending on it This avoids a dependency cycle in soon-to-be-introduced CONFIG_IMA_APPRAISE_MODSIG: it will select CONFIG_MODULE_SIG_FORMAT which in turn selects CONFIG_KEYS. Kconfig then complains that CONFIG_INTEGRITY_SIGNATURE depends on CONFIG_KEYS. Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/integrity/Kconfig b/security/integrity/Kconfig index c352532b8f84..0bae6adb63a9 100644 --- a/security/integrity/Kconfig +++ b/security/integrity/Kconfig @@ -18,8 +18,8 @@ if INTEGRITY config INTEGRITY_SIGNATURE bool "Digital signature verification using multiple keyrings" - depends on KEYS default n + select KEYS select SIGNATURE help This option enables digital signature verification support From 9044d627fd18f9fca49b62d4619ee14914b91464 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:28 -0300 Subject: [PATCH 0040/2880] ima: Add modsig appraise_type option for module-style appended signatures Introduce the modsig keyword to the IMA policy syntax to specify that a given hook should expect the file to have the IMA signature appended to it. Here is how it can be used in a rule: appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig With this rule, IMA will accept either a signature stored in the extended attribute or an appended signature. For now, the rule above will behave exactly the same as if appraise_type=imasig was specified. The actual modsig implementation will be introduced separately. Suggested-by: Mimi Zohar Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- Documentation/ABI/testing/ima_policy | 6 +++++- security/integrity/ima/Kconfig | 10 +++++++++ security/integrity/ima/Makefile | 1 + security/integrity/ima/ima.h | 9 ++++++++ security/integrity/ima/ima_modsig.c | 31 ++++++++++++++++++++++++++++ security/integrity/ima/ima_policy.c | 12 +++++++++-- security/integrity/integrity.h | 1 + 7 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 security/integrity/ima/ima_modsig.c diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy index fc376a323908..29ebe9afdac4 100644 --- a/Documentation/ABI/testing/ima_policy +++ b/Documentation/ABI/testing/ima_policy @@ -37,7 +37,7 @@ Description: euid:= decimal value fowner:= decimal value lsm: are LSM specific - option: appraise_type:= [imasig] + option: appraise_type:= [imasig] [imasig|modsig] template:= name of a defined IMA template type (eg, ima-ng). Only valid when action is "measure". pcr:= decimal value @@ -105,3 +105,7 @@ Description: measure func=KEXEC_KERNEL_CHECK pcr=4 measure func=KEXEC_INITRAMFS_CHECK pcr=5 + + Example of appraise rule allowing modsig appended signatures: + + appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 2ced99dde694..8bf46646b185 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -233,6 +233,16 @@ config IMA_APPRAISE_BOOTPARAM This option enables the different "ima_appraise=" modes (eg. fix, log) from the boot command line. +config IMA_APPRAISE_MODSIG + bool "Support module-style signatures for appraisal" + depends on IMA_APPRAISE + default n + help + Adds support for signatures appended to files. The format of the + appended signature is the same used for signed kernel modules. + The modsig keyword can be used in the IMA policy to allow a hook + to accept such signatures. + config IMA_TRUSTED_KEYRING bool "Require all keys on the .ima keyring be signed (deprecated)" depends on IMA_APPRAISE && SYSTEM_TRUSTED_KEYRING diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile index d921dc4f9eb0..31d57cdf2421 100644 --- a/security/integrity/ima/Makefile +++ b/security/integrity/ima/Makefile @@ -9,5 +9,6 @@ obj-$(CONFIG_IMA) += ima.o ima-y := ima_fs.o ima_queue.o ima_init.o ima_main.o ima_crypto.o ima_api.o \ ima_policy.o ima_template.o ima_template_lib.o ima-$(CONFIG_IMA_APPRAISE) += ima_appraise.o +ima-$(CONFIG_IMA_APPRAISE_MODSIG) += ima_modsig.o ima-$(CONFIG_HAVE_IMA_KEXEC) += ima_kexec.o obj-$(CONFIG_IMA_BLACKLIST_KEYRING) += ima_mok.o diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 011b91c79351..e21b06942858 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -302,6 +302,15 @@ static inline int ima_read_xattr(struct dentry *dentry, #endif /* CONFIG_IMA_APPRAISE */ +#ifdef CONFIG_IMA_APPRAISE_MODSIG +bool ima_hook_supports_modsig(enum ima_hooks func); +#else +static inline bool ima_hook_supports_modsig(enum ima_hooks func) +{ + return false; +} +#endif /* CONFIG_IMA_APPRAISE_MODSIG */ + /* LSM based policy rules require audit */ #ifdef CONFIG_IMA_LSM_RULES diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c new file mode 100644 index 000000000000..87503bfe8c8b --- /dev/null +++ b/security/integrity/ima/ima_modsig.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IMA support for appraising module-style appended signatures. + * + * Copyright (C) 2019 IBM Corporation + * + * Author: + * Thiago Jung Bauermann + */ + +#include "ima.h" + +/** + * ima_hook_supports_modsig - can the policy allow modsig for this hook? + * + * modsig is only supported by hooks using ima_post_read_file(), because only + * they preload the contents of the file in a buffer. FILE_CHECK does that in + * some cases, but not when reached from vfs_open(). POLICY_CHECK can support + * it, but it's not useful in practice because it's a text file so deny. + */ +bool ima_hook_supports_modsig(enum ima_hooks func) +{ + switch (func) { + case KEXEC_KERNEL_CHECK: + case KEXEC_INITRAMFS_CHECK: + case MODULE_CHECK: + return true; + default: + return false; + } +} diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 36a0727f1d7a..5b6061d6bce0 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1130,6 +1130,10 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) ima_log_string(ab, "appraise_type", args[0].from); if ((strcmp(args[0].from, "imasig")) == 0) entry->flags |= IMA_DIGSIG_REQUIRED; + else if (ima_hook_supports_modsig(entry->func) && + strcmp(args[0].from, "imasig|modsig") == 0) + entry->flags |= IMA_DIGSIG_REQUIRED | + IMA_MODSIG_ALLOWED; else result = -EINVAL; break; @@ -1449,8 +1453,12 @@ int ima_policy_show(struct seq_file *m, void *v) } if (entry->template) seq_printf(m, "template=%s ", entry->template->name); - if (entry->flags & IMA_DIGSIG_REQUIRED) - seq_puts(m, "appraise_type=imasig "); + if (entry->flags & IMA_DIGSIG_REQUIRED) { + if (entry->flags & IMA_MODSIG_ALLOWED) + seq_puts(m, "appraise_type=imasig|modsig "); + else + seq_puts(m, "appraise_type=imasig "); + } if (entry->flags & IMA_PERMIT_DIRECTIO) seq_puts(m, "permit_directio "); rcu_read_unlock(); diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index ed12d8e13d04..8c5736b68156 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -31,6 +31,7 @@ #define IMA_NEW_FILE 0x04000000 #define EVM_IMMUTABLE_DIGSIG 0x08000000 #define IMA_FAIL_UNVERIFIABLE_SIGS 0x10000000 +#define IMA_MODSIG_ALLOWED 0x20000000 #define IMA_DO_MASK (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \ IMA_HASH | IMA_APPRAISE_SUBMASK) From a5fbeb615ca42f913ace3291d636e96feabcc545 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:29 -0300 Subject: [PATCH 0041/2880] ima: Factor xattr_verify() out of ima_appraise_measurement() Verify xattr signature in a separate function so that the logic in ima_appraise_measurement() remains clear when it gains the ability to also verify an appended module signature. The code in the switch statement is unchanged except for having to dereference the status and cause variables (since they're now pointers), and fixing the style of a block comment to appease checkpatch. Suggested-by: Mimi Zohar Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_appraise.c | 141 +++++++++++++++----------- 1 file changed, 81 insertions(+), 60 deletions(-) diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 89b83194d1dc..d3298045737f 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -199,6 +199,83 @@ int ima_read_xattr(struct dentry *dentry, return ret; } +/* + * xattr_verify - verify xattr digest or signature + * + * Verify whether the hash or signature matches the file contents. + * + * Return 0 on success, error code otherwise. + */ +static int xattr_verify(enum ima_hooks func, struct integrity_iint_cache *iint, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + enum integrity_status *status, const char **cause) +{ + int rc = -EINVAL, hash_start = 0; + + switch (xattr_value->type) { + case IMA_XATTR_DIGEST_NG: + /* first byte contains algorithm id */ + hash_start = 1; + /* fall through */ + case IMA_XATTR_DIGEST: + if (iint->flags & IMA_DIGSIG_REQUIRED) { + *cause = "IMA-signature-required"; + *status = INTEGRITY_FAIL; + break; + } + clear_bit(IMA_DIGSIG, &iint->atomic_flags); + if (xattr_len - sizeof(xattr_value->type) - hash_start >= + iint->ima_hash->length) + /* + * xattr length may be longer. md5 hash in previous + * version occupied 20 bytes in xattr, instead of 16 + */ + rc = memcmp(&xattr_value->data[hash_start], + iint->ima_hash->digest, + iint->ima_hash->length); + else + rc = -EINVAL; + if (rc) { + *cause = "invalid-hash"; + *status = INTEGRITY_FAIL; + break; + } + *status = INTEGRITY_PASS; + break; + case EVM_IMA_XATTR_DIGSIG: + set_bit(IMA_DIGSIG, &iint->atomic_flags); + rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, + (const char *)xattr_value, + xattr_len, + iint->ima_hash->digest, + iint->ima_hash->length); + if (rc == -EOPNOTSUPP) { + *status = INTEGRITY_UNKNOWN; + break; + } + if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && + func == KEXEC_KERNEL_CHECK) + rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, + (const char *)xattr_value, + xattr_len, + iint->ima_hash->digest, + iint->ima_hash->length); + if (rc) { + *cause = "invalid-signature"; + *status = INTEGRITY_FAIL; + } else { + *status = INTEGRITY_PASS; + } + break; + default: + *status = INTEGRITY_UNKNOWN; + *cause = "unknown-ima-data"; + break; + } + + return rc; +} + /* * ima_appraise_measurement - appraise file measurement * @@ -218,7 +295,7 @@ int ima_appraise_measurement(enum ima_hooks func, struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; - int rc = xattr_len, hash_start = 0; + int rc = xattr_len; if (!(inode->i_opflags & IOP_XATTR)) return INTEGRITY_UNKNOWN; @@ -256,65 +333,9 @@ int ima_appraise_measurement(enum ima_hooks func, WARN_ONCE(true, "Unexpected integrity status %d\n", status); } - switch (xattr_value->type) { - case IMA_XATTR_DIGEST_NG: - /* first byte contains algorithm id */ - hash_start = 1; - /* fall through */ - case IMA_XATTR_DIGEST: - if (iint->flags & IMA_DIGSIG_REQUIRED) { - cause = "IMA-signature-required"; - status = INTEGRITY_FAIL; - break; - } - clear_bit(IMA_DIGSIG, &iint->atomic_flags); - if (xattr_len - sizeof(xattr_value->type) - hash_start >= - iint->ima_hash->length) - /* xattr length may be longer. md5 hash in previous - version occupied 20 bytes in xattr, instead of 16 - */ - rc = memcmp(&xattr_value->data[hash_start], - iint->ima_hash->digest, - iint->ima_hash->length); - else - rc = -EINVAL; - if (rc) { - cause = "invalid-hash"; - status = INTEGRITY_FAIL; - break; - } - status = INTEGRITY_PASS; - break; - case EVM_IMA_XATTR_DIGSIG: - set_bit(IMA_DIGSIG, &iint->atomic_flags); - rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, - (const char *)xattr_value, - xattr_len, - iint->ima_hash->digest, - iint->ima_hash->length); - if (rc == -EOPNOTSUPP) { - status = INTEGRITY_UNKNOWN; - break; - } - if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && - func == KEXEC_KERNEL_CHECK) - rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, - (const char *)xattr_value, - xattr_len, - iint->ima_hash->digest, - iint->ima_hash->length); - if (rc) { - cause = "invalid-signature"; - status = INTEGRITY_FAIL; - } else { - status = INTEGRITY_PASS; - } - break; - default: - status = INTEGRITY_UNKNOWN; - cause = "unknown-ima-data"; - break; - } + if (xattr_value) + rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, + &cause); out: /* From 39b07096364a42c516415d5f841069e885234e61 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:30 -0300 Subject: [PATCH 0042/2880] ima: Implement support for module-style appended signatures Implement the appraise_type=imasig|modsig option, allowing IMA to read and verify modsig signatures. In case a file has both an xattr signature and an appended modsig, IMA will only use the appended signature if the key used by the xattr signature isn't present in the IMA or platform keyring. Because modsig verification needs to convert from an integrity keyring id to the keyring itself, add an integrity_keyring_from_id() function in digsig.c so that integrity_modsig_verify() can use it. Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/digsig.c | 43 ++++++++++++---- security/integrity/ima/Kconfig | 3 ++ security/integrity/ima/ima.h | 22 ++++++++- security/integrity/ima/ima_appraise.c | 51 +++++++++++++++++-- security/integrity/ima/ima_main.c | 11 ++++- security/integrity/ima/ima_modsig.c | 71 +++++++++++++++++++++++++++ security/integrity/ima/ima_policy.c | 12 ++--- security/integrity/integrity.h | 19 +++++++ 8 files changed, 209 insertions(+), 23 deletions(-) diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index 868ade3e8970..ea1aae3d07b3 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -39,11 +39,10 @@ static const char * const keyring_name[INTEGRITY_KEYRING_MAX] = { #define restrict_link_to_ima restrict_link_by_builtin_trusted #endif -int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, - const char *digest, int digestlen) +static struct key *integrity_keyring_from_id(const unsigned int id) { - if (id >= INTEGRITY_KEYRING_MAX || siglen < 2) - return -EINVAL; + if (id >= INTEGRITY_KEYRING_MAX) + return ERR_PTR(-EINVAL); if (!keyring[id]) { keyring[id] = @@ -52,23 +51,49 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, int err = PTR_ERR(keyring[id]); pr_err("no %s keyring: %d\n", keyring_name[id], err); keyring[id] = NULL; - return err; + return ERR_PTR(err); } } + return keyring[id]; +} + +int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, + const char *digest, int digestlen) +{ + struct key *keyring; + + if (siglen < 2) + return -EINVAL; + + keyring = integrity_keyring_from_id(id); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + switch (sig[1]) { case 1: /* v1 API expect signature without xattr type */ - return digsig_verify(keyring[id], sig + 1, siglen - 1, - digest, digestlen); + return digsig_verify(keyring, sig + 1, siglen - 1, digest, + digestlen); case 2: - return asymmetric_verify(keyring[id], sig, siglen, - digest, digestlen); + return asymmetric_verify(keyring, sig, siglen, digest, + digestlen); } return -EOPNOTSUPP; } +int integrity_modsig_verify(const unsigned int id, const struct modsig *modsig) +{ + struct key *keyring; + + keyring = integrity_keyring_from_id(id); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + return ima_modsig_verify(keyring, modsig); +} + static int __init __integrity_init_keyring(const unsigned int id, key_perm_t perm, struct key_restriction *restriction) diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 8bf46646b185..897bafc59a33 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -236,6 +236,9 @@ config IMA_APPRAISE_BOOTPARAM config IMA_APPRAISE_MODSIG bool "Support module-style signatures for appraisal" depends on IMA_APPRAISE + depends on INTEGRITY_ASYMMETRIC_KEYS + select PKCS7_MESSAGE_PARSER + select MODULE_SIG_FORMAT default n help Adds support for signatures appended to files. The format of the diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index e21b06942858..ed8e19d38805 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -196,6 +196,10 @@ enum ima_hooks { __ima_hooks(__ima_hook_enumify) }; +extern const char *const func_tokens[]; + +struct modsig; + /* LIM API function definitions */ int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid, int mask, enum ima_hooks func, int *pcr, @@ -249,7 +253,7 @@ int ima_appraise_measurement(enum ima_hooks func, struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len); + int xattr_len, const struct modsig *modsig); int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint, @@ -265,7 +269,8 @@ static inline int ima_appraise_measurement(enum ima_hooks func, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len) + int xattr_len, + const struct modsig *modsig) { return INTEGRITY_UNKNOWN; } @@ -304,11 +309,24 @@ static inline int ima_read_xattr(struct dentry *dentry, #ifdef CONFIG_IMA_APPRAISE_MODSIG bool ima_hook_supports_modsig(enum ima_hooks func); +int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, + struct modsig **modsig); +void ima_free_modsig(struct modsig *modsig); #else static inline bool ima_hook_supports_modsig(enum ima_hooks func) { return false; } + +static inline int ima_read_modsig(enum ima_hooks func, const void *buf, + loff_t buf_len, struct modsig **modsig) +{ + return -EOPNOTSUPP; +} + +static inline void ima_free_modsig(struct modsig *modsig) +{ +} #endif /* CONFIG_IMA_APPRAISE_MODSIG */ /* LSM based policy rules require audit */ diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index d3298045737f..d21e40eaba42 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -276,6 +276,33 @@ static int xattr_verify(enum ima_hooks func, struct integrity_iint_cache *iint, return rc; } +/* + * modsig_verify - verify modsig signature + * + * Verify whether the signature matches the file contents. + * + * Return 0 on success, error code otherwise. + */ +static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, + enum integrity_status *status, const char **cause) +{ + int rc; + + rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); + if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && + func == KEXEC_KERNEL_CHECK) + rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, + modsig); + if (rc) { + *cause = "invalid-signature"; + *status = INTEGRITY_FAIL; + } else { + *status = INTEGRITY_PASS; + } + + return rc; +} + /* * ima_appraise_measurement - appraise file measurement * @@ -288,7 +315,7 @@ int ima_appraise_measurement(enum ima_hooks func, struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len) + int xattr_len, const struct modsig *modsig) { static const char op[] = "appraise_data"; const char *cause = "unknown"; @@ -296,11 +323,14 @@ int ima_appraise_measurement(enum ima_hooks func, struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; + bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; - if (!(inode->i_opflags & IOP_XATTR)) + /* If not appraising a modsig, we need an xattr. */ + if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; - if (rc <= 0) { + /* If reading the xattr failed and there's no modsig, error out. */ + if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; @@ -323,6 +353,10 @@ int ima_appraise_measurement(enum ima_hooks func, case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ + /* It's fine not to have xattrs when using a modsig. */ + if (try_modsig) + break; + /* fall through */ case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; @@ -337,6 +371,15 @@ int ima_appraise_measurement(enum ima_hooks func, rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); + /* + * If we have a modsig and either no imasig or the imasig's key isn't + * known, then try verifying the modsig. + */ + if (try_modsig && + (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || + rc == -ENOKEY)) + rc = modsig_verify(func, modsig, &status, &cause); + out: /* * File signatures on some filesystems can not be properly verified. @@ -353,7 +396,7 @@ out: op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ - if ((ima_appraise & IMA_APPRAISE_FIX) && + if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 584019728660..d8672e850615 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -202,6 +202,7 @@ static int process_measurement(struct file *file, const struct cred *cred, int rc = 0, action, must_appraise = 0; int pcr = CONFIG_IMA_MEASURE_PCR_IDX; struct evm_ima_xattr_data *xattr_value = NULL; + struct modsig *modsig = NULL; int xattr_len = 0; bool violation_check; enum hash_algo hash_algo; @@ -302,10 +303,15 @@ static int process_measurement(struct file *file, const struct cred *cred, } if ((action & IMA_APPRAISE_SUBMASK) || - strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) + strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) { /* read 'security.ima' */ xattr_len = ima_read_xattr(file_dentry(file), &xattr_value); + /* Read the appended modsig if allowed by the policy. */ + if (iint->flags & IMA_MODSIG_ALLOWED) + ima_read_modsig(func, buf, size, &modsig); + } + hash_algo = ima_get_hash_algo(xattr_value, xattr_len); rc = ima_collect_measurement(iint, file, buf, size, hash_algo); @@ -322,7 +328,7 @@ static int process_measurement(struct file *file, const struct cred *cred, if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) { inode_lock(inode); rc = ima_appraise_measurement(func, iint, file, pathname, - xattr_value, xattr_len); + xattr_value, xattr_len, modsig); inode_unlock(inode); if (!rc) rc = mmap_violation_check(func, file, &pathbuf, @@ -339,6 +345,7 @@ out_locked: rc = -EACCES; mutex_unlock(&iint->mutex); kfree(xattr_value); + ima_free_modsig(modsig); out: if (pathbuf) __putname(pathbuf); diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c index 87503bfe8c8b..f41ebe370fa0 100644 --- a/security/integrity/ima/ima_modsig.c +++ b/security/integrity/ima/ima_modsig.c @@ -8,8 +8,17 @@ * Thiago Jung Bauermann */ +#include +#include +#include +#include + #include "ima.h" +struct modsig { + struct pkcs7_message *pkcs7_msg; +}; + /** * ima_hook_supports_modsig - can the policy allow modsig for this hook? * @@ -29,3 +38,65 @@ bool ima_hook_supports_modsig(enum ima_hooks func) return false; } } + +/* + * ima_read_modsig - Read modsig from buf. + * + * Return: 0 on success, error code otherwise. + */ +int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, + struct modsig **modsig) +{ + const size_t marker_len = strlen(MODULE_SIG_STRING); + const struct module_signature *sig; + struct modsig *hdr; + size_t sig_len; + const void *p; + int rc; + + if (buf_len <= marker_len + sizeof(*sig)) + return -ENOENT; + + p = buf + buf_len - marker_len; + if (memcmp(p, MODULE_SIG_STRING, marker_len)) + return -ENOENT; + + buf_len -= marker_len; + sig = (const struct module_signature *)(p - sizeof(*sig)); + + rc = mod_check_sig(sig, buf_len, func_tokens[func]); + if (rc) + return rc; + + sig_len = be32_to_cpu(sig->sig_len); + buf_len -= sig_len + sizeof(*sig); + + hdr = kmalloc(sizeof(*hdr), GFP_KERNEL); + if (!hdr) + return -ENOMEM; + + hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len); + if (IS_ERR(hdr->pkcs7_msg)) { + kfree(hdr); + return PTR_ERR(hdr->pkcs7_msg); + } + + *modsig = hdr; + + return 0; +} + +int ima_modsig_verify(struct key *keyring, const struct modsig *modsig) +{ + return verify_pkcs7_message_sig(NULL, 0, modsig->pkcs7_msg, keyring, + VERIFYING_MODULE_SIGNATURE, NULL, NULL); +} + +void ima_free_modsig(struct modsig *modsig) +{ + if (!modsig) + return; + + pkcs7_free_message(modsig->pkcs7_msg); + kfree(modsig); +} diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 5b6061d6bce0..873dd7edaa78 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1258,6 +1258,12 @@ void ima_delete_rules(void) } } +#define __ima_hook_stringify(str) (#str), + +const char *const func_tokens[] = { + __ima_hooks(__ima_hook_stringify) +}; + #ifdef CONFIG_IMA_READ_POLICY enum { mask_exec = 0, mask_write, mask_read, mask_append @@ -1270,12 +1276,6 @@ static const char *const mask_tokens[] = { "^MAY_APPEND" }; -#define __ima_hook_stringify(str) (#str), - -static const char *const func_tokens[] = { - __ima_hooks(__ima_hook_stringify) -}; - void *ima_policy_start(struct seq_file *m, loff_t *pos) { loff_t l = *pos; diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 8c5736b68156..d9323d31a3a8 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -148,10 +148,13 @@ int integrity_kernel_read(struct file *file, loff_t offset, extern struct dentry *integrity_dir; +struct modsig; + #ifdef CONFIG_INTEGRITY_SIGNATURE int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, const char *digest, int digestlen); +int integrity_modsig_verify(unsigned int id, const struct modsig *modsig); int __init integrity_init_keyring(const unsigned int id); int __init integrity_load_x509(const unsigned int id, const char *path); @@ -166,6 +169,12 @@ static inline int integrity_digsig_verify(const unsigned int id, return -EOPNOTSUPP; } +static inline int integrity_modsig_verify(unsigned int id, + const struct modsig *modsig) +{ + return -EOPNOTSUPP; +} + static inline int integrity_init_keyring(const unsigned int id) { return 0; @@ -191,6 +200,16 @@ static inline int asymmetric_verify(struct key *keyring, const char *sig, } #endif +#ifdef CONFIG_IMA_APPRAISE_MODSIG +int ima_modsig_verify(struct key *keyring, const struct modsig *modsig); +#else +static inline int ima_modsig_verify(struct key *keyring, + const struct modsig *modsig) +{ + return -EOPNOTSUPP; +} +#endif + #ifdef CONFIG_IMA_LOAD_X509 void __init ima_load_x509(void); #else From 15588227e086ec662d59df144e48af82e3e592f1 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:31 -0300 Subject: [PATCH 0043/2880] ima: Collect modsig Obtain the modsig and calculate its corresponding hash in ima_collect_measurement(). Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 8 ++++- security/integrity/ima/ima_api.c | 5 ++- security/integrity/ima/ima_appraise.c | 2 +- security/integrity/ima/ima_main.c | 2 +- security/integrity/ima/ima_modsig.c | 48 ++++++++++++++++++++++++++- 5 files changed, 60 insertions(+), 5 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index ed8e19d38805..0bc764c80327 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -207,7 +207,7 @@ int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid, int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func); int ima_collect_measurement(struct integrity_iint_cache *iint, struct file *file, void *buf, loff_t size, - enum hash_algo algo); + enum hash_algo algo, struct modsig *modsig); void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, @@ -311,6 +311,7 @@ static inline int ima_read_xattr(struct dentry *dentry, bool ima_hook_supports_modsig(enum ima_hooks func); int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig); +void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size); void ima_free_modsig(struct modsig *modsig); #else static inline bool ima_hook_supports_modsig(enum ima_hooks func) @@ -324,6 +325,11 @@ static inline int ima_read_modsig(enum ima_hooks func, const void *buf, return -EOPNOTSUPP; } +static inline void ima_collect_modsig(struct modsig *modsig, const void *buf, + loff_t size) +{ +} + static inline void ima_free_modsig(struct modsig *modsig) { } diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index f614e22bf39f..ff8b7fb03ea0 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -205,7 +205,7 @@ int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid, */ int ima_collect_measurement(struct integrity_iint_cache *iint, struct file *file, void *buf, loff_t size, - enum hash_algo algo) + enum hash_algo algo, struct modsig *modsig) { const char *audit_cause = "failed"; struct inode *inode = file_inode(file); @@ -252,6 +252,9 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, memcpy(iint->ima_hash, &hash, length); iint->version = i_version; + if (modsig) + ima_collect_modsig(modsig, buf, size); + /* Possibly temporary failure due to type of read (eg. O_DIRECT) */ if (!result) iint->flags |= IMA_COLLECTED; diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index d21e40eaba42..136ae4e0ee92 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -435,7 +435,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) !(iint->flags & IMA_HASH)) return; - rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo); + rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index d8672e850615..7c8d92cf2755 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -314,7 +314,7 @@ static int process_measurement(struct file *file, const struct cred *cred, hash_algo = ima_get_hash_algo(xattr_value, xattr_len); - rc = ima_collect_measurement(iint, file, buf, size, hash_algo); + rc = ima_collect_measurement(iint, file, buf, size, hash_algo, modsig); if (rc != 0 && rc != -EBADF && rc != -EINVAL) goto out_locked; diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c index f41ebe370fa0..257387ce3b70 100644 --- a/security/integrity/ima/ima_modsig.c +++ b/security/integrity/ima/ima_modsig.c @@ -17,6 +17,19 @@ struct modsig { struct pkcs7_message *pkcs7_msg; + + enum hash_algo hash_algo; + + /* This digest will go in the 'd-modsig' field of the IMA template. */ + const u8 *digest; + u32 digest_size; + + /* + * This is what will go to the measurement list if the template requires + * storing the signature. + */ + int raw_pkcs7_len; + u8 raw_pkcs7[]; }; /** @@ -71,7 +84,8 @@ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, sig_len = be32_to_cpu(sig->sig_len); buf_len -= sig_len + sizeof(*sig); - hdr = kmalloc(sizeof(*hdr), GFP_KERNEL); + /* Allocate sig_len additional bytes to hold the raw PKCS#7 data. */ + hdr = kzalloc(sizeof(*hdr) + sig_len, GFP_KERNEL); if (!hdr) return -ENOMEM; @@ -81,11 +95,43 @@ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, return PTR_ERR(hdr->pkcs7_msg); } + memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len); + hdr->raw_pkcs7_len = sig_len; + + /* We don't know the hash algorithm yet. */ + hdr->hash_algo = HASH_ALGO__LAST; + *modsig = hdr; return 0; } +/** + * ima_collect_modsig - Calculate the file hash without the appended signature. + * + * Since the modsig is part of the file contents, the hash used in its signature + * isn't the same one ordinarily calculated by IMA. Therefore PKCS7 code + * calculates a separate one for signature verification. + */ +void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size) +{ + int rc; + + /* + * Provide the file contents (minus the appended sig) so that the PKCS7 + * code can calculate the file hash. + */ + size -= modsig->raw_pkcs7_len + strlen(MODULE_SIG_STRING) + + sizeof(struct module_signature); + rc = pkcs7_supply_detached_data(modsig->pkcs7_msg, buf, size); + if (rc) + return; + + /* Ask the PKCS7 code to calculate the file hash. */ + rc = pkcs7_get_digest(modsig->pkcs7_msg, &modsig->digest, + &modsig->digest_size, &modsig->hash_algo); +} + int ima_modsig_verify(struct key *keyring, const struct modsig *modsig) { return verify_pkcs7_message_sig(NULL, 0, modsig->pkcs7_msg, keyring, From 3878d505aa718bcc7b1eb4089ab9b9fb27dee957 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:32 -0300 Subject: [PATCH 0044/2880] ima: Define ima-modsig template Define new "d-modsig" template field which holds the digest that is expected to match the one contained in the modsig, and also new "modsig" template field which holds the appended file signature. Add a new "ima-modsig" defined template descriptor with the new fields as well as the ones from the "ima-sig" descriptor. Change ima_store_measurement() to accept a struct modsig * argument so that it can be passed along to the templates via struct ima_event_data. Suggested-by: Mimi Zohar Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- Documentation/security/IMA-templates.rst | 3 ++ security/integrity/ima/ima.h | 20 ++++++- security/integrity/ima/ima_api.c | 5 +- security/integrity/ima/ima_main.c | 2 +- security/integrity/ima/ima_modsig.c | 19 +++++++ security/integrity/ima/ima_policy.c | 41 +++++++++++++++ security/integrity/ima/ima_template.c | 7 ++- security/integrity/ima/ima_template_lib.c | 64 ++++++++++++++++++++++- security/integrity/ima/ima_template_lib.h | 4 ++ 9 files changed, 159 insertions(+), 6 deletions(-) diff --git a/Documentation/security/IMA-templates.rst b/Documentation/security/IMA-templates.rst index 3d1cca287aa4..c5a8432972ef 100644 --- a/Documentation/security/IMA-templates.rst +++ b/Documentation/security/IMA-templates.rst @@ -68,8 +68,10 @@ descriptors by adding their identifier to the format string - 'd-ng': the digest of the event, calculated with an arbitrary hash algorithm (field format: [:]digest, where the digest prefix is shown only if the hash algorithm is not SHA1 or MD5); + - 'd-modsig': the digest of the event without the appended modsig; - 'n-ng': the name of the event, without size limitations; - 'sig': the file signature; + - 'modsig' the appended file signature; - 'buf': the buffer data that was used to generate the hash without size limitations; @@ -79,6 +81,7 @@ Below, there is the list of defined template descriptors: - "ima-ng" (default): its format is ``d-ng|n-ng``; - "ima-sig": its format is ``d-ng|n-ng|sig``; - "ima-buf": its format is ``d-ng|n-ng|buf``; + - "ima-modsig": its format is ``d-ng|n-ng|sig|d-modsig|modsig``; Use diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 0bc764c80327..cfc8b0887776 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -60,6 +60,7 @@ struct ima_event_data { const unsigned char *filename; struct evm_ima_xattr_data *xattr_value; int xattr_len; + const struct modsig *modsig; const char *violation; const void *buf; int buf_len; @@ -211,7 +212,7 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len, int pcr, + int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); void ima_audit_measurement(struct integrity_iint_cache *iint, const unsigned char *filename); @@ -312,6 +313,10 @@ bool ima_hook_supports_modsig(enum ima_hooks func); int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig); void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size); +int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, + const u8 **digest, u32 *digest_size); +int ima_get_raw_modsig(const struct modsig *modsig, const void **data, + u32 *data_len); void ima_free_modsig(struct modsig *modsig); #else static inline bool ima_hook_supports_modsig(enum ima_hooks func) @@ -330,6 +335,19 @@ static inline void ima_collect_modsig(struct modsig *modsig, const void *buf, { } +static inline int ima_get_modsig_digest(const struct modsig *modsig, + enum hash_algo *algo, const u8 **digest, + u32 *digest_size) +{ + return -EOPNOTSUPP; +} + +static inline int ima_get_raw_modsig(const struct modsig *modsig, + const void **data, u32 *data_len) +{ + return -EOPNOTSUPP; +} + static inline void ima_free_modsig(struct modsig *modsig) { } diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index ff8b7fb03ea0..ca930e2ebc2c 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -288,7 +288,7 @@ out: void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len, int pcr, + int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc) { static const char op[] = "add_template_measure"; @@ -300,7 +300,8 @@ void ima_store_measurement(struct integrity_iint_cache *iint, .file = file, .filename = filename, .xattr_value = xattr_value, - .xattr_len = xattr_len }; + .xattr_len = xattr_len, + .modsig = modsig }; int violation = 0; if (iint->measured_pcrs & (0x1 << pcr)) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 7c8d92cf2755..c87645c2c4c0 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -323,7 +323,7 @@ static int process_measurement(struct file *file, const struct cred *cred, if (action & IMA_MEASURE) ima_store_measurement(iint, file, pathname, - xattr_value, xattr_len, pcr, + xattr_value, xattr_len, modsig, pcr, template_desc); if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) { inode_lock(inode); diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c index 257387ce3b70..c412e31d1714 100644 --- a/security/integrity/ima/ima_modsig.c +++ b/security/integrity/ima/ima_modsig.c @@ -138,6 +138,25 @@ int ima_modsig_verify(struct key *keyring, const struct modsig *modsig) VERIFYING_MODULE_SIGNATURE, NULL, NULL); } +int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, + const u8 **digest, u32 *digest_size) +{ + *algo = modsig->hash_algo; + *digest = modsig->digest; + *digest_size = modsig->digest_size; + + return 0; +} + +int ima_get_raw_modsig(const struct modsig *modsig, const void **data, + u32 *data_len) +{ + *data = &modsig->raw_pkcs7; + *data_len = modsig->raw_pkcs7_len; + + return 0; +} + void ima_free_modsig(struct modsig *modsig) { if (!modsig) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 873dd7edaa78..4badc4fcda98 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -6,6 +6,9 @@ * ima_policy.c * - initialize default measure policy rules */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -845,6 +848,38 @@ static void ima_log_string(struct audit_buffer *ab, char *key, char *value) ima_log_string_op(ab, key, value, NULL); } +/* + * Validating the appended signature included in the measurement list requires + * the file hash calculated without the appended signature (i.e., the 'd-modsig' + * field). Therefore, notify the user if they have the 'modsig' field but not + * the 'd-modsig' field in the template. + */ +static void check_template_modsig(const struct ima_template_desc *template) +{ +#define MSG "template with 'modsig' field also needs 'd-modsig' field\n" + bool has_modsig, has_dmodsig; + static bool checked; + int i; + + /* We only need to notify the user once. */ + if (checked) + return; + + has_modsig = has_dmodsig = false; + for (i = 0; i < template->num_fields; i++) { + if (!strcmp(template->fields[i]->field_id, "modsig")) + has_modsig = true; + else if (!strcmp(template->fields[i]->field_id, "d-modsig")) + has_dmodsig = true; + } + + if (has_modsig && !has_dmodsig) + pr_notice(MSG); + + checked = true; +#undef MSG +} + static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) { struct audit_buffer *ab; @@ -1187,6 +1222,12 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) else if (entry->action == APPRAISE) temp_ima_appraise |= ima_appraise_flag(entry->func); + if (!result && entry->flags & IMA_MODSIG_ALLOWED) { + template_desc = entry->template ? entry->template : + ima_template_desc_current(); + check_template_modsig(template_desc); + } + audit_log_format(ab, "res=%d", !result); audit_log_end(ab); return result; diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index cb349d7b2601..88d494ca6248 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -23,6 +23,7 @@ static struct ima_template_desc builtin_templates[] = { {.name = "ima-ng", .fmt = "d-ng|n-ng"}, {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"}, {.name = "ima-buf", .fmt = "d-ng|n-ng|buf"}, + {.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"}, {.name = "", .fmt = ""}, /* placeholder for a custom format */ }; @@ -42,6 +43,10 @@ static const struct ima_template_field supported_fields[] = { .field_show = ima_show_template_sig}, {.field_id = "buf", .field_init = ima_eventbuf_init, .field_show = ima_show_template_buf}, + {.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init, + .field_show = ima_show_template_digest_ng}, + {.field_id = "modsig", .field_init = ima_eventmodsig_init, + .field_show = ima_show_template_sig}, }; /* @@ -49,7 +54,7 @@ static const struct ima_template_field supported_fields[] = { * need to be accounted for since they shouldn't be defined in the same template * description as 'd-ng' and 'n-ng' respectively. */ -#define MAX_TEMPLATE_NAME_LEN sizeof("d-ng|n-ng|sig|buf") +#define MAX_TEMPLATE_NAME_LEN sizeof("d-ng|n-ng|sig|buf|d-modisg|modsig") static struct ima_template_desc *ima_template; diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index 2fb9a10bc6b7..32ae05d88257 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -225,7 +225,8 @@ int ima_parse_buf(void *bufstartp, void *bufendp, void **bufcurp, return 0; } -static int ima_eventdigest_init_common(u8 *digest, u32 digestsize, u8 hash_algo, +static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, + u8 hash_algo, struct ima_field_data *field_data) { /* @@ -328,6 +329,41 @@ out: hash_algo, field_data); } +/* + * This function writes the digest of the file which is expected to match the + * digest contained in the file's appended signature. + */ +int ima_eventdigest_modsig_init(struct ima_event_data *event_data, + struct ima_field_data *field_data) +{ + enum hash_algo hash_algo; + const u8 *cur_digest; + u32 cur_digestsize; + + if (!event_data->modsig) + return 0; + + if (event_data->violation) { + /* Recording a violation. */ + hash_algo = HASH_ALGO_SHA1; + cur_digest = NULL; + cur_digestsize = 0; + } else { + int rc; + + rc = ima_get_modsig_digest(event_data->modsig, &hash_algo, + &cur_digest, &cur_digestsize); + if (rc) + return rc; + else if (hash_algo == HASH_ALGO__LAST || cur_digestsize == 0) + /* There was some error collecting the digest. */ + return -EINVAL; + } + + return ima_eventdigest_init_common(cur_digest, cur_digestsize, + hash_algo, field_data); +} + static int ima_eventname_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool size_limit) @@ -406,3 +442,29 @@ int ima_eventbuf_init(struct ima_event_data *event_data, event_data->buf_len, DATA_FMT_HEX, field_data); } + +/* + * ima_eventmodsig_init - include the appended file signature as part of the + * template data + */ +int ima_eventmodsig_init(struct ima_event_data *event_data, + struct ima_field_data *field_data) +{ + const void *data; + u32 data_len; + int rc; + + if (!event_data->modsig) + return 0; + + /* + * modsig is a runtime structure containing pointers. Get its raw data + * instead. + */ + rc = ima_get_raw_modsig(event_data->modsig, &data, &data_len); + if (rc) + return rc; + + return ima_write_template_field_data(data, data_len, DATA_FMT_HEX, + field_data); +} diff --git a/security/integrity/ima/ima_template_lib.h b/security/integrity/ima/ima_template_lib.h index 652aa5de81ef..9a88c79a7a61 100644 --- a/security/integrity/ima/ima_template_lib.h +++ b/security/integrity/ima/ima_template_lib.h @@ -36,10 +36,14 @@ int ima_eventname_init(struct ima_event_data *event_data, struct ima_field_data *field_data); int ima_eventdigest_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data); +int ima_eventdigest_modsig_init(struct ima_event_data *event_data, + struct ima_field_data *field_data); int ima_eventname_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data); int ima_eventsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data); int ima_eventbuf_init(struct ima_event_data *event_data, struct ima_field_data *field_data); +int ima_eventmodsig_init(struct ima_event_data *event_data, + struct ima_field_data *field_data); #endif /* __LINUX_IMA_TEMPLATE_LIB_H */ From e5092255bb3967bcc473dc86492dbbd5f7714023 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 27 Jun 2019 23:19:33 -0300 Subject: [PATCH 0045/2880] ima: Store the measurement again when appraising a modsig If the IMA template contains the "modsig" or "d-modsig" field, then the modsig should be added to the measurement list when the file is appraised. And that is what normally happens, but if a measurement rule caused a file containing a modsig to be measured before a different rule causes it to be appraised, the resulting measurement entry will not contain the modsig because it is only fetched during appraisal. When the appraisal rule triggers, it won't store a new measurement containing the modsig because the file was already measured. We need to detect that situation and store an additional measurement with the modsig. This is done by adding an IMA_MEASURE action flag if we read a modsig and the IMA template contains a modsig field. Suggested-by: Mimi Zohar Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 1 + security/integrity/ima/ima_api.c | 19 +++++++++++++++---- security/integrity/ima/ima_main.c | 15 ++++++++++++--- security/integrity/ima/ima_template.c | 19 +++++++++++++++++++ 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index cfc8b0887776..19769bf5f6ab 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -150,6 +150,7 @@ int template_desc_init_fields(const char *template_fmt, int *num_fields); struct ima_template_desc *ima_template_desc_current(void); struct ima_template_desc *lookup_template_desc(const char *name); +bool ima_template_has_modsig(const struct ima_template_desc *ima_template); int ima_restore_measurement_entry(struct ima_template_entry *entry); int ima_restore_measurement_list(loff_t bufsize, void *buf); int ima_measurements_show(struct seq_file *m, void *v); diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index ca930e2ebc2c..65224474675b 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -219,6 +219,14 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, char digest[IMA_MAX_DIGEST_SIZE]; } hash; + /* + * Always collect the modsig, because IMA might have already collected + * the file digest without collecting the modsig in a previous + * measurement rule. + */ + if (modsig) + ima_collect_modsig(modsig, buf, size); + if (iint->flags & IMA_COLLECTED) goto out; @@ -252,9 +260,6 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, memcpy(iint->ima_hash, &hash, length); iint->version = i_version; - if (modsig) - ima_collect_modsig(modsig, buf, size); - /* Possibly temporary failure due to type of read (eg. O_DIRECT) */ if (!result) iint->flags |= IMA_COLLECTED; @@ -304,7 +309,13 @@ void ima_store_measurement(struct integrity_iint_cache *iint, .modsig = modsig }; int violation = 0; - if (iint->measured_pcrs & (0x1 << pcr)) + /* + * We still need to store the measurement in the case of MODSIG because + * we only have its contents to put in the list at the time of + * appraisal, but a file measurement from earlier might already exist in + * the measurement list. + */ + if (iint->measured_pcrs & (0x1 << pcr) && !modsig) return; result = ima_alloc_init_template(&event_data, &entry, template_desc); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index c87645c2c4c0..79c01516211b 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -307,9 +307,18 @@ static int process_measurement(struct file *file, const struct cred *cred, /* read 'security.ima' */ xattr_len = ima_read_xattr(file_dentry(file), &xattr_value); - /* Read the appended modsig if allowed by the policy. */ - if (iint->flags & IMA_MODSIG_ALLOWED) - ima_read_modsig(func, buf, size, &modsig); + /* + * Read the appended modsig if allowed by the policy, and allow + * an additional measurement list entry, if needed, based on the + * template format and whether the file was already measured. + */ + if (iint->flags & IMA_MODSIG_ALLOWED) { + rc = ima_read_modsig(func, buf, size, &modsig); + + if (!rc && ima_template_has_modsig(template_desc) && + iint->flags & IMA_MEASURED) + action |= IMA_MEASURE; + } } hash_algo = ima_get_hash_algo(xattr_value, xattr_len); diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index 88d494ca6248..f5b950e0a955 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -58,6 +58,25 @@ static const struct ima_template_field supported_fields[] = { static struct ima_template_desc *ima_template; +/** + * ima_template_has_modsig - Check whether template has modsig-related fields. + * @ima_template: IMA template to check. + * + * Tells whether the given template has fields referencing a file's appended + * signature. + */ +bool ima_template_has_modsig(const struct ima_template_desc *ima_template) +{ + int i; + + for (i = 0; i < ima_template->num_fields; i++) + if (!strcmp(ima_template->fields[i]->field_id, "modsig") || + !strcmp(ima_template->fields[i]->field_id, "d-modsig")) + return true; + + return false; +} + static int __init ima_template_setup(char *str) { struct ima_template_desc *template_desc; From f5e1040196dbfe14c77ce3dfe3b7b08d2d961e88 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 2 Jul 2019 10:00:40 +0200 Subject: [PATCH 0046/2880] ima: always return negative code for error integrity_kernel_read() returns the number of bytes read. If this is a short read then this positive value is returned from ima_calc_file_hash_atfm(). Currently this is only indirectly called from ima_calc_file_hash() and this function only tests for the return value being zero or nonzero and also doesn't forward the return value. Nevertheless there's no point in returning a positive value as an error, so translate a short read into -EINVAL. Signed-off-by: Sascha Hauer Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_crypto.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index d4c7b8e1b083..7532b062be59 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -268,8 +268,11 @@ static int ima_calc_file_hash_atfm(struct file *file, rbuf_len = min_t(loff_t, i_size - offset, rbuf_size[active]); rc = integrity_kernel_read(file, offset, rbuf[active], rbuf_len); - if (rc != rbuf_len) + if (rc != rbuf_len) { + if (rc >= 0) + rc = -EINVAL; goto out3; + } if (rbuf[1] && offset) { /* Using two buffers, and it is not the first From 4ece3125f21b1d42b84896c5646dbf0e878464e1 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 2 Jul 2019 10:00:41 +0200 Subject: [PATCH 0047/2880] ima: fix freeing ongoing ahash_request integrity_kernel_read() can fail in which case we forward to call ahash_request_free() on a currently running request. We have to wait for its completion before we can free the request. This was observed by interrupting a "find / -type f -xdev -print0 | xargs -0 cat 1>/dev/null" with ctrl-c on an IMA enabled filesystem. Signed-off-by: Sascha Hauer Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_crypto.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 7532b062be59..73044fc6a952 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -271,6 +271,11 @@ static int ima_calc_file_hash_atfm(struct file *file, if (rc != rbuf_len) { if (rc >= 0) rc = -EINVAL; + /* + * Forward current rc, do not overwrite with return value + * from ahash_wait() + */ + ahash_wait(ahash_rc, &wait); goto out3; } From 4af9027d3f4061992c0b065102a0a666b72f073b Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 17:02:26 +0800 Subject: [PATCH 0048/2880] csky/dma: Fixup cache_op failed when cross memory ZONEs If the paddr and size are cross between NORMAL_ZONE and HIGHMEM_ZONE memory range, cache_op will panic in do_page_fault with bad_area. Optimize the code to support the range which cross memory ZONEs. Changes for V2: - Revert back to postcore_initcall Signed-off-by: Guo Ren Cc: Christoph Hellwig Cc: Arnd Bergmann --- arch/csky/mm/dma-mapping.c | 71 ++++++++++++++------------------------ 1 file changed, 26 insertions(+), 45 deletions(-) diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 80783bb71c5c..65f531d54814 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -20,69 +20,50 @@ static int __init atomic_pool_init(void) } postcore_initcall(atomic_pool_init); -void arch_dma_prep_coherent(struct page *page, size_t size) -{ - if (PageHighMem(page)) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - - do { - void *ptr = kmap_atomic(page); - size_t _size = (size < PAGE_SIZE) ? size : PAGE_SIZE; - - memset(ptr, 0, _size); - dma_wbinv_range((unsigned long)ptr, - (unsigned long)ptr + _size); - - kunmap_atomic(ptr); - - page++; - size -= PAGE_SIZE; - count--; - } while (count); - } else { - void *ptr = page_address(page); - - memset(ptr, 0, size); - dma_wbinv_range((unsigned long)ptr, (unsigned long)ptr + size); - } -} - static inline void cache_op(phys_addr_t paddr, size_t size, void (*fn)(unsigned long start, unsigned long end)) { - struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); - unsigned int offset = paddr & ~PAGE_MASK; - size_t left = size; - unsigned long start; + struct page *page = phys_to_page(paddr); + void *start = __va(page_to_phys(page)); + unsigned long offset = offset_in_page(paddr); + size_t left = size; do { size_t len = left; + if (offset + len > PAGE_SIZE) + len = PAGE_SIZE - offset; + if (PageHighMem(page)) { - void *addr; + start = kmap_atomic(page); - if (offset + len > PAGE_SIZE) { - if (offset >= PAGE_SIZE) { - page += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; - } - len = PAGE_SIZE - offset; - } + fn((unsigned long)start + offset, + (unsigned long)start + offset + len); - addr = kmap_atomic(page); - start = (unsigned long)(addr + offset); - fn(start, start + len); - kunmap_atomic(addr); + kunmap_atomic(start); } else { - start = (unsigned long)phys_to_virt(paddr); - fn(start, start + size); + fn((unsigned long)start + offset, + (unsigned long)start + offset + len); } offset = 0; + page++; + start += PAGE_SIZE; left -= len; } while (left); } +static void dma_wbinv_set_zero_range(unsigned long start, unsigned long end) +{ + memset((void *)start, 0, end - start); + dma_wbinv_range(start, end); +} + +void arch_dma_prep_coherent(struct page *page, size_t size) +{ + cache_op(page_to_phys(page), size, dma_wbinv_set_zero_range); +} + void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir) { From ae76f635d4e1cffa6870cc5472567ca9d6940a22 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 17:16:28 +0800 Subject: [PATCH 0049/2880] csky: Optimize arch_sync_dma_for_cpu/device with dma_inv_range DMA_FROM_DEVICE only need to read dma data of memory into CPU cache, so there is no need to clear cache before. Also clear + inv for DMA_FROM_DEVICE won't cause problem, because the memory range for dma won't be touched by software during dma working. Changes for V2: - Remove clr cache and ignore the DMA_TO_DEVICE in _for_cpu. - Change inv to wbinv cache with DMA_FROM_DEVICE in _for_device. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/include/asm/cache.h | 1 + arch/csky/mm/cachev1.c | 7 ++++++- arch/csky/mm/cachev2.c | 11 ++++++++++- arch/csky/mm/dma-mapping.c | 5 ++--- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/csky/include/asm/cache.h b/arch/csky/include/asm/cache.h index d68373463676..1d5fc2f78fd7 100644 --- a/arch/csky/include/asm/cache.h +++ b/arch/csky/include/asm/cache.h @@ -24,6 +24,7 @@ void cache_wbinv_range(unsigned long start, unsigned long end); void cache_wbinv_all(void); void dma_wbinv_range(unsigned long start, unsigned long end); +void dma_inv_range(unsigned long start, unsigned long end); void dma_wb_range(unsigned long start, unsigned long end); #endif diff --git a/arch/csky/mm/cachev1.c b/arch/csky/mm/cachev1.c index b8a75cce0b8c..494ec912abff 100644 --- a/arch/csky/mm/cachev1.c +++ b/arch/csky/mm/cachev1.c @@ -120,7 +120,12 @@ void dma_wbinv_range(unsigned long start, unsigned long end) cache_op_range(start, end, DATA_CACHE|CACHE_CLR|CACHE_INV, 1); } +void dma_inv_range(unsigned long start, unsigned long end) +{ + cache_op_range(start, end, DATA_CACHE|CACHE_CLR|CACHE_INV, 1); +} + void dma_wb_range(unsigned long start, unsigned long end) { - cache_op_range(start, end, DATA_CACHE|CACHE_INV, 1); + cache_op_range(start, end, DATA_CACHE|CACHE_CLR|CACHE_INV, 1); } diff --git a/arch/csky/mm/cachev2.c b/arch/csky/mm/cachev2.c index baaf05d69f44..b61be6518e21 100644 --- a/arch/csky/mm/cachev2.c +++ b/arch/csky/mm/cachev2.c @@ -69,11 +69,20 @@ void dma_wbinv_range(unsigned long start, unsigned long end) sync_is(); } +void dma_inv_range(unsigned long start, unsigned long end) +{ + unsigned long i = start & ~(L1_CACHE_BYTES - 1); + + for (; i < end; i += L1_CACHE_BYTES) + asm volatile("dcache.iva %0\n"::"r"(i):"memory"); + sync_is(); +} + void dma_wb_range(unsigned long start, unsigned long end) { unsigned long i = start & ~(L1_CACHE_BYTES - 1); for (; i < end; i += L1_CACHE_BYTES) - asm volatile("dcache.civa %0\n"::"r"(i):"memory"); + asm volatile("dcache.cva %0\n"::"r"(i):"memory"); sync_is(); } diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 65f531d54814..106ef02a8f89 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -85,11 +85,10 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, { switch (dir) { case DMA_TO_DEVICE: - cache_op(paddr, size, dma_wb_range); - break; + return; case DMA_FROM_DEVICE: case DMA_BIDIRECTIONAL: - cache_op(paddr, size, dma_wbinv_range); + cache_op(paddr, size, dma_inv_range); break; default: BUG(); From 70433f67ec3a54710744902d782f8954325e25b8 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 6 Aug 2019 12:15:19 +1000 Subject: [PATCH 0050/2880] MODSIGN: make new include file self contained Fixes: c8424e776b09 ("MODSIGN: Export module signature definitions") Signed-off-by: Stephen Rothwell Signed-off-by: Mimi Zohar --- include/linux/module_signature.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h index 523617fc5b6a..7eb4b00381ac 100644 --- a/include/linux/module_signature.h +++ b/include/linux/module_signature.h @@ -9,6 +9,8 @@ #ifndef _LINUX_MODULE_SIGNATURE_H #define _LINUX_MODULE_SIGNATURE_H +#include + /* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ #define MODULE_SIG_STRING "~Module signature appended~\n" From f29b7f39c0acbc66f18309227719460adab6851d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 6 Aug 2019 22:17:51 +0200 Subject: [PATCH 0051/2880] Revert "dt-bindings: i2c: rcar: Rename bindings documentation file" This reverts commit d13ed84b195cc6e5789b446f07aede357939f7ad. I overlooked that the Rev-by tag was given in advance assuming a fix was made for the next revision. Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/{renesas,i2c.txt => i2c-rcar.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{renesas,i2c.txt => i2c-rcar.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/renesas,i2c.txt b/Documentation/devicetree/bindings/i2c/i2c-rcar.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/renesas,i2c.txt rename to Documentation/devicetree/bindings/i2c/i2c-rcar.txt diff --git a/MAINTAINERS b/MAINTAINERS index f89f27a2d09a..119d2542c897 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13650,7 +13650,7 @@ F: drivers/iio/adc/rcar-gyroadc.c RENESAS R-CAR I2C DRIVERS M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/renesas,i2c.txt +F: Documentation/devicetree/bindings/i2c/i2c-rcar.txt F: Documentation/devicetree/bindings/i2c/renesas,iic.txt F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c From e611ee0b3b06b4d28216567a251e4de46e595b5d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 6 Aug 2019 22:19:32 +0200 Subject: [PATCH 0052/2880] Revert "dt-bindings: i2c: riic: Rename bindings documentation file" This reverts commit 684ca71259a69c5a3019da72fe718bf983841926. I overlooked that the Rev-by tag was given in advance assuming a fix was made for the next revision. Signed-off-by: Wolfram Sang --- .../bindings/i2c/{renesas,iic-emev2.txt => i2c-emev2.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{renesas,iic-emev2.txt => i2c-emev2.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt b/Documentation/devicetree/bindings/i2c/i2c-emev2.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt rename to Documentation/devicetree/bindings/i2c/i2c-emev2.txt diff --git a/MAINTAINERS b/MAINTAINERS index 119d2542c897..b49db5ad0b64 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13628,7 +13628,7 @@ F: drivers/clk/renesas/ RENESAS EMEV2 I2C DRIVER M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt +F: Documentation/devicetree/bindings/i2c/i2c-emev2.txt F: drivers/i2c/busses/i2c-emev2.c RENESAS ETHERNET DRIVERS From f91b2ab0e0c5f7cb4b11012748a184b91a396f0f Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Mon, 5 Aug 2019 17:31:08 +0800 Subject: [PATCH 0053/2880] i2c: designware: Fix unused variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/i2c/busses/i2c-designware-master.c: In function ‘i2c_dw_init_recovery_info’: drivers/i2c/busses/i2c-designware-master.c:658:6: warning: unused variable ‘r’ [-Wunused-variable] int r; ^ Fixes: 33eb09a02e8d ("i2c: designware: make use of devm_gpiod_get_optional") Signed-off-by: Shaokun Zhang Acked-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-master.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index 867787dade43..e8b328242256 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -655,7 +655,6 @@ static int i2c_dw_init_recovery_info(struct dw_i2c_dev *dev) struct i2c_bus_recovery_info *rinfo = &dev->rinfo; struct i2c_adapter *adap = &dev->adapter; struct gpio_desc *gpio; - int r; gpio = devm_gpiod_get_optional(dev->dev, "scl", GPIOD_OUT_HIGH); if (IS_ERR_OR_NULL(gpio)) From 3e99834cc0c7f0612dc790ad7342f18c375d285b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 26 Jul 2019 16:14:21 +0300 Subject: [PATCH 0054/2880] i2c: Drop unneeded check for of_node of_find_property() will return NULL if of_node is NULL, thus of_irq_get_by_name() returns -EINVAL which we ignore, so no need to double check. Signed-off-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index f26ed495d384..b4d84bd475da 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -350,13 +350,11 @@ static int i2c_device_probe(struct device *dev) return -ENODEV; if (client->flags & I2C_CLIENT_WAKE) { - int wakeirq = -ENOENT; + int wakeirq; - if (dev->of_node) { - wakeirq = of_irq_get_byname(dev->of_node, "wakeup"); - if (wakeirq == -EPROBE_DEFER) - return wakeirq; - } + wakeirq = of_irq_get_byname(dev->of_node, "wakeup"); + if (wakeirq == -EPROBE_DEFER) + return wakeirq; device_init_wakeup(&client->dev, true); From 4d7802aa434a39921c699683c4ca2ad9068b0033 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 Jul 2019 15:56:16 +0800 Subject: [PATCH 0055/2880] i2c: sprd: Make I2C driver can be built as a module Now there is no need to keep our I2C driver to be initialized so early, thus changing to module level and let it can be built as a module, meanwhile adding some module information. Signed-off-by: Baolin Wang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 2 +- drivers/i2c/busses/i2c-sprd.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 09367fc014c3..69f19312a574 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -977,7 +977,7 @@ config I2C_SIRF will be called i2c-sirf. config I2C_SPRD - bool "Spreadtrum I2C interface" + tristate "Spreadtrum I2C interface" depends on I2C=y && ARCH_SPRD help If you say yes to this option, support will be included for the diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 961123529678..8002835d1543 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -644,8 +645,7 @@ static struct platform_driver sprd_i2c_driver = { }, }; -static int sprd_i2c_init(void) -{ - return platform_driver_register(&sprd_i2c_driver); -} -arch_initcall_sync(sprd_i2c_init); +module_platform_driver(sprd_i2c_driver); + +MODULE_DESCRIPTION("Spreadtrum I2C master controller driver"); +MODULE_LICENSE("GPL v2"); From 3c2588fab65fb083873c6c1e74a1be54345615eb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 Jul 2019 15:56:17 +0800 Subject: [PATCH 0056/2880] i2c: sprd: Change to use devm_platform_ioremap_resource() Use the new helper that wraps the calls to platform_get_resource() and devm_ioremap_resource() together. Signed-off-by: Baolin Wang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-sprd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 8002835d1543..bbcb0569522d 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -478,7 +478,6 @@ static int sprd_i2c_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct sprd_i2c *i2c_dev; - struct resource *res; u32 prop; int ret; @@ -488,8 +487,7 @@ static int sprd_i2c_probe(struct platform_device *pdev) if (!i2c_dev) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - i2c_dev->base = devm_ioremap_resource(dev, res); + i2c_dev->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(i2c_dev->base)) return PTR_ERR(i2c_dev->base); From bbeb6b6c07960b8d33bed5352ef462228110c5ab Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 Jul 2019 15:56:18 +0800 Subject: [PATCH 0057/2880] i2c: sprd: Validate the return value of clock initialization The 'enable' clock of I2C master is required, we should return an error if failed to get the 'enable' clock, to make sure the I2C driver can be defer probe if the clock resource is not ready. Signed-off-by: Baolin Wang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-sprd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index bbcb0569522d..b432e7580458 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -466,9 +466,9 @@ static int sprd_i2c_clk_init(struct sprd_i2c *i2c_dev) i2c_dev->clk = devm_clk_get(i2c_dev->dev, "enable"); if (IS_ERR(i2c_dev->clk)) { - dev_warn(i2c_dev->dev, "i2c%d can't get the enable clock\n", - i2c_dev->adap.nr); - i2c_dev->clk = NULL; + dev_err(i2c_dev->dev, "i2c%d can't get the enable clock\n", + i2c_dev->adap.nr); + return PTR_ERR(i2c_dev->clk); } return 0; @@ -519,7 +519,10 @@ static int sprd_i2c_probe(struct platform_device *pdev) if (i2c_dev->bus_freq != 100000 && i2c_dev->bus_freq != 400000) return -EINVAL; - sprd_i2c_clk_init(i2c_dev); + ret = sprd_i2c_clk_init(i2c_dev); + if (ret) + return ret; + platform_set_drvdata(pdev, i2c_dev); ret = clk_prepare_enable(i2c_dev->clk); From f4c737d6194f509733641017b50548abc01f4b0d Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Fri, 28 Jun 2019 15:34:24 +0800 Subject: [PATCH 0058/2880] dt-bindings: PCI: Add support for MT7629 MT7629 is an ARM based platform SoC integrating the same PCIe IP as MT7622, add a binding for it. Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Ryder Lee --- Documentation/devicetree/bindings/pci/mediatek-pcie.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt b/Documentation/devicetree/bindings/pci/mediatek-pcie.txt index 92437a366e5f..7468d666763a 100644 --- a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt +++ b/Documentation/devicetree/bindings/pci/mediatek-pcie.txt @@ -6,6 +6,7 @@ Required properties: "mediatek,mt2712-pcie" "mediatek,mt7622-pcie" "mediatek,mt7623-pcie" + "mediatek,mt7629-pcie" - device_type: Must be "pci" - reg: Base addresses and lengths of the PCIe subsys and root ports. - reg-names: Names of the above areas to use during resource lookup. From 0cccd42e6193e168cbecc271dae464e4a53fd7b3 Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Fri, 28 Jun 2019 15:34:25 +0800 Subject: [PATCH 0059/2880] PCI: mediatek: Add controller support for MT7629 MT7629 is an ARM platform SoC which has the same PCIe IP as MT7622. The HW default value of its PCI host controller Device ID is invalid, fix it to match the hardware implementation. Signed-off-by: Jianjun Wang [lorenzo.pieralisi@arm.com: commit log/minor spelling update] Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Acked-by: Ryder Lee --- drivers/pci/controller/pcie-mediatek.c | 18 ++++++++++++++++++ include/linux/pci_ids.h | 1 + 2 files changed, 19 insertions(+) diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index 80601e1b939e..3eaa7081ab2a 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -73,6 +73,7 @@ #define PCIE_MSI_VECTOR 0x0c0 #define PCIE_CONF_VEND_ID 0x100 +#define PCIE_CONF_DEVICE_ID 0x102 #define PCIE_CONF_CLASS_ID 0x106 #define PCIE_INT_MASK 0x420 @@ -141,12 +142,16 @@ struct mtk_pcie_port; /** * struct mtk_pcie_soc - differentiate between host generations * @need_fix_class_id: whether this host's class ID needed to be fixed or not + * @need_fix_device_id: whether this host's device ID needed to be fixed or not + * @device_id: device ID which this host need to be fixed * @ops: pointer to configuration access functions * @startup: pointer to controller setting functions * @setup_irq: pointer to initialize IRQ functions */ struct mtk_pcie_soc { bool need_fix_class_id; + bool need_fix_device_id; + unsigned int device_id; struct pci_ops *ops; int (*startup)(struct mtk_pcie_port *port); int (*setup_irq)(struct mtk_pcie_port *port, struct device_node *node); @@ -696,6 +701,9 @@ static int mtk_pcie_startup_port_v2(struct mtk_pcie_port *port) writew(val, port->base + PCIE_CONF_CLASS_ID); } + if (soc->need_fix_device_id) + writew(soc->device_id, port->base + PCIE_CONF_DEVICE_ID); + /* 100ms timeout value should be enough for Gen1/2 training */ err = readl_poll_timeout(port->base + PCIE_LINK_STATUS_V2, val, !!(val & PCIE_PORT_LINKUP_V2), 20, @@ -1216,11 +1224,21 @@ static const struct mtk_pcie_soc mtk_pcie_soc_mt7622 = { .setup_irq = mtk_pcie_setup_irq, }; +static const struct mtk_pcie_soc mtk_pcie_soc_mt7629 = { + .need_fix_class_id = true, + .need_fix_device_id = true, + .device_id = PCI_DEVICE_ID_MEDIATEK_7629, + .ops = &mtk_pcie_ops_v2, + .startup = mtk_pcie_startup_port_v2, + .setup_irq = mtk_pcie_setup_irq, +}; + static const struct of_device_id mtk_pcie_ids[] = { { .compatible = "mediatek,mt2701-pcie", .data = &mtk_pcie_soc_v1 }, { .compatible = "mediatek,mt7623-pcie", .data = &mtk_pcie_soc_v1 }, { .compatible = "mediatek,mt2712-pcie", .data = &mtk_pcie_soc_mt2712 }, { .compatible = "mediatek,mt7622-pcie", .data = &mtk_pcie_soc_mt7622 }, + { .compatible = "mediatek,mt7629-pcie", .data = &mtk_pcie_soc_mt7629 }, {}, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c842735a4f45..f59a6f98900c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2132,6 +2132,7 @@ #define PCI_VENDOR_ID_MYRICOM 0x14c1 #define PCI_VENDOR_ID_MEDIATEK 0x14c3 +#define PCI_DEVICE_ID_MEDIATEK_7629 0x7629 #define PCI_VENDOR_ID_TITAN 0x14D2 #define PCI_DEVICE_ID_TITAN_010L 0x8001 From a92c7ba982e3950a82fd63b3fd289248bd99e8f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Wed, 7 Aug 2019 19:22:34 -0400 Subject: [PATCH 0060/2880] fs/handle.c - fix up kerneldoc When building with W=1, we get some kerneldoc warnings: CC fs/fhandle.o fs/fhandle.c:259: warning: Function parameter or member 'flags' not described in 'sys_open_by_handle_at' fs/fhandle.c:259: warning: Excess function parameter 'flag' description in 'sys_open_by_handle_at' Fix the typo that caused it. Signed-off-by: Valdis Kletnieks Signed-off-by: Al Viro --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 0ee727485615..01263ffbc4c0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -246,7 +246,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, * sys_open_by_handle_at: Open the file handle * @mountdirfd: directory file descriptor * @handle: file handle to be opened - * @flag: open flags. + * @flags: open flags. * * @mountdirfd indicate the directory file descriptor * of the mount point. file handle is decoded relative From b8074aa2460b535915e8f65bf83c4bcb4220f804 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 29 Jul 2019 13:13:57 +0300 Subject: [PATCH 0061/2880] PCI: Convert pci_resource_to_user() to a weak function Convert pci_resource_to_user() to a weak function so the existing architecture-specific implementations will automatically override the generic one. This allows us to remove HAVE_ARCH_PCI_RESOURCE_TO_USER definitions and avoid the conditional compilation for this single function. Link: https://lore.kernel.org/r/20190729101401.28068-1-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-2-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-3-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-4-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-5-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-6-efremov@linux.com Signed-off-by: Denis Efremov [bhelgaas: squash into one commit] Signed-off-by: Bjorn Helgaas Acked-by: Paul Burton # MIPS --- arch/microblaze/include/asm/pci.h | 2 -- arch/mips/include/asm/pci.h | 1 - arch/powerpc/include/asm/pci.h | 2 -- arch/sparc/include/asm/pci.h | 2 -- drivers/pci/pci.c | 12 ++++++++++++ include/linux/pci.h | 16 ---------------- 6 files changed, 12 insertions(+), 23 deletions(-) diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index 21ddba9188b2..7c4dc5d85f53 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -66,8 +66,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, unsigned long size, pgprot_t prot); -#define HAVE_ARCH_PCI_RESOURCE_TO_USER - /* This part of code was originally in xilinx-pci.h */ #ifdef CONFIG_PCI_XILINX extern void __init xilinx_pci_init(void); diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index 436099883022..6f48649201c5 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -108,7 +108,6 @@ extern unsigned long PCIBIOS_MIN_MEM; #define HAVE_PCI_MMAP #define ARCH_GENERIC_PCI_MMAP_RESOURCE -#define HAVE_ARCH_PCI_RESOURCE_TO_USER /* * Dynamic DMA mapping stuff. diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 2372d35533ad..327567b8f7d6 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -112,8 +112,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, unsigned long size, pgprot_t prot); -#define HAVE_ARCH_PCI_RESOURCE_TO_USER - extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose); extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h index cfec79bb1831..4deddf430e5d 100644 --- a/arch/sparc/include/asm/pci.h +++ b/arch/sparc/include/asm/pci.h @@ -38,8 +38,6 @@ static inline int pci_proc_domain(struct pci_bus *bus) #define arch_can_pci_mmap_io() 1 #define HAVE_ARCH_PCI_GET_UNMAPPED_AREA #define get_pci_unmapped_area get_fb_unmapped_area - -#define HAVE_ARCH_PCI_RESOURCE_TO_USER #endif /* CONFIG_SPARC64 */ #if defined(CONFIG_SPARC64) || defined(CONFIG_LEON_PCI) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 29ed5ec1ac27..da3241bb4479 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5932,6 +5932,18 @@ resource_size_t __weak pcibios_default_alignment(void) return 0; } +/* + * Arches that don't want to expose struct resource to userland as-is in + * sysfs and /proc can implement their own pci_resource_to_user(). + */ +void __weak pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + resource_size_t *start, resource_size_t *end) +{ + *start = rsrc->start; + *end = rsrc->end; +} + #define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0}; static DEFINE_SPINLOCK(resource_alignment_lock); diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..dcc5dd310c14 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1870,25 +1870,9 @@ static inline const char *pci_name(const struct pci_dev *pdev) return dev_name(&pdev->dev); } - -/* - * Some archs don't want to expose struct resource to userland as-is - * in sysfs and /proc - */ -#ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end); -#else -static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, resource_size_t *start, - resource_size_t *end) -{ - *start = rsrc->start; - *end = rsrc->end; -} -#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */ - /* * The world is not perfect and supplies us with broken PCI devices. From 39098edbd79e5c9a4357eb924cb259d1c8a11346 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 6 Aug 2019 17:07:15 +0300 Subject: [PATCH 0062/2880] PCI: Use PCI_SRIOV_NUM_BARS in loops instead of PCI_IOV_RESOURCE_END Writing loop conditions as "i < NUM" is a common C idiom; using "i <= END" is unusual and thus prone to errors. Change loops to use the former. Link: https://lore.kernel.org/r/20190806140715.19847-1-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/iov.c | 4 ++-- drivers/pci/setup-bus.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 525fd3f272b3..9b48818ced01 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -557,8 +557,8 @@ static void sriov_restore_state(struct pci_dev *dev) ctrl |= iov->ctrl & PCI_SRIOV_CTRL_ARI; pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl); - for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) - pci_update_resource(dev, i); + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) + pci_update_resource(dev, i + PCI_IOV_RESOURCES); pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz); pci_iov_set_numvfs(dev, iov->num_VFs); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1fa6519be..e7dbe21705ba 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1662,8 +1662,8 @@ static int iov_resources_unassigned(struct pci_dev *dev, void *data) int i; bool *unassigned = data; - for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) { - struct resource *r = &dev->resource[i]; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + struct resource *r = &dev->resource[i + PCI_IOV_RESOURCES]; struct pci_bus_region region; /* Not assigned or rejected by kernel? */ From d2182b2d4b71ff0549a07f414d921525fade707b Mon Sep 17 00:00:00 2001 From: Sumit Saxena Date: Fri, 26 Jul 2019 00:55:52 +0530 Subject: [PATCH 0063/2880] PCI: Restore Resizable BAR size bits correctly for 1MB BARs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a Resizable BAR Control Register, bits 13:8 control the size of the BAR. The encoded values of these bits are as follows (see PCIe r5.0, sec 7.8.6.3): Value BAR size 0 1 MB (2^20 bytes) 1 2 MB (2^21 bytes) 2 4 MB (2^22 bytes) ... 43 8 EB (2^63 bytes) Previously we incorrectly set the BAR size bits for a 1 MB BAR to 0x1f instead of 0, so devices that support that size, e.g., new megaraid_sas and mpt3sas adapters, fail to initialize during resume from S3 sleep. Correctly calculate the BAR size bits for Resizable BAR control registers. Link: https://lore.kernel.org/r/20190725192552.24295-1-sumit.saxena@broadcom.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203939 Fixes: d3252ace0bc6 ("PCI: Restore resized BAR state on resume") Signed-off-by: Sumit Saxena Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Cc: stable@vger.kernel.org # v4.19+ --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index da3241bb4479..5836eb576d96 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1438,7 +1438,7 @@ static void pci_restore_rebar_state(struct pci_dev *pdev) pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX; res = pdev->resource + bar_idx; - size = order_base_2((resource_size(res) >> 20) | 1) - 1; + size = ilog2(resource_size(res)) - 20; ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE; ctrl |= size << PCI_REBAR_CTRL_BAR_SHIFT; pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl); From 3b1b1ce3596458de0d5c28309954886a9cb6d5cc Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 6 Jun 2019 13:25:57 +1000 Subject: [PATCH 0064/2880] PCI: Correct pci=resource_alignment parameter example The "pci=resource_alignment" parameter is described as requiring an order (not a size) and the code in pci_specified_resource_alignment() expects an order. But the example wrongly shows a size. Convert the example to an order. Fixes: 8b078c603249 ("PCI: Update "pci=resource_alignment" documentation") Link: https://lore.kernel.org/r/20190606032557.107542-1-aik@ozlabs.ru Signed-off-by: Alexey Kardashevskiy Signed-off-by: Bjorn Helgaas --- Documentation/admin-guide/kernel-parameters.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 46b826fcb5ad..7d407bdded99 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3452,12 +3452,13 @@ specify the device is described above. If is not specified, PAGE_SIZE is used as alignment. - PCI-PCI bridge can be specified, if resource + A PCI-PCI bridge can be specified if resource windows need to be expanded. To specify the alignment for several instances of a device, the PCI vendor, device, subvendor, and subdevice may be - specified, e.g., 4096@pci:8086:9c22:103c:198f + specified, e.g., 12@pci:8086:9c22:103c:198f + for 4096-byte alignment. ecrc= Enable/disable PCIe ECRC (transaction layer end-to-end CRC checking). bios: Use BIOS/firmware settings. This is the From 5a2e340690f2908371eb9c1b4cfb82ba0b235b51 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 1 Aug 2019 20:22:48 -0500 Subject: [PATCH 0065/2880] PCI: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. This fixes the following warning (Building: allmodconfig i386): drivers/pci/hotplug/ibmphp_res.c: In function ‘update_bridge_ranges’: drivers/pci/hotplug/ibmphp_res.c:1943:16: warning: this statement may fall through [-Wimplicit-fallthrough=] function = 0x8; ~~~~~~~~~^~~~~ drivers/pci/hotplug/ibmphp_res.c:1944:6: note: here case PCI_HEADER_TYPE_MULTIBRIDGE: ^~~~ Link: https://lore.kernel.org/r/20190802012248.GA22622@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Bjorn Helgaas Reviewed-by: Kees Cook --- drivers/pci/hotplug/ibmphp_res.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/hotplug/ibmphp_res.c b/drivers/pci/hotplug/ibmphp_res.c index 5e8caf7a4452..5c93aa14f0de 100644 --- a/drivers/pci/hotplug/ibmphp_res.c +++ b/drivers/pci/hotplug/ibmphp_res.c @@ -1941,6 +1941,7 @@ static int __init update_bridge_ranges(struct bus_node **bus) break; case PCI_HEADER_TYPE_BRIDGE: function = 0x8; + /* fall through */ case PCI_HEADER_TYPE_MULTIBRIDGE: /* We assume here that only 1 bus behind the bridge TO DO: add functionality for several: From 2a9af0273c1cd6647df4f6475b45d823494bf13a Mon Sep 17 00:00:00 2001 From: Wesley Terpstra Date: Thu, 25 Jul 2019 14:28:07 -0700 Subject: [PATCH 0066/2880] PCI/MSI: Enable PCI_MSI_IRQ_DOMAIN support for RISC-V Add RISC-V as an arch that supports PCI_MSI_IRQ_DOMAIN. The related change to generate asm/msi.h is 251a44888183 ("riscv: include generic support for MSI irqdomains"). Link: https://lore.kernel.org/r/alpine.DEB.2.21.9999.1907251426450.32766@viisi.sifive.com Signed-off-by: Wesley Terpstra [paul.walmsley@sifive.com: wrote patch description; split this patch from the arch/riscv patch] Signed-off-by: Paul Walmsley Signed-off-by: Bjorn Helgaas Reviewed-by: Bin Meng --- drivers/pci/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2ab92409210a..beb3408a0272 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -52,7 +52,7 @@ config PCI_MSI If you don't know what to do here, say Y. config PCI_MSI_IRQ_DOMAIN - def_bool ARC || ARM || ARM64 || X86 + def_bool ARC || ARM || ARM64 || X86 || RISCV depends on PCI_MSI select GENERIC_MSI_IRQ_DOMAIN From efecc3b531a30004d0993907897ab977747af108 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 6 Jul 2019 18:47:20 +0200 Subject: [PATCH 0067/2880] mfd: ab3100: No need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Signed-off-by: Greg Kroah-Hartman Reviewed-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/ab3100-core.c | 45 ++++++--------------------------------- drivers/mfd/ab3100-otp.c | 21 ++++++------------ 2 files changed, 13 insertions(+), 53 deletions(-) diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c index e350ab64238e..9f3dbc31d3e9 100644 --- a/drivers/mfd/ab3100-core.c +++ b/drivers/mfd/ab3100-core.c @@ -575,58 +575,27 @@ static const struct file_operations ab3100_get_set_reg_fops = { .llseek = noop_llseek, }; -static struct dentry *ab3100_dir; -static struct dentry *ab3100_reg_file; static struct ab3100_get_set_reg_priv ab3100_get_priv; -static struct dentry *ab3100_get_reg_file; static struct ab3100_get_set_reg_priv ab3100_set_priv; -static struct dentry *ab3100_set_reg_file; static void ab3100_setup_debugfs(struct ab3100 *ab3100) { - int err; + struct dentry *ab3100_dir; ab3100_dir = debugfs_create_dir("ab3100", NULL); - if (!ab3100_dir) - goto exit_no_debugfs; - ab3100_reg_file = debugfs_create_file("registers", - S_IRUGO, ab3100_dir, ab3100, - &ab3100_registers_fops); - if (!ab3100_reg_file) { - err = -ENOMEM; - goto exit_destroy_dir; - } + debugfs_create_file("registers", S_IRUGO, ab3100_dir, ab3100, + &ab3100_registers_fops); ab3100_get_priv.ab3100 = ab3100; ab3100_get_priv.mode = false; - ab3100_get_reg_file = debugfs_create_file("get_reg", - S_IWUSR, ab3100_dir, &ab3100_get_priv, - &ab3100_get_set_reg_fops); - if (!ab3100_get_reg_file) { - err = -ENOMEM; - goto exit_destroy_reg; - } + debugfs_create_file("get_reg", S_IWUSR, ab3100_dir, &ab3100_get_priv, + &ab3100_get_set_reg_fops); ab3100_set_priv.ab3100 = ab3100; ab3100_set_priv.mode = true; - ab3100_set_reg_file = debugfs_create_file("set_reg", - S_IWUSR, ab3100_dir, &ab3100_set_priv, - &ab3100_get_set_reg_fops); - if (!ab3100_set_reg_file) { - err = -ENOMEM; - goto exit_destroy_get_reg; - } - return; - - exit_destroy_get_reg: - debugfs_remove(ab3100_get_reg_file); - exit_destroy_reg: - debugfs_remove(ab3100_reg_file); - exit_destroy_dir: - debugfs_remove(ab3100_dir); - exit_no_debugfs: - return; + debugfs_create_file("set_reg", S_IWUSR, ab3100_dir, &ab3100_set_priv, + &ab3100_get_set_reg_fops); } #else static inline void ab3100_setup_debugfs(struct ab3100 *ab3100) diff --git a/drivers/mfd/ab3100-otp.c b/drivers/mfd/ab3100-otp.c index b3f8d359f409..c4751fb9bc22 100644 --- a/drivers/mfd/ab3100-otp.c +++ b/drivers/mfd/ab3100-otp.c @@ -122,17 +122,11 @@ static const struct file_operations ab3100_otp_operations = { .release = single_release, }; -static int __init ab3100_otp_init_debugfs(struct device *dev, - struct ab3100_otp *otp) +static void __init ab3100_otp_init_debugfs(struct device *dev, + struct ab3100_otp *otp) { otp->debugfs = debugfs_create_file("ab3100_otp", S_IFREG | S_IRUGO, - NULL, otp, - &ab3100_otp_operations); - if (!otp->debugfs) { - dev_err(dev, "AB3100 debugfs OTP file registration failed!\n"); - return -ENOENT; - } - return 0; + NULL, otp, &ab3100_otp_operations); } static void __exit ab3100_otp_exit_debugfs(struct ab3100_otp *otp) @@ -141,10 +135,9 @@ static void __exit ab3100_otp_exit_debugfs(struct ab3100_otp *otp) } #else /* Compile this out if debugfs not selected */ -static inline int __init ab3100_otp_init_debugfs(struct device *dev, - struct ab3100_otp *otp) +static inline void __init ab3100_otp_init_debugfs(struct device *dev, + struct ab3100_otp *otp) { - return 0; } static inline void __exit ab3100_otp_exit_debugfs(struct ab3100_otp *otp) @@ -211,9 +204,7 @@ static int __init ab3100_otp_probe(struct platform_device *pdev) } /* debugfs entries */ - err = ab3100_otp_init_debugfs(&pdev->dev, otp); - if (err) - goto err; + ab3100_otp_init_debugfs(&pdev->dev, otp); return 0; From 64e8a9bacadb58e04ab517d8fb5735302b96db5f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 6 Jul 2019 18:47:21 +0200 Subject: [PATCH 0068/2880] mfd: ab8500: No need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Signed-off-by: Greg Kroah-Hartman Reviewed-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/ab8500-debugfs.c | 322 +++++++++++------------------------ 1 file changed, 97 insertions(+), 225 deletions(-) diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index d24c6ecccb88..567a34b073dd 100644 --- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -2644,12 +2644,10 @@ static const struct file_operations ab8500_hwreg_fops = { .owner = THIS_MODULE, }; -static struct dentry *ab8500_dir; -static struct dentry *ab8500_gpadc_dir; - static int ab8500_debug_probe(struct platform_device *plf) { - struct dentry *file; + struct dentry *ab8500_dir; + struct dentry *ab8500_gpadc_dir; struct ab8500 *ab8500; struct resource *res; @@ -2694,47 +2692,22 @@ static int ab8500_debug_probe(struct platform_device *plf) } ab8500_dir = debugfs_create_dir(AB8500_NAME_STRING, NULL); - if (!ab8500_dir) - goto err; ab8500_gpadc_dir = debugfs_create_dir(AB8500_ADC_NAME_STRING, ab8500_dir); - if (!ab8500_gpadc_dir) - goto err; - file = debugfs_create_file("all-bank-registers", S_IRUGO, ab8500_dir, - &plf->dev, &ab8500_bank_registers_fops); - if (!file) - goto err; - - file = debugfs_create_file("all-banks", S_IRUGO, ab8500_dir, - &plf->dev, &ab8500_all_banks_fops); - if (!file) - goto err; - - file = debugfs_create_file("register-bank", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_bank_fops); - if (!file) - goto err; - - file = debugfs_create_file("register-address", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_address_fops); - if (!file) - goto err; - - file = debugfs_create_file("register-value", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_val_fops); - if (!file) - goto err; - - file = debugfs_create_file("irq-subscribe", - (S_IRUGO | S_IWUSR | S_IWGRP), ab8500_dir, - &plf->dev, &ab8500_subscribe_fops); - if (!file) - goto err; + debugfs_create_file("all-bank-registers", S_IRUGO, ab8500_dir, + &plf->dev, &ab8500_bank_registers_fops); + debugfs_create_file("all-banks", S_IRUGO, ab8500_dir, + &plf->dev, &ab8500_all_banks_fops); + debugfs_create_file("register-bank", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_bank_fops); + debugfs_create_file("register-address", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_address_fops); + debugfs_create_file("register-value", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_val_fops); + debugfs_create_file("irq-subscribe", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_subscribe_fops); if (is_ab8500(ab8500)) { debug_ranges = ab8500_debug_ranges; @@ -2750,194 +2723,93 @@ static int ab8500_debug_probe(struct platform_device *plf) num_interrupt_lines = AB8540_NR_IRQS; } - file = debugfs_create_file("interrupts", (S_IRUGO), ab8500_dir, - &plf->dev, &ab8500_interrupts_fops); - if (!file) - goto err; - - file = debugfs_create_file("irq-unsubscribe", - (S_IRUGO | S_IWUSR | S_IWGRP), ab8500_dir, - &plf->dev, &ab8500_unsubscribe_fops); - if (!file) - goto err; - - file = debugfs_create_file("hwreg", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_hwreg_fops); - if (!file) - goto err; - - file = debugfs_create_file("all-modem-registers", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_modem_fops); - if (!file) - goto err; - - file = debugfs_create_file("bat_ctrl", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_bat_ctrl_fops); - if (!file) - goto err; - - file = debugfs_create_file("btemp_ball", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, - &plf->dev, &ab8500_gpadc_btemp_ball_fops); - if (!file) - goto err; - - file = debugfs_create_file("main_charger_v", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_main_charger_v_fops); - if (!file) - goto err; - - file = debugfs_create_file("acc_detect1", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_acc_detect1_fops); - if (!file) - goto err; - - file = debugfs_create_file("acc_detect2", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_acc_detect2_fops); - if (!file) - goto err; - - file = debugfs_create_file("adc_aux1", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_aux1_fops); - if (!file) - goto err; - - file = debugfs_create_file("adc_aux2", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_aux2_fops); - if (!file) - goto err; - - file = debugfs_create_file("main_bat_v", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_main_bat_v_fops); - if (!file) - goto err; - - file = debugfs_create_file("vbus_v", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_vbus_v_fops); - if (!file) - goto err; - - file = debugfs_create_file("main_charger_c", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_main_charger_c_fops); - if (!file) - goto err; - - file = debugfs_create_file("usb_charger_c", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, - &plf->dev, &ab8500_gpadc_usb_charger_c_fops); - if (!file) - goto err; - - file = debugfs_create_file("bk_bat_v", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_bk_bat_v_fops); - if (!file) - goto err; - - file = debugfs_create_file("die_temp", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_die_temp_fops); - if (!file) - goto err; - - file = debugfs_create_file("usb_id", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_usb_id_fops); - if (!file) - goto err; - + debugfs_create_file("interrupts", (S_IRUGO), ab8500_dir, &plf->dev, + &ab8500_interrupts_fops); + debugfs_create_file("irq-unsubscribe", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_unsubscribe_fops); + debugfs_create_file("hwreg", (S_IRUGO | S_IWUSR | S_IWGRP), ab8500_dir, + &plf->dev, &ab8500_hwreg_fops); + debugfs_create_file("all-modem-registers", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_modem_fops); + debugfs_create_file("bat_ctrl", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_bat_ctrl_fops); + debugfs_create_file("btemp_ball", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_btemp_ball_fops); + debugfs_create_file("main_charger_v", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_main_charger_v_fops); + debugfs_create_file("acc_detect1", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_acc_detect1_fops); + debugfs_create_file("acc_detect2", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_acc_detect2_fops); + debugfs_create_file("adc_aux1", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_aux1_fops); + debugfs_create_file("adc_aux2", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_aux2_fops); + debugfs_create_file("main_bat_v", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_main_bat_v_fops); + debugfs_create_file("vbus_v", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_vbus_v_fops); + debugfs_create_file("main_charger_c", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_main_charger_c_fops); + debugfs_create_file("usb_charger_c", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_usb_charger_c_fops); + debugfs_create_file("bk_bat_v", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_bk_bat_v_fops); + debugfs_create_file("die_temp", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_die_temp_fops); + debugfs_create_file("usb_id", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_usb_id_fops); if (is_ab8540(ab8500)) { - file = debugfs_create_file("xtal_temp", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8540_gpadc_xtal_temp_fops); - if (!file) - goto err; - file = debugfs_create_file("vbattruemeas", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8540_gpadc_vbat_true_meas_fops); - if (!file) - goto err; - file = debugfs_create_file("batctrl_and_ibat", - (S_IRUGO | S_IWUGO), - ab8500_gpadc_dir, - &plf->dev, - &ab8540_gpadc_bat_ctrl_and_ibat_fops); - if (!file) - goto err; - file = debugfs_create_file("vbatmeas_and_ibat", - (S_IRUGO | S_IWUGO), - ab8500_gpadc_dir, &plf->dev, - &ab8540_gpadc_vbat_meas_and_ibat_fops); - if (!file) - goto err; - file = debugfs_create_file("vbattruemeas_and_ibat", - (S_IRUGO | S_IWUGO), - ab8500_gpadc_dir, - &plf->dev, - &ab8540_gpadc_vbat_true_meas_and_ibat_fops); - if (!file) - goto err; - file = debugfs_create_file("battemp_and_ibat", - (S_IRUGO | S_IWUGO), - ab8500_gpadc_dir, - &plf->dev, &ab8540_gpadc_bat_temp_and_ibat_fops); - if (!file) - goto err; - file = debugfs_create_file("otp_calib", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, - &plf->dev, &ab8540_gpadc_otp_calib_fops); - if (!file) - goto err; + debugfs_create_file("xtal_temp", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_xtal_temp_fops); + debugfs_create_file("vbattruemeas", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_vbat_true_meas_fops); + debugfs_create_file("batctrl_and_ibat", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_bat_ctrl_and_ibat_fops); + debugfs_create_file("vbatmeas_and_ibat", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_vbat_meas_and_ibat_fops); + debugfs_create_file("vbattruemeas_and_ibat", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_vbat_true_meas_and_ibat_fops); + debugfs_create_file("battemp_and_ibat", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_bat_temp_and_ibat_fops); + debugfs_create_file("otp_calib", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_otp_calib_fops); } - file = debugfs_create_file("avg_sample", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_avg_sample_fops); - if (!file) - goto err; - - file = debugfs_create_file("trig_edge", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_trig_edge_fops); - if (!file) - goto err; - - file = debugfs_create_file("trig_timer", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_trig_timer_fops); - if (!file) - goto err; - - file = debugfs_create_file("conv_type", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_conv_type_fops); - if (!file) - goto err; + debugfs_create_file("avg_sample", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_avg_sample_fops); + debugfs_create_file("trig_edge", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_trig_edge_fops); + debugfs_create_file("trig_timer", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_trig_timer_fops); + debugfs_create_file("conv_type", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_conv_type_fops); return 0; - -err: - debugfs_remove_recursive(ab8500_dir); - dev_err(&plf->dev, "failed to create debugfs entries.\n"); - - return -ENOMEM; } static struct platform_driver ab8500_debug_driver = { From cbfe612d471fc3eda048a6a70c5c8f5ee34026a4 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 6 Jul 2019 18:47:22 +0200 Subject: [PATCH 0069/2880] mfd: aat2870: No need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Signed-off-by: Greg Kroah-Hartman Signed-off-by: Lee Jones --- drivers/mfd/aat2870-core.c | 13 ++----------- include/linux/mfd/aat2870.h | 1 - 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/mfd/aat2870-core.c b/drivers/mfd/aat2870-core.c index 9f58cb2d3789..78ee4b28fca2 100644 --- a/drivers/mfd/aat2870-core.c +++ b/drivers/mfd/aat2870-core.c @@ -321,18 +321,9 @@ static const struct file_operations aat2870_reg_fops = { static void aat2870_init_debugfs(struct aat2870_data *aat2870) { aat2870->dentry_root = debugfs_create_dir("aat2870", NULL); - if (!aat2870->dentry_root) { - dev_warn(aat2870->dev, - "Failed to create debugfs root directory\n"); - return; - } - aat2870->dentry_reg = debugfs_create_file("regs", 0644, - aat2870->dentry_root, - aat2870, &aat2870_reg_fops); - if (!aat2870->dentry_reg) - dev_warn(aat2870->dev, - "Failed to create debugfs register file\n"); + debugfs_create_file("regs", 0644, aat2870->dentry_root, aat2870, + &aat2870_reg_fops); } #else diff --git a/include/linux/mfd/aat2870.h b/include/linux/mfd/aat2870.h index af7267c480ee..2445842d482d 100644 --- a/include/linux/mfd/aat2870.h +++ b/include/linux/mfd/aat2870.h @@ -136,7 +136,6 @@ struct aat2870_data { /* for debugfs */ struct dentry *dentry_root; - struct dentry *dentry_reg; }; struct aat2870_subdev_info { From ed075453d527a2635f245e0a91b92252ad3d1c8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Tue, 29 Jan 2019 14:59:17 +0100 Subject: [PATCH 0070/2880] dt-bindings: mfd: rn5t618: Document optional property system-power-controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RN5T618 family of PMICs can be used as system management controllers, in which case they handle poweroff and restart. Document this capability by referring to the corresponding generic DT binding. Signed-off-by: Jonathan Neuschäfer Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/rn5t618.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/rn5t618.txt b/Documentation/devicetree/bindings/mfd/rn5t618.txt index 65c23263cc54..b74e5e94d1cb 100644 --- a/Documentation/devicetree/bindings/mfd/rn5t618.txt +++ b/Documentation/devicetree/bindings/mfd/rn5t618.txt @@ -14,6 +14,10 @@ Required properties: "ricoh,rc5t619" - reg: the I2C slave address of the device +Optional properties: + - system-power-controller: + See Documentation/devicetree/bindings/power/power-controller.txt + Sub-nodes: - regulators: the node is required if the regulator functionality is needed. The valid regulator names are: DCDC1, DCDC2, DCDC3, DCDC4 @@ -28,6 +32,7 @@ Example: pmic@32 { compatible = "ricoh,rn5t618"; reg = <0x32>; + system-power-controller; regulators { DCDC1 { From 624e3fceb533b33927535aaec713bd91ec80e99b Mon Sep 17 00:00:00 2001 From: Yicheng Li Date: Mon, 8 Jul 2019 11:15:37 -0700 Subject: [PATCH 0071/2880] mfd: cros_ec: Update cros_ec_commands.h Update cros_ec_commands.h to match the fingerprint MCU section in the current ec_commands.h Signed-off-by: Yicheng Li Reviewed-by: Gwendal Grignou Reviewed-by: Enric Balletbo i Serra Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec_commands.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h index 7ccb8757b79d..98415686cbfa 100644 --- a/include/linux/mfd/cros_ec_commands.h +++ b/include/linux/mfd/cros_ec_commands.h @@ -5513,6 +5513,18 @@ struct ec_params_fp_seed { uint8_t seed[FP_CONTEXT_TPM_BYTES]; } __ec_align4; +#define EC_CMD_FP_ENC_STATUS 0x0409 + +/* FP TPM seed has been set or not */ +#define FP_ENC_STATUS_SEED_SET BIT(0) + +struct ec_response_fp_encryption_status { + /* Used bits in encryption engine status */ + uint32_t valid_flags; + /* Encryption engine status */ + uint32_t status; +} __ec_align4; + /*****************************************************************************/ /* Touchpad MCU commands: range 0x0500-0x05FF */ From a604e5b29ce6cf5ec2950df7a5562ac2dd8d70e2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:35 +0200 Subject: [PATCH 0072/2880] mfd: tps80031: Convert to devm_i2c_new_dummy_device Move from i2c_new_dummy() to devm_i2c_new_dummy_device(). So, we now get an ERRPTR which we use in error handling and we can skip removal of the created devices. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/tps80031.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/mfd/tps80031.c b/drivers/mfd/tps80031.c index 865257ade8ac..907452b86e32 100644 --- a/drivers/mfd/tps80031.c +++ b/drivers/mfd/tps80031.c @@ -437,12 +437,11 @@ static int tps80031_probe(struct i2c_client *client, if (tps80031_slave_address[i] == client->addr) tps80031->clients[i] = client; else - tps80031->clients[i] = i2c_new_dummy(client->adapter, - tps80031_slave_address[i]); - if (!tps80031->clients[i]) { + tps80031->clients[i] = devm_i2c_new_dummy_device(&client->dev, + client->adapter, tps80031_slave_address[i]); + if (IS_ERR(tps80031->clients[i])) { dev_err(&client->dev, "can't attach client %d\n", i); - ret = -ENOMEM; - goto fail_client_reg; + return PTR_ERR(tps80031->clients[i]); } i2c_set_clientdata(tps80031->clients[i], tps80031); @@ -452,7 +451,7 @@ static int tps80031_probe(struct i2c_client *client, ret = PTR_ERR(tps80031->regmap[i]); dev_err(&client->dev, "regmap %d init failed, err %d\n", i, ret); - goto fail_client_reg; + return ret; } } @@ -461,7 +460,7 @@ static int tps80031_probe(struct i2c_client *client, if (ret < 0) { dev_err(&client->dev, "Silicon version number read failed: %d\n", ret); - goto fail_client_reg; + return ret; } ret = tps80031_read(&client->dev, TPS80031_SLAVE_ID3, @@ -469,7 +468,7 @@ static int tps80031_probe(struct i2c_client *client, if (ret < 0) { dev_err(&client->dev, "Silicon eeprom version read failed: %d\n", ret); - goto fail_client_reg; + return ret; } dev_info(&client->dev, "ES version 0x%02x and EPROM version 0x%02x\n", @@ -482,7 +481,7 @@ static int tps80031_probe(struct i2c_client *client, ret = tps80031_irq_init(tps80031, client->irq, pdata->irq_base); if (ret) { dev_err(&client->dev, "IRQ init failed: %d\n", ret); - goto fail_client_reg; + return ret; } tps80031_pupd_init(tps80031, pdata); @@ -506,12 +505,6 @@ static int tps80031_probe(struct i2c_client *client, fail_mfd_add: regmap_del_irq_chip(client->irq, tps80031->irq_data); - -fail_client_reg: - for (i = 0; i < TPS80031_NUM_SLAVES; i++) { - if (tps80031->clients[i] && (tps80031->clients[i] != client)) - i2c_unregister_device(tps80031->clients[i]); - } return ret; } From e406b832d89d63b9eb525fa8fea18eb7a1598c60 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 20:26:28 +0200 Subject: [PATCH 0073/2880] mfd: da9063: Remove now unused platform_data All preparational patches have been applied, we can now remove the include file for platform_data. Yiha! Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- include/Kbuild | 1 - include/linux/mfd/da9063/pdata.h | 60 -------------------------------- 2 files changed, 61 deletions(-) delete mode 100644 include/linux/mfd/da9063/pdata.h diff --git a/include/Kbuild b/include/Kbuild index c38f0d46b267..68aa094fe86e 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -313,7 +313,6 @@ header-test- += linux/mfd/as3722.h header-test- += linux/mfd/cros_ec_commands.h header-test- += linux/mfd/da903x.h header-test- += linux/mfd/da9055/pdata.h -header-test- += linux/mfd/da9063/pdata.h header-test- += linux/mfd/db8500-prcmu.h header-test- += linux/mfd/dbx500-prcmu.h header-test- += linux/mfd/dln2.h diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h deleted file mode 100644 index 085edbf7601b..000000000000 --- a/include/linux/mfd/da9063/pdata.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Platform configuration options for DA9063 - * - * Copyright 2012 Dialog Semiconductor Ltd. - * - * Author: Michal Hajduk, Dialog Semiconductor - * Author: Krystian Garbaciak, Dialog Semiconductor - */ - -#ifndef __MFD_DA9063_PDATA_H__ -#define __MFD_DA9063_PDATA_H__ - -/* - * RGB LED configuration - */ -/* LED IDs for flags in struct led_info. */ -enum { - DA9063_GPIO11_LED, - DA9063_GPIO14_LED, - DA9063_GPIO15_LED, - - DA9063_LED_NUM -}; -#define DA9063_LED_ID_MASK 0x3 - -/* LED polarity for flags in struct led_info. */ -#define DA9063_LED_HIGH_LEVEL_ACTIVE 0x0 -#define DA9063_LED_LOW_LEVEL_ACTIVE 0x4 - - -/* - * General PMIC configuration - */ -/* HWMON ADC channels configuration */ -#define DA9063_FLG_FORCE_IN0_MANUAL_MODE 0x0010 -#define DA9063_FLG_FORCE_IN0_AUTO_MODE 0x0020 -#define DA9063_FLG_FORCE_IN1_MANUAL_MODE 0x0040 -#define DA9063_FLG_FORCE_IN1_AUTO_MODE 0x0080 -#define DA9063_FLG_FORCE_IN2_MANUAL_MODE 0x0100 -#define DA9063_FLG_FORCE_IN2_AUTO_MODE 0x0200 -#define DA9063_FLG_FORCE_IN3_MANUAL_MODE 0x0400 -#define DA9063_FLG_FORCE_IN3_AUTO_MODE 0x0800 - -/* Disable register caching. */ -#define DA9063_FLG_NO_CACHE 0x0008 - -struct da9063; - -/* DA9063 platform data */ -struct da9063_pdata { - int (*init)(struct da9063 *da9063); - int irq_base; - bool key_power; - unsigned flags; - struct da9063_regulators_pdata *regulators_pdata; - struct led_platform_data *leds_pdata; -}; - -#endif /* __MFD_DA9063_PDATA_H__ */ From aee36174b22d57775b3445399491777843f294ed Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Tue, 23 Jul 2019 19:51:03 +0800 Subject: [PATCH 0074/2880] mfd: timberdale: Use dev_get_drvdata Instead of using to_pci_dev + pci_get_drvdata, use dev_get_drvdata to make code simpler. Signed-off-by: Chuhong Yuan Signed-off-by: Lee Jones --- drivers/mfd/timberdale.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mfd/timberdale.c b/drivers/mfd/timberdale.c index 60c122e9b39f..faecbca6dba3 100644 --- a/drivers/mfd/timberdale.c +++ b/drivers/mfd/timberdale.c @@ -626,8 +626,7 @@ static const struct mfd_cell timberdale_cells_bar2[] = { static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf) { - struct pci_dev *pdev = to_pci_dev(dev); - struct timberdale_device *priv = pci_get_drvdata(pdev); + struct timberdale_device *priv = dev_get_drvdata(dev); return sprintf(buf, "%d.%d.%d\n", priv->fw.major, priv->fw.minor, priv->fw.config); From 83215897356fb54fdb7b9e399d5256421d4cfe4a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:08 +0200 Subject: [PATCH 0075/2880] mfd: 88pm800: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/88pm800.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/88pm800.c b/drivers/mfd/88pm800.c index f2d9fb4c4e8e..4e8d0d6b9b5c 100644 --- a/drivers/mfd/88pm800.c +++ b/drivers/mfd/88pm800.c @@ -425,10 +425,10 @@ static int pm800_pages_init(struct pm80x_chip *chip) return -ENODEV; /* PM800 block power page */ - subchip->power_page = i2c_new_dummy(client->adapter, + subchip->power_page = i2c_new_dummy_device(client->adapter, subchip->power_page_addr); - if (subchip->power_page == NULL) { - ret = -ENODEV; + if (IS_ERR(subchip->power_page)) { + ret = PTR_ERR(subchip->power_page); goto out; } @@ -444,10 +444,10 @@ static int pm800_pages_init(struct pm80x_chip *chip) i2c_set_clientdata(subchip->power_page, chip); /* PM800 block GPADC */ - subchip->gpadc_page = i2c_new_dummy(client->adapter, + subchip->gpadc_page = i2c_new_dummy_device(client->adapter, subchip->gpadc_page_addr); - if (subchip->gpadc_page == NULL) { - ret = -ENODEV; + if (IS_ERR(subchip->gpadc_page)) { + ret = PTR_ERR(subchip->gpadc_page); goto out; } From 9520b835ffda85cdc1c84b1946c4dd3b6d00184c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:09 +0200 Subject: [PATCH 0076/2880] mfd: 88pm860x-core: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/88pm860x-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/88pm860x-core.c b/drivers/mfd/88pm860x-core.c index 9e0bd135730f..c9bae71f643a 100644 --- a/drivers/mfd/88pm860x-core.c +++ b/drivers/mfd/88pm860x-core.c @@ -1178,12 +1178,12 @@ static int pm860x_probe(struct i2c_client *client) */ if (pdata->companion_addr && (pdata->companion_addr != client->addr)) { chip->companion_addr = pdata->companion_addr; - chip->companion = i2c_new_dummy(chip->client->adapter, + chip->companion = i2c_new_dummy_device(chip->client->adapter, chip->companion_addr); - if (!chip->companion) { + if (IS_ERR(chip->companion)) { dev_err(&client->dev, "Failed to allocate I2C companion device\n"); - return -ENODEV; + return PTR_ERR(chip->companion); } chip->regmap_companion = regmap_init_i2c(chip->companion, &pm860x_regmap_config); From f5d5d193c5f777fb9c03298992668a34c575abdd Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:10 +0200 Subject: [PATCH 0077/2880] mfd: ab3100-core: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Reviewed-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/ab3100-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c index 9f3dbc31d3e9..57723f116bb5 100644 --- a/drivers/mfd/ab3100-core.c +++ b/drivers/mfd/ab3100-core.c @@ -865,10 +865,10 @@ static int ab3100_probe(struct i2c_client *client, &ab3100->chip_name[0]); /* Attach a second dummy i2c_client to the test register address */ - ab3100->testreg_client = i2c_new_dummy(client->adapter, + ab3100->testreg_client = i2c_new_dummy_device(client->adapter, client->addr + 1); - if (!ab3100->testreg_client) { - err = -ENOMEM; + if (IS_ERR(ab3100->testreg_client)) { + err = PTR_ERR(ab3100->testreg_client); goto exit_no_testreg_client; } From 98f0c05f409e4fbf186b11dcb2baed84a253c224 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:11 +0200 Subject: [PATCH 0078/2880] mfd: bcm590xx: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/bcm590xx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/bcm590xx.c b/drivers/mfd/bcm590xx.c index 1aeb5e498d91..bfac5dc091ca 100644 --- a/drivers/mfd/bcm590xx.c +++ b/drivers/mfd/bcm590xx.c @@ -61,11 +61,11 @@ static int bcm590xx_i2c_probe(struct i2c_client *i2c_pri, } /* Secondary I2C slave address is the base address with A(2) asserted */ - bcm590xx->i2c_sec = i2c_new_dummy(i2c_pri->adapter, + bcm590xx->i2c_sec = i2c_new_dummy_device(i2c_pri->adapter, i2c_pri->addr | BIT(2)); - if (!bcm590xx->i2c_sec) { + if (IS_ERR(bcm590xx->i2c_sec)) { dev_err(&i2c_pri->dev, "failed to add secondary I2C device\n"); - return -ENODEV; + return PTR_ERR(bcm590xx->i2c_sec); } i2c_set_clientdata(bcm590xx->i2c_sec, bcm590xx); From e310ee86f9efb92104ac84c178fd11bdf83216eb Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:12 +0200 Subject: [PATCH 0079/2880] mfd: da9150-core: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/da9150-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/da9150-core.c b/drivers/mfd/da9150-core.c index 13033068721a..7f0aa1e8db96 100644 --- a/drivers/mfd/da9150-core.c +++ b/drivers/mfd/da9150-core.c @@ -420,10 +420,10 @@ static int da9150_probe(struct i2c_client *client, qif_addr = da9150_reg_read(da9150, DA9150_CORE2WIRE_CTRL_A); qif_addr = (qif_addr & DA9150_CORE_BASE_ADDR_MASK) >> 1; qif_addr |= DA9150_QIF_I2C_ADDR_LSB; - da9150->core_qif = i2c_new_dummy(client->adapter, qif_addr); - if (!da9150->core_qif) { + da9150->core_qif = i2c_new_dummy_device(client->adapter, qif_addr); + if (IS_ERR(da9150->core_qif)) { dev_err(da9150->dev, "Failed to attach QIF client\n"); - return -ENODEV; + return PTR_ERR(da9150->core_qif); } i2c_set_clientdata(da9150->core_qif, da9150); From f6ae8129631f6a3e072ec8d3ab60a65aa825f5f0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:13 +0200 Subject: [PATCH 0080/2880] mfd: max14577: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Reviewed-by: Krzysztof Kozlowski Signed-off-by: Lee Jones --- drivers/mfd/max14577.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/max14577.c b/drivers/mfd/max14577.c index ebb13d5de530..fd8864cafd25 100644 --- a/drivers/mfd/max14577.c +++ b/drivers/mfd/max14577.c @@ -297,11 +297,11 @@ static int max77836_init(struct max14577 *max14577) int ret; u8 intsrc_mask; - max14577->i2c_pmic = i2c_new_dummy(max14577->i2c->adapter, + max14577->i2c_pmic = i2c_new_dummy_device(max14577->i2c->adapter, I2C_ADDR_PMIC); - if (!max14577->i2c_pmic) { + if (IS_ERR(max14577->i2c_pmic)) { dev_err(max14577->dev, "Failed to register PMIC I2C device\n"); - return -ENODEV; + return PTR_ERR(max14577->i2c_pmic); } i2c_set_clientdata(max14577->i2c_pmic, max14577); From ad28edcb87337a1c9e52cae4694eb419883b16a9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:14 +0200 Subject: [PATCH 0081/2880] mfd: max77693: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Reviewed-by: Krzysztof Kozlowski Signed-off-by: Lee Jones --- drivers/mfd/max77693.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c index 901d99d65924..596ed85cab3b 100644 --- a/drivers/mfd/max77693.c +++ b/drivers/mfd/max77693.c @@ -183,17 +183,17 @@ static int max77693_i2c_probe(struct i2c_client *i2c, } else dev_info(max77693->dev, "device ID: 0x%x\n", reg_data); - max77693->i2c_muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC); - if (!max77693->i2c_muic) { + max77693->i2c_muic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_MUIC); + if (IS_ERR(max77693->i2c_muic)) { dev_err(max77693->dev, "Failed to allocate I2C device for MUIC\n"); - return -ENODEV; + return PTR_ERR(max77693->i2c_muic); } i2c_set_clientdata(max77693->i2c_muic, max77693); - max77693->i2c_haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC); - if (!max77693->i2c_haptic) { + max77693->i2c_haptic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_HAPTIC); + if (IS_ERR(max77693->i2c_haptic)) { dev_err(max77693->dev, "Failed to allocate I2C device for Haptic\n"); - ret = -ENODEV; + ret = PTR_ERR(max77693->i2c_haptic); goto err_i2c_haptic; } i2c_set_clientdata(max77693->i2c_haptic, max77693); From 0005a9e1bab7d2d46538d3db16ef1fde3433c710 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:15 +0200 Subject: [PATCH 0082/2880] mfd: max77843: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max77843.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/max77843.c b/drivers/mfd/max77843.c index 25cbb2242b26..209ee24d9ce1 100644 --- a/drivers/mfd/max77843.c +++ b/drivers/mfd/max77843.c @@ -70,11 +70,11 @@ static int max77843_chg_init(struct max77693_dev *max77843) { int ret; - max77843->i2c_chg = i2c_new_dummy(max77843->i2c->adapter, I2C_ADDR_CHG); - if (!max77843->i2c_chg) { + max77843->i2c_chg = i2c_new_dummy_device(max77843->i2c->adapter, I2C_ADDR_CHG); + if (IS_ERR(max77843->i2c_chg)) { dev_err(&max77843->i2c->dev, "Cannot allocate I2C device for Charger\n"); - return -ENODEV; + return PTR_ERR(max77843->i2c_chg); } i2c_set_clientdata(max77843->i2c_chg, max77843); From b8afcd54db8aae362095c8b490f6240520c4e9a4 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:16 +0200 Subject: [PATCH 0083/2880] mfd: max8907: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max8907.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/max8907.c b/drivers/mfd/max8907.c index cc01f706cb32..d44baafd9d14 100644 --- a/drivers/mfd/max8907.c +++ b/drivers/mfd/max8907.c @@ -214,9 +214,9 @@ static int max8907_i2c_probe(struct i2c_client *i2c, goto err_regmap_gen; } - max8907->i2c_rtc = i2c_new_dummy(i2c->adapter, MAX8907_RTC_I2C_ADDR); - if (!max8907->i2c_rtc) { - ret = -ENOMEM; + max8907->i2c_rtc = i2c_new_dummy_device(i2c->adapter, MAX8907_RTC_I2C_ADDR); + if (IS_ERR(max8907->i2c_rtc)) { + ret = PTR_ERR(max8907->i2c_rtc); goto err_dummy_rtc; } i2c_set_clientdata(max8907->i2c_rtc, max8907); From ddbf6ffeb63cf06cfc9707a550ece773bbd2a925 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:17 +0200 Subject: [PATCH 0084/2880] mfd: max8925-i2c: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max8925-i2c.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/max8925-i2c.c b/drivers/mfd/max8925-i2c.c index 20bb19b71109..114e905bef25 100644 --- a/drivers/mfd/max8925-i2c.c +++ b/drivers/mfd/max8925-i2c.c @@ -176,18 +176,18 @@ static int max8925_probe(struct i2c_client *client, dev_set_drvdata(chip->dev, chip); mutex_init(&chip->io_lock); - chip->rtc = i2c_new_dummy(chip->i2c->adapter, RTC_I2C_ADDR); - if (!chip->rtc) { + chip->rtc = i2c_new_dummy_device(chip->i2c->adapter, RTC_I2C_ADDR); + if (IS_ERR(chip->rtc)) { dev_err(chip->dev, "Failed to allocate I2C device for RTC\n"); - return -ENODEV; + return PTR_ERR(chip->rtc); } i2c_set_clientdata(chip->rtc, chip); - chip->adc = i2c_new_dummy(chip->i2c->adapter, ADC_I2C_ADDR); - if (!chip->adc) { + chip->adc = i2c_new_dummy_device(chip->i2c->adapter, ADC_I2C_ADDR); + if (IS_ERR(chip->adc)) { dev_err(chip->dev, "Failed to allocate I2C device for ADC\n"); i2c_unregister_device(chip->rtc); - return -ENODEV; + return PTR_ERR(chip->adc); } i2c_set_clientdata(chip->adc, chip); From 4e32bff681fb4a7eb552af7187735a68c5afeab1 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:18 +0200 Subject: [PATCH 0085/2880] mfd: max8997: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max8997.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/mfd/max8997.c b/drivers/mfd/max8997.c index 8c06c09e36d1..68d8f2b95287 100644 --- a/drivers/mfd/max8997.c +++ b/drivers/mfd/max8997.c @@ -185,25 +185,25 @@ static int max8997_i2c_probe(struct i2c_client *i2c, mutex_init(&max8997->iolock); - max8997->rtc = i2c_new_dummy(i2c->adapter, I2C_ADDR_RTC); - if (!max8997->rtc) { + max8997->rtc = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_RTC); + if (IS_ERR(max8997->rtc)) { dev_err(max8997->dev, "Failed to allocate I2C device for RTC\n"); - return -ENODEV; + return PTR_ERR(max8997->rtc); } i2c_set_clientdata(max8997->rtc, max8997); - max8997->haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC); - if (!max8997->haptic) { + max8997->haptic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_HAPTIC); + if (IS_ERR(max8997->haptic)) { dev_err(max8997->dev, "Failed to allocate I2C device for Haptic\n"); - ret = -ENODEV; + ret = PTR_ERR(max8997->haptic); goto err_i2c_haptic; } i2c_set_clientdata(max8997->haptic, max8997); - max8997->muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC); - if (!max8997->muic) { + max8997->muic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_MUIC); + if (IS_ERR(max8997->muic)) { dev_err(max8997->dev, "Failed to allocate I2C device for MUIC\n"); - ret = -ENODEV; + ret = PTR_ERR(max8997->muic); goto err_i2c_muic; } i2c_set_clientdata(max8997->muic, max8997); From 7a99c8f3310b13c87ec87114e6fa3e915005b187 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:19 +0200 Subject: [PATCH 0086/2880] mfd: max8998: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max8998.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c index 56409df120f8..785f8e9841b7 100644 --- a/drivers/mfd/max8998.c +++ b/drivers/mfd/max8998.c @@ -195,10 +195,10 @@ static int max8998_i2c_probe(struct i2c_client *i2c, } mutex_init(&max8998->iolock); - max8998->rtc = i2c_new_dummy(i2c->adapter, RTC_I2C_ADDR); - if (!max8998->rtc) { + max8998->rtc = i2c_new_dummy_device(i2c->adapter, RTC_I2C_ADDR); + if (IS_ERR(max8998->rtc)) { dev_err(&i2c->dev, "Failed to allocate I2C device for RTC\n"); - return -ENODEV; + return PTR_ERR(max8998->rtc); } i2c_set_clientdata(max8998->rtc, max8998); From ad9fc1f4229ed390590c3ea5be23addf84d54672 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:20 +0200 Subject: [PATCH 0087/2880] mfd: palmas: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/palmas.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/palmas.c b/drivers/mfd/palmas.c index 6818ff34837c..f5b3fa973b13 100644 --- a/drivers/mfd/palmas.c +++ b/drivers/mfd/palmas.c @@ -549,12 +549,12 @@ static int palmas_i2c_probe(struct i2c_client *i2c, palmas->i2c_clients[i] = i2c; else { palmas->i2c_clients[i] = - i2c_new_dummy(i2c->adapter, + i2c_new_dummy_device(i2c->adapter, i2c->addr + i); - if (!palmas->i2c_clients[i]) { + if (IS_ERR(palmas->i2c_clients[i])) { dev_err(palmas->dev, "can't attach client %d\n", i); - ret = -ENOMEM; + ret = PTR_ERR(palmas->i2c_clients[i]); goto err_i2c; } palmas->i2c_clients[i]->dev.of_node = of_node_get(node); From ba972dac9854a76202c4e798fa618e6d56ca2f81 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:21 +0200 Subject: [PATCH 0088/2880] mfd: twl-core: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/twl-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c index 448d9397ff04..20cf8cfe4f3b 100644 --- a/drivers/mfd/twl-core.c +++ b/drivers/mfd/twl-core.c @@ -1141,12 +1141,12 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id) if (i == 0) { twl->client = client; } else { - twl->client = i2c_new_dummy(client->adapter, + twl->client = i2c_new_dummy_device(client->adapter, client->addr + i); - if (!twl->client) { + if (IS_ERR(twl->client)) { dev_err(&client->dev, "can't attach client %d\n", i); - status = -ENOMEM; + status = PTR_ERR(twl->client); goto fail; } } From 865cac14c2dac2fa5f16d1781ca2746b0d323796 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 8 Jul 2019 14:41:29 +0200 Subject: [PATCH 0089/2880] backlight: rave-sp: Leave initial state and register with correct device This way the backlight can be referenced through its device node and enabling/disabling can be managed through the panel driver. Signed-off-by: Lucas Stach Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/rave-sp-backlight.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/video/backlight/rave-sp-backlight.c b/drivers/video/backlight/rave-sp-backlight.c index 462f14a1b19d..05b5f003a3d1 100644 --- a/drivers/video/backlight/rave-sp-backlight.c +++ b/drivers/video/backlight/rave-sp-backlight.c @@ -48,14 +48,20 @@ static int rave_sp_backlight_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct backlight_device *bd; - bd = devm_backlight_device_register(dev, pdev->name, dev->parent, + bd = devm_backlight_device_register(dev, pdev->name, dev, dev_get_drvdata(dev->parent), &rave_sp_backlight_ops, &rave_sp_backlight_props); if (IS_ERR(bd)) return PTR_ERR(bd); - backlight_update_status(bd); + /* + * If there is a phandle pointing to the device node we can + * assume that another device will manage the status changes. + * If not we make sure the backlight is in a consistent state. + */ + if (!dev->of_node->phandle) + backlight_update_status(bd); return 0; } From 072e2c8192cfc6ae1de1a2aca02d4512b69bdeed Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 23 Jul 2019 22:34:00 +0300 Subject: [PATCH 0090/2880] backlight: lm3630a: Switch to use fwnode_property_count_uXX() Use use fwnode_property_count_uXX() directly, that makes code neater. Signed-off-by: Andy Shevchenko Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/lm3630a_bl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/video/backlight/lm3630a_bl.c b/drivers/video/backlight/lm3630a_bl.c index b04b35d007a2..2d8e8192e4e2 100644 --- a/drivers/video/backlight/lm3630a_bl.c +++ b/drivers/video/backlight/lm3630a_bl.c @@ -377,8 +377,7 @@ static int lm3630a_parse_led_sources(struct fwnode_handle *node, u32 sources[LM3630A_NUM_SINKS]; int ret, num_sources, i; - num_sources = fwnode_property_read_u32_array(node, "led-sources", NULL, - 0); + num_sources = fwnode_property_count_u32(node, "led-sources"); if (num_sources < 0) return default_led_sources; else if (num_sources > ARRAY_SIZE(sources)) From 30f644fd6ec9a0a241adb91579aaca7404132d7b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 24 Jul 2019 23:38:28 +0200 Subject: [PATCH 0091/2880] backlight: lms283gf05: Fix a typo in the description passed to 'devm_gpio_request_one()' The description passed to 'devm_gpio_request_one()' should be related to LMS283GF05, not LMS285GF05. Signed-off-by: Christophe JAILLET Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/lms283gf05.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/backlight/lms283gf05.c b/drivers/video/backlight/lms283gf05.c index 35bc012b22cc..0e45685bcc1c 100644 --- a/drivers/video/backlight/lms283gf05.c +++ b/drivers/video/backlight/lms283gf05.c @@ -158,7 +158,7 @@ static int lms283gf05_probe(struct spi_device *spi) ret = devm_gpio_request_one(&spi->dev, pdata->reset_gpio, GPIOF_DIR_OUT | (!pdata->reset_inverted ? GPIOF_INIT_HIGH : GPIOF_INIT_LOW), - "LMS285GF05 RESET"); + "LMS283GF05 RESET"); if (ret) return ret; } From 76380a607ba0b28627c9b4b55cd47a079a59624b Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 5 Jul 2019 12:55:03 +0800 Subject: [PATCH 0092/2880] mfd: intel-lpss: Remove D3cold delay Goodix touchpad may drop its first couple input events when i2c-designware-platdrv and intel-lpss it connects to took too long to runtime resume from runtime suspended state. This issue happens becuase the touchpad has a rather small buffer to store up to 13 input events, so if the host doesn't read those events in time (i.e. runtime resume takes too long), events are dropped from the touchpad's buffer. The bottleneck is D3cold delay it waits when transitioning from D3cold to D0, hence remove the delay to make the resume faster. I've tested some systems with intel-lpss and haven't seen any regression. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202683 Signed-off-by: Kai-Heng Feng Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index ade6e1ce5a98..e3a04929aaa3 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -35,6 +35,8 @@ static int intel_lpss_pci_probe(struct pci_dev *pdev, info->mem = &pdev->resource[0]; info->irq = pdev->irq; + pdev->d3cold_delay = 0; + /* Probably it is enough to set this for iDMA capable devices only */ pci_set_master(pdev); pci_try_set_mwi(pdev); From ea1acf11ee7ac2d63d93e78c638ec8abc710a1a7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 28 Jul 2019 18:58:58 -0500 Subject: [PATCH 0093/2880] mfd: omap-usb-host: Mark expected switch fall-throughs Mark switch cases where we are expecting to fall through. This patch fixes the following warnings: drivers/mfd/omap-usb-host.c: In function 'usbhs_runtime_resume': drivers/mfd/omap-usb-host.c:303:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (!IS_ERR(omap->hsic480m_clk[i])) { ^ drivers/mfd/omap-usb-host.c:313:3: note: here case OMAP_EHCI_PORT_MODE_TLL: ^~~~ drivers/mfd/omap-usb-host.c: In function 'usbhs_runtime_suspend': drivers/mfd/omap-usb-host.c:345:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (!IS_ERR(omap->hsic480m_clk[i])) ^ drivers/mfd/omap-usb-host.c:349:3: note: here case OMAP_EHCI_PORT_MODE_TLL: ^~~~ Reported-by: Stephen Rothwell Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Lee Jones --- drivers/mfd/omap-usb-host.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index 792b855a9104..4798d9f3f9d5 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -308,7 +308,7 @@ static int usbhs_runtime_resume(struct device *dev) i, r); } } - /* Fall through as HSIC mode needs utmi_clk */ + /* Fall through - as HSIC mode needs utmi_clk */ case OMAP_EHCI_PORT_MODE_TLL: if (!IS_ERR(omap->utmi_clk[i])) { @@ -344,7 +344,7 @@ static int usbhs_runtime_suspend(struct device *dev) if (!IS_ERR(omap->hsic480m_clk[i])) clk_disable_unprepare(omap->hsic480m_clk[i]); - /* Fall through as utmi_clks were used in HSIC mode */ + /* Fall through - as utmi_clks were used in HSIC mode */ case OMAP_EHCI_PORT_MODE_TLL: if (!IS_ERR(omap->utmi_clk[i])) From ff71266aa490a9f8ed761d47945f300ba19e7c93 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Thu, 25 Jul 2019 18:02:14 -0400 Subject: [PATCH 0094/2880] mfd: Drop obsolete JZ4740 driver It has been replaced with the ingenic-iio driver for the ADC. Signed-off-by: Paul Cercueil Tested-by: Artur Rojek Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 9 -- drivers/mfd/Makefile | 1 - drivers/mfd/jz4740-adc.c | 324 --------------------------------------- 3 files changed, 334 deletions(-) delete mode 100644 drivers/mfd/jz4740-adc.c diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index f129f9678940..4a07afe50b35 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -649,15 +649,6 @@ config MFD_JANZ_CMODIO host many different types of MODULbus daughterboards, including CAN and GPIO controllers. -config MFD_JZ4740_ADC - bool "Janz JZ4740 ADC core" - select MFD_CORE - select GENERIC_IRQ_CHIP - depends on MACH_JZ4740 - help - Say yes here if you want support for the ADC unit in the JZ4740 SoC. - This driver is necessary for jz4740-battery and jz4740-hwmon driver. - config MFD_KEMPLD tristate "Kontron module PLD device" select MFD_CORE diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f026ada68f6a..446d5df7cacb 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -191,7 +191,6 @@ obj-$(CONFIG_LPC_SCH) += lpc_sch.o obj-$(CONFIG_LPC_ICH) += lpc_ich.o obj-$(CONFIG_MFD_RDC321X) += rdc321x-southbridge.o obj-$(CONFIG_MFD_JANZ_CMODIO) += janz-cmodio.o -obj-$(CONFIG_MFD_JZ4740_ADC) += jz4740-adc.o obj-$(CONFIG_MFD_TPS6586X) += tps6586x.o obj-$(CONFIG_MFD_VX855) += vx855.o obj-$(CONFIG_MFD_WL1273_CORE) += wl1273-core.o diff --git a/drivers/mfd/jz4740-adc.c b/drivers/mfd/jz4740-adc.c deleted file mode 100644 index 082f16917519..000000000000 --- a/drivers/mfd/jz4740-adc.c +++ /dev/null @@ -1,324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2009-2010, Lars-Peter Clausen - * JZ4740 SoC ADC driver - * - * This driver synchronizes access to the JZ4740 ADC core between the - * JZ4740 battery and hwmon drivers. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - - -#define JZ_REG_ADC_ENABLE 0x00 -#define JZ_REG_ADC_CFG 0x04 -#define JZ_REG_ADC_CTRL 0x08 -#define JZ_REG_ADC_STATUS 0x0c - -#define JZ_REG_ADC_TOUCHSCREEN_BASE 0x10 -#define JZ_REG_ADC_BATTERY_BASE 0x1c -#define JZ_REG_ADC_HWMON_BASE 0x20 - -#define JZ_ADC_ENABLE_TOUCH BIT(2) -#define JZ_ADC_ENABLE_BATTERY BIT(1) -#define JZ_ADC_ENABLE_ADCIN BIT(0) - -enum { - JZ_ADC_IRQ_ADCIN = 0, - JZ_ADC_IRQ_BATTERY, - JZ_ADC_IRQ_TOUCH, - JZ_ADC_IRQ_PENUP, - JZ_ADC_IRQ_PENDOWN, -}; - -struct jz4740_adc { - struct resource *mem; - void __iomem *base; - - int irq; - struct irq_chip_generic *gc; - - struct clk *clk; - atomic_t clk_ref; - - spinlock_t lock; -}; - -static void jz4740_adc_irq_demux(struct irq_desc *desc) -{ - struct irq_chip_generic *gc = irq_desc_get_handler_data(desc); - uint8_t status; - unsigned int i; - - status = readb(gc->reg_base + JZ_REG_ADC_STATUS); - - for (i = 0; i < 5; ++i) { - if (status & BIT(i)) - generic_handle_irq(gc->irq_base + i); - } -} - - -/* Refcounting for the ADC clock is done in here instead of in the clock - * framework, because it is the only clock which is shared between multiple - * devices and thus is the only clock which needs refcounting */ -static inline void jz4740_adc_clk_enable(struct jz4740_adc *adc) -{ - if (atomic_inc_return(&adc->clk_ref) == 1) - clk_prepare_enable(adc->clk); -} - -static inline void jz4740_adc_clk_disable(struct jz4740_adc *adc) -{ - if (atomic_dec_return(&adc->clk_ref) == 0) - clk_disable_unprepare(adc->clk); -} - -static inline void jz4740_adc_set_enabled(struct jz4740_adc *adc, int engine, - bool enabled) -{ - unsigned long flags; - uint8_t val; - - spin_lock_irqsave(&adc->lock, flags); - - val = readb(adc->base + JZ_REG_ADC_ENABLE); - if (enabled) - val |= BIT(engine); - else - val &= ~BIT(engine); - writeb(val, adc->base + JZ_REG_ADC_ENABLE); - - spin_unlock_irqrestore(&adc->lock, flags); -} - -static int jz4740_adc_cell_enable(struct platform_device *pdev) -{ - struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent); - - jz4740_adc_clk_enable(adc); - jz4740_adc_set_enabled(adc, pdev->id, true); - - return 0; -} - -static int jz4740_adc_cell_disable(struct platform_device *pdev) -{ - struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent); - - jz4740_adc_set_enabled(adc, pdev->id, false); - jz4740_adc_clk_disable(adc); - - return 0; -} - -int jz4740_adc_set_config(struct device *dev, uint32_t mask, uint32_t val) -{ - struct jz4740_adc *adc = dev_get_drvdata(dev); - unsigned long flags; - uint32_t cfg; - - if (!adc) - return -ENODEV; - - spin_lock_irqsave(&adc->lock, flags); - - cfg = readl(adc->base + JZ_REG_ADC_CFG); - - cfg &= ~mask; - cfg |= val; - - writel(cfg, adc->base + JZ_REG_ADC_CFG); - - spin_unlock_irqrestore(&adc->lock, flags); - - return 0; -} -EXPORT_SYMBOL_GPL(jz4740_adc_set_config); - -static struct resource jz4740_hwmon_resources[] = { - { - .start = JZ_ADC_IRQ_ADCIN, - .flags = IORESOURCE_IRQ, - }, - { - .start = JZ_REG_ADC_HWMON_BASE, - .end = JZ_REG_ADC_HWMON_BASE + 3, - .flags = IORESOURCE_MEM, - }, -}; - -static struct resource jz4740_battery_resources[] = { - { - .start = JZ_ADC_IRQ_BATTERY, - .flags = IORESOURCE_IRQ, - }, - { - .start = JZ_REG_ADC_BATTERY_BASE, - .end = JZ_REG_ADC_BATTERY_BASE + 3, - .flags = IORESOURCE_MEM, - }, -}; - -static const struct mfd_cell jz4740_adc_cells[] = { - { - .id = 0, - .name = "jz4740-hwmon", - .num_resources = ARRAY_SIZE(jz4740_hwmon_resources), - .resources = jz4740_hwmon_resources, - - .enable = jz4740_adc_cell_enable, - .disable = jz4740_adc_cell_disable, - }, - { - .id = 1, - .name = "jz4740-battery", - .num_resources = ARRAY_SIZE(jz4740_battery_resources), - .resources = jz4740_battery_resources, - - .enable = jz4740_adc_cell_enable, - .disable = jz4740_adc_cell_disable, - }, -}; - -static int jz4740_adc_probe(struct platform_device *pdev) -{ - struct irq_chip_generic *gc; - struct irq_chip_type *ct; - struct jz4740_adc *adc; - struct resource *mem_base; - int ret; - int irq_base; - - adc = devm_kzalloc(&pdev->dev, sizeof(*adc), GFP_KERNEL); - if (!adc) - return -ENOMEM; - - adc->irq = platform_get_irq(pdev, 0); - if (adc->irq < 0) { - ret = adc->irq; - dev_err(&pdev->dev, "Failed to get platform irq: %d\n", ret); - return ret; - } - - irq_base = platform_get_irq(pdev, 1); - if (irq_base < 0) { - dev_err(&pdev->dev, "Failed to get irq base: %d\n", irq_base); - return irq_base; - } - - mem_base = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!mem_base) { - dev_err(&pdev->dev, "Failed to get platform mmio resource\n"); - return -ENOENT; - } - - /* Only request the shared registers for the MFD driver */ - adc->mem = request_mem_region(mem_base->start, JZ_REG_ADC_STATUS, - pdev->name); - if (!adc->mem) { - dev_err(&pdev->dev, "Failed to request mmio memory region\n"); - return -EBUSY; - } - - adc->base = ioremap_nocache(adc->mem->start, resource_size(adc->mem)); - if (!adc->base) { - ret = -EBUSY; - dev_err(&pdev->dev, "Failed to ioremap mmio memory\n"); - goto err_release_mem_region; - } - - adc->clk = clk_get(&pdev->dev, "adc"); - if (IS_ERR(adc->clk)) { - ret = PTR_ERR(adc->clk); - dev_err(&pdev->dev, "Failed to get clock: %d\n", ret); - goto err_iounmap; - } - - spin_lock_init(&adc->lock); - atomic_set(&adc->clk_ref, 0); - - platform_set_drvdata(pdev, adc); - - gc = irq_alloc_generic_chip("INTC", 1, irq_base, adc->base, - handle_level_irq); - - ct = gc->chip_types; - ct->regs.mask = JZ_REG_ADC_CTRL; - ct->regs.ack = JZ_REG_ADC_STATUS; - ct->chip.irq_mask = irq_gc_mask_set_bit; - ct->chip.irq_unmask = irq_gc_mask_clr_bit; - ct->chip.irq_ack = irq_gc_ack_set_bit; - - irq_setup_generic_chip(gc, IRQ_MSK(5), IRQ_GC_INIT_MASK_CACHE, 0, - IRQ_NOPROBE | IRQ_LEVEL); - - adc->gc = gc; - - irq_set_chained_handler_and_data(adc->irq, jz4740_adc_irq_demux, gc); - - writeb(0x00, adc->base + JZ_REG_ADC_ENABLE); - writeb(0xff, adc->base + JZ_REG_ADC_CTRL); - - ret = mfd_add_devices(&pdev->dev, 0, jz4740_adc_cells, - ARRAY_SIZE(jz4740_adc_cells), mem_base, - irq_base, NULL); - if (ret < 0) - goto err_clk_put; - - return 0; - -err_clk_put: - clk_put(adc->clk); -err_iounmap: - iounmap(adc->base); -err_release_mem_region: - release_mem_region(adc->mem->start, resource_size(adc->mem)); - return ret; -} - -static int jz4740_adc_remove(struct platform_device *pdev) -{ - struct jz4740_adc *adc = platform_get_drvdata(pdev); - - mfd_remove_devices(&pdev->dev); - - irq_remove_generic_chip(adc->gc, IRQ_MSK(5), IRQ_NOPROBE | IRQ_LEVEL, 0); - kfree(adc->gc); - irq_set_chained_handler_and_data(adc->irq, NULL, NULL); - - iounmap(adc->base); - release_mem_region(adc->mem->start, resource_size(adc->mem)); - - clk_put(adc->clk); - - return 0; -} - -static struct platform_driver jz4740_adc_driver = { - .probe = jz4740_adc_probe, - .remove = jz4740_adc_remove, - .driver = { - .name = "jz4740-adc", - }, -}; - -module_platform_driver(jz4740_adc_driver); - -MODULE_DESCRIPTION("JZ4740 SoC ADC driver"); -MODULE_AUTHOR("Lars-Peter Clausen "); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:jz4740-adc"); From ec65b56046d27a21a5ae02eb7fcb321e1942a541 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Aug 2019 16:28:41 +0300 Subject: [PATCH 0095/2880] mfd: intel-lpss: Add Intel Tiger Lake PCI IDs Intel Tiger Lake has the same LPSS than Intel Broxton. Add the new IDs to the list of supported devices. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-pci.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index e3a04929aaa3..9355db29d2f9 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -258,6 +258,29 @@ static const struct pci_device_id intel_lpss_pci_ids[] = { { PCI_VDEVICE(INTEL, 0x9dea), (kernel_ulong_t)&cnl_i2c_info }, { PCI_VDEVICE(INTEL, 0x9deb), (kernel_ulong_t)&cnl_i2c_info }, { PCI_VDEVICE(INTEL, 0x9dfb), (kernel_ulong_t)&spt_info }, + /* TGL-LP */ + { PCI_VDEVICE(INTEL, 0xa0a8), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0a9), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0aa), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0ab), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0c5), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0c6), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0c7), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0d8), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0d9), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0da), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0db), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0dc), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0dd), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0de), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0df), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0e8), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0e9), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0ea), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0eb), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0fb), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0fd), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0fe), (kernel_ulong_t)&spt_info }, /* SPT-H */ { PCI_VDEVICE(INTEL, 0xa127), (kernel_ulong_t)&spt_uart_info }, { PCI_VDEVICE(INTEL, 0xa128), (kernel_ulong_t)&spt_uart_info }, From b620c17672b9c162bbc1e480eaf43a825345cb2a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 28 Jul 2019 18:56:14 -0500 Subject: [PATCH 0096/2880] mfd: db8500-prcmu: Mark expected switch fall-throughs Mark switch cases where we are expecting to fall through. This patch fixes the following warnings: drivers/mfd/db8500-prcmu.c: In function 'dsiclk_rate': drivers/mfd/db8500-prcmu.c:1592:7: warning: this statement may fall through [-Wimplicit-fallthrough=] div *= 2; ~~~~^~~~ drivers/mfd/db8500-prcmu.c:1593:2: note: here case PRCM_DSI_PLLOUT_SEL_PHI_2: ^~~~ drivers/mfd/db8500-prcmu.c:1594:7: warning: this statement may fall through [-Wimplicit-fallthrough=] div *= 2; ~~~~^~~~ drivers/mfd/db8500-prcmu.c:1595:2: note: here case PRCM_DSI_PLLOUT_SEL_PHI: ^~~~ Reported-by: Stephen Rothwell Signed-off-by: Gustavo A. R. Silva Reviewed-by: Linus Walleij Reviewed-by: Kees Cook Signed-off-by: Lee Jones --- drivers/mfd/db8500-prcmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index 3f21e26b8d36..90e0f21bc49c 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -1590,8 +1590,10 @@ static unsigned long dsiclk_rate(u8 n) switch (divsel) { case PRCM_DSI_PLLOUT_SEL_PHI_4: div *= 2; + /* Fall through */ case PRCM_DSI_PLLOUT_SEL_PHI_2: div *= 2; + /* Fall through */ case PRCM_DSI_PLLOUT_SEL_PHI: return pll_rate(PRCM_PLLDSI_FREQ, clock_rate(PRCMU_HDMICLK), PLL_RAW) / div; From 802d9bd4fac70be2ea61fa83660a87a57d06bab0 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 30 Jul 2019 11:15:27 -0700 Subject: [PATCH 0097/2880] mfd: Remove dev_err() usage after platform_get_irq() We don't need dev_err() messages when platform_get_irq() fails now that platform_get_irq() prints an error message itself when something goes wrong. Let's remove these prints with a simple semantic patch. // @@ expression ret; struct platform_device *E; @@ ret = ( platform_get_irq(E, ...) | platform_get_irq_byname(E, ...) ); if ( \( ret < 0 \| ret <= 0 \) ) { ( -if (ret != -EPROBE_DEFER) -{ ... -dev_err(...); -... } | ... -dev_err(...); ) ... } // While we're here, remove braces on if statements that only have one statement (manually). Signed-off-by: Stephen Boyd Signed-off-by: Lee Jones --- drivers/mfd/ab8500-debugfs.c | 8 ++------ drivers/mfd/db8500-prcmu.c | 4 +--- drivers/mfd/fsl-imx25-tsadc.c | 4 +--- drivers/mfd/intel_soc_pmic_bxtwc.c | 4 +--- drivers/mfd/qcom_rpm.c | 12 +++--------- drivers/mfd/sm501.c | 4 +--- 6 files changed, 9 insertions(+), 27 deletions(-) diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index 567a34b073dd..f4e26b6e5362 100644 --- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -2680,16 +2680,12 @@ static int ab8500_debug_probe(struct platform_device *plf) irq_ab8500 = res->start; irq_first = platform_get_irq_byname(plf, "IRQ_FIRST"); - if (irq_first < 0) { - dev_err(&plf->dev, "First irq not found, err %d\n", irq_first); + if (irq_first < 0) return irq_first; - } irq_last = platform_get_irq_byname(plf, "IRQ_LAST"); - if (irq_last < 0) { - dev_err(&plf->dev, "Last irq not found, err %d\n", irq_last); + if (irq_last < 0) return irq_last; - } ab8500_dir = debugfs_create_dir(AB8500_NAME_STRING, NULL); diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index 90e0f21bc49c..0f459c246634 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -3130,10 +3130,8 @@ static int db8500_prcmu_probe(struct platform_device *pdev) writel(ALL_MBOX_BITS, PRCM_ARM_IT1_CLR); irq = platform_get_irq(pdev, 0); - if (irq <= 0) { - dev_err(&pdev->dev, "no prcmu irq provided\n"); + if (irq <= 0) return irq; - } err = request_threaded_irq(irq, prcmu_irq_handler, prcmu_irq_thread_fn, IRQF_NO_SUSPEND, "prcmu", NULL); diff --git a/drivers/mfd/fsl-imx25-tsadc.c b/drivers/mfd/fsl-imx25-tsadc.c index 20791cab7263..a016b39fe9b0 100644 --- a/drivers/mfd/fsl-imx25-tsadc.c +++ b/drivers/mfd/fsl-imx25-tsadc.c @@ -69,10 +69,8 @@ static int mx25_tsadc_setup_irq(struct platform_device *pdev, int irq; irq = platform_get_irq(pdev, 0); - if (irq <= 0) { - dev_err(dev, "Failed to get irq\n"); + if (irq <= 0) return irq; - } tsadc->domain = irq_domain_add_simple(np, 2, 0, &mx25_tsadc_domain_ops, tsadc); diff --git a/drivers/mfd/intel_soc_pmic_bxtwc.c b/drivers/mfd/intel_soc_pmic_bxtwc.c index 6310c3bdb991..739cfb5b69fe 100644 --- a/drivers/mfd/intel_soc_pmic_bxtwc.c +++ b/drivers/mfd/intel_soc_pmic_bxtwc.c @@ -450,10 +450,8 @@ static int bxtwc_probe(struct platform_device *pdev) return -ENOMEM; ret = platform_get_irq(pdev, 0); - if (ret < 0) { - dev_err(&pdev->dev, "Invalid IRQ\n"); + if (ret < 0) return ret; - } pmic->irq = ret; dev_set_drvdata(&pdev->dev, pmic); diff --git a/drivers/mfd/qcom_rpm.c b/drivers/mfd/qcom_rpm.c index 4d7e9008628c..71bc34b74bc9 100644 --- a/drivers/mfd/qcom_rpm.c +++ b/drivers/mfd/qcom_rpm.c @@ -561,22 +561,16 @@ static int qcom_rpm_probe(struct platform_device *pdev) clk_prepare_enable(rpm->ramclk); /* Accepts NULL */ irq_ack = platform_get_irq_byname(pdev, "ack"); - if (irq_ack < 0) { - dev_err(&pdev->dev, "required ack interrupt missing\n"); + if (irq_ack < 0) return irq_ack; - } irq_err = platform_get_irq_byname(pdev, "err"); - if (irq_err < 0) { - dev_err(&pdev->dev, "required err interrupt missing\n"); + if (irq_err < 0) return irq_err; - } irq_wakeup = platform_get_irq_byname(pdev, "wakeup"); - if (irq_wakeup < 0) { - dev_err(&pdev->dev, "required wakeup interrupt missing\n"); + if (irq_wakeup < 0) return irq_wakeup; - } match = of_match_device(qcom_rpm_of_match, &pdev->dev); if (!match) diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c index 9b9b06d36cb1..d5e34a5eb250 100644 --- a/drivers/mfd/sm501.c +++ b/drivers/mfd/sm501.c @@ -1394,10 +1394,8 @@ static int sm501_plat_probe(struct platform_device *dev) sm->platdata = dev_get_platdata(&dev->dev); ret = platform_get_irq(dev, 0); - if (ret < 0) { - dev_err(&dev->dev, "failed to get irq resource\n"); + if (ret < 0) goto err_res; - } sm->irq = ret; sm->io_res = platform_get_resource(dev, IORESOURCE_MEM, 1); From 9e38e690ace3e7a22a81fc02652fc101efb340cf Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Wed, 24 Jul 2019 13:54:12 +0530 Subject: [PATCH 0098/2880] PCI: tegra: Fix OF node reference leak Each iteration of for_each_child_of_node() executes of_node_put() on the previous node, but in some return paths in the middle of the loop of_node_put() is missing thus causing a reference leak. Hence stash these mid-loop return values in a variable 'err' and add a new label err_node_put which executes of_node_put() on the previous node and returns 'err' on failure. Change mid-loop return statements to point to jump to this label to fix the reference leak. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta [lorenzo.pieralisi@arm.com: rewrote commit log] Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pci-tegra.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index 9a917b2456f6..673a1725ef38 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -2237,14 +2237,15 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) err = of_pci_get_devfn(port); if (err < 0) { dev_err(dev, "failed to parse address: %d\n", err); - return err; + goto err_node_put; } index = PCI_SLOT(err); if (index < 1 || index > soc->num_ports) { dev_err(dev, "invalid port number: %d\n", index); - return -EINVAL; + err = -EINVAL; + goto err_node_put; } index--; @@ -2253,12 +2254,13 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) if (err < 0) { dev_err(dev, "failed to parse # of lanes: %d\n", err); - return err; + goto err_node_put; } if (value > 16) { dev_err(dev, "invalid # of lanes: %u\n", value); - return -EINVAL; + err = -EINVAL; + goto err_node_put; } lanes |= value << (index << 3); @@ -2272,13 +2274,15 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) lane += value; rp = devm_kzalloc(dev, sizeof(*rp), GFP_KERNEL); - if (!rp) - return -ENOMEM; + if (!rp) { + err = -ENOMEM; + goto err_node_put; + } err = of_address_to_resource(port, 0, &rp->regs); if (err < 0) { dev_err(dev, "failed to parse address: %d\n", err); - return err; + goto err_node_put; } INIT_LIST_HEAD(&rp->list); @@ -2330,6 +2334,10 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) return err; return 0; + +err_node_put: + of_node_put(port); + return err; } /* From 630ee1a50c40210ef021435faa0e80d17a904883 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 29 Jul 2019 10:10:33 -0500 Subject: [PATCH 0099/2880] watchdog: Mark expected switch fall-throughs Mark switch cases where we are expecting to fall through. This patch fixes the following warnings: drivers/watchdog/ar7_wdt.c: warning: this statement may fall through [-Wimplicit-fallthrough=]: => 237:3 drivers/watchdog/pcwd.c: warning: this statement may fall through [-Wimplicit-fallthrough=]: => 653:3 drivers/watchdog/sb_wdog.c: warning: this statement may fall through [-Wimplicit-fallthrough=]: => 204:3 drivers/watchdog/wdt.c: warning: this statement may fall through [-Wimplicit-fallthrough=]: => 391:3 Reported-by: Geert Uytterhoeven Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190729151033.GA10143@embeddedor Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ar7_wdt.c | 1 + drivers/watchdog/pcwd.c | 2 +- drivers/watchdog/sb_wdog.c | 1 + drivers/watchdog/wdt.c | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/ar7_wdt.c b/drivers/watchdog/ar7_wdt.c index b9b2d06b3879..668a1c704f28 100644 --- a/drivers/watchdog/ar7_wdt.c +++ b/drivers/watchdog/ar7_wdt.c @@ -235,6 +235,7 @@ static long ar7_wdt_ioctl(struct file *file, ar7_wdt_update_margin(new_margin); ar7_wdt_kick(1); spin_unlock(&wdt_lock); + /* Fall through */ case WDIOC_GETTIMEOUT: if (put_user(margin, (int *)arg)) diff --git a/drivers/watchdog/pcwd.c b/drivers/watchdog/pcwd.c index 1b2cf5b95a89..c3c93e00b320 100644 --- a/drivers/watchdog/pcwd.c +++ b/drivers/watchdog/pcwd.c @@ -651,7 +651,7 @@ static long pcwd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return -EINVAL; pcwd_keepalive(); - /* Fall */ + /* Fall through */ case WDIOC_GETTIMEOUT: return put_user(heartbeat, argp); diff --git a/drivers/watchdog/sb_wdog.c b/drivers/watchdog/sb_wdog.c index 5a6ced7a7e8f..202fc8d8ca5f 100644 --- a/drivers/watchdog/sb_wdog.c +++ b/drivers/watchdog/sb_wdog.c @@ -202,6 +202,7 @@ static long sbwdog_ioctl(struct file *file, unsigned int cmd, timeout = time; sbwdog_set(user_dog, timeout); sbwdog_pet(user_dog); + /* Fall through */ case WDIOC_GETTIMEOUT: /* diff --git a/drivers/watchdog/wdt.c b/drivers/watchdog/wdt.c index 0650100fad00..7d278b37e083 100644 --- a/drivers/watchdog/wdt.c +++ b/drivers/watchdog/wdt.c @@ -389,7 +389,7 @@ static long wdt_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (wdt_set_heartbeat(new_heartbeat)) return -EINVAL; wdt_ping(); - /* Fall */ + /* Fall through */ case WDIOC_GETTIMEOUT: return put_user(heartbeat, p); default: From ca58397c53ddba9544748dc0cbaef4ff34282466 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 29 Jul 2019 15:06:02 -0500 Subject: [PATCH 0100/2880] watchdog: scx200_wdt: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. This patch fixes the following warning (Building: i386): drivers/watchdog/scx200_wdt.c: In function ‘scx200_wdt_ioctl’: drivers/watchdog/scx200_wdt.c:188:3: warning: this statement may fall through [-Wimplicit-fallthrough=] scx200_wdt_ping(); ^~~~~~~~~~~~~~~~~ drivers/watchdog/scx200_wdt.c:189:2: note: here case WDIOC_GETTIMEOUT: ^~~~ Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190729200602.GA6854@embeddedor Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/scx200_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/scx200_wdt.c b/drivers/watchdog/scx200_wdt.c index efd7996694de..46268309ee9b 100644 --- a/drivers/watchdog/scx200_wdt.c +++ b/drivers/watchdog/scx200_wdt.c @@ -186,6 +186,7 @@ static long scx200_wdt_ioctl(struct file *file, unsigned int cmd, margin = new_margin; scx200_wdt_update_margin(); scx200_wdt_ping(); + /* Fall through */ case WDIOC_GETTIMEOUT: if (put_user(margin, p)) return -EFAULT; From 2c017640826a16eab385b756431ce560e44a7054 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 29 Jul 2019 17:31:59 -0500 Subject: [PATCH 0101/2880] watchdog: wdt977: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. This patch fixes the following warning (Building: arm): drivers/watchdog/wdt977.c: In function ‘wdt977_ioctl’: LD [M] drivers/media/platform/vicodec/vicodec.o drivers/watchdog/wdt977.c:400:3: warning: this statement may fall through [-Wimplicit-fallthrough=] wdt977_keepalive(); ^~~~~~~~~~~~~~~~~~ drivers/watchdog/wdt977.c:403:2: note: here case WDIOC_GETTIMEOUT: ^~~~ Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190729223159.GA20878@embeddedor Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/wdt977.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/wdt977.c b/drivers/watchdog/wdt977.c index 567005d7598e..5c52c73e1839 100644 --- a/drivers/watchdog/wdt977.c +++ b/drivers/watchdog/wdt977.c @@ -398,7 +398,7 @@ static long wdt977_ioctl(struct file *file, unsigned int cmd, return -EINVAL; wdt977_keepalive(); - /* Fall */ + /* Fall through */ case WDIOC_GETTIMEOUT: return put_user(timeout, uarg.i); From 4b4b8b03458ef33dc2545c038d3134ba668038a4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 29 Jul 2019 20:46:50 -0500 Subject: [PATCH 0102/2880] watchdog: riowd: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. This patch fixes the following warnings (Building: sparc64): drivers/watchdog/riowd.c: In function ‘riowd_ioctl’: drivers/watchdog/riowd.c:136:3: warning: this statement may fall through [-Wimplicit-fallthrough=] riowd_writereg(p, riowd_timeout, WDTO_INDEX); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/watchdog/riowd.c:139:2: note: here case WDIOC_GETTIMEOUT: ^~~~ Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190730014650.GA31309@embeddedor Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/riowd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c index 41a2a11535a6..b35f7be20c00 100644 --- a/drivers/watchdog/riowd.c +++ b/drivers/watchdog/riowd.c @@ -134,7 +134,7 @@ static long riowd_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EINVAL; riowd_timeout = (new_margin + 59) / 60; riowd_writereg(p, riowd_timeout, WDTO_INDEX); - /* Fall */ + /* Fall through */ case WDIOC_GETTIMEOUT: return put_user(riowd_timeout * 60, (int __user *)argp); From 26ae6a8e9b09a94a38e5351502bee0e801d4d8c6 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 30 Jul 2019 11:15:48 -0700 Subject: [PATCH 0103/2880] watchdog: Remove dev_err() usage after platform_get_irq() We don't need dev_err() messages when platform_get_irq() fails now that platform_get_irq() prints an error message itself when something goes wrong. Let's remove these prints with a simple semantic patch. // @@ expression ret; struct platform_device *E; @@ ret = ( platform_get_irq(E, ...) | platform_get_irq_byname(E, ...) ); if ( \( ret < 0 \| ret <= 0 \) ) { ( -if (ret != -EPROBE_DEFER) -{ ... -dev_err(...); -... } | ... -dev_err(...); ) ... } // While we're here, remove braces on if statements that only have one statement (manually). Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: linux-watchdog@vger.kernel.org Cc: Greg Kroah-Hartman Signed-off-by: Stephen Boyd Reviewed-by: Guenter Roeck Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sprd_wdt.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/watchdog/sprd_wdt.c b/drivers/watchdog/sprd_wdt.c index edba4e278685..0bb17b046140 100644 --- a/drivers/watchdog/sprd_wdt.c +++ b/drivers/watchdog/sprd_wdt.c @@ -284,10 +284,8 @@ static int sprd_wdt_probe(struct platform_device *pdev) } wdt->irq = platform_get_irq(pdev, 0); - if (wdt->irq < 0) { - dev_err(dev, "failed to get IRQ resource\n"); + if (wdt->irq < 0) return wdt->irq; - } ret = devm_request_irq(dev, wdt->irq, sprd_wdt_isr, IRQF_NO_SUSPEND, "sprd-wdt", (void *)wdt); From b18f22d02ad1a4cfc0ca348766da651f9119cf44 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 6 Aug 2019 02:39:53 -0500 Subject: [PATCH 0104/2880] watchdog: jz4740: Fix unused variable warning in jz4740_wdt_probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warning (Building: ci20_defconfig mips): drivers/watchdog/jz4740_wdt.c: In function ‘jz4740_wdt_probe’: drivers/watchdog/jz4740_wdt.c:165:6: warning: unused variable ‘ret’ [-Wunused-variable] int ret; ^~~ Fixes: 9ee644c9326c ("watchdog: jz4740_wdt: drop warning after registering device") Signed-off-by: Gustavo A. R. Silva Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190806073953.GA13685@embeddedor Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/jz4740_wdt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/watchdog/jz4740_wdt.c b/drivers/watchdog/jz4740_wdt.c index d4a90916dd38..c6052ae54f32 100644 --- a/drivers/watchdog/jz4740_wdt.c +++ b/drivers/watchdog/jz4740_wdt.c @@ -162,7 +162,6 @@ static int jz4740_wdt_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct jz4740_wdt_drvdata *drvdata; struct watchdog_device *jz4740_wdt; - int ret; drvdata = devm_kzalloc(dev, sizeof(struct jz4740_wdt_drvdata), GFP_KERNEL); From 708cb5cc3fde63a33afa0aaf7a26fd7ea9015dd3 Mon Sep 17 00:00:00 2001 From: Hsin-Hsiung Wang Date: Mon, 5 Aug 2019 13:21:49 +0800 Subject: [PATCH 0105/2880] mfd: mt6397: Rename macros to something more readable Signed-off-by: Hsin-Hsiung Wang Signed-off-by: Lee Jones --- drivers/mfd/mt6397-core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index 337bcccdb914..c0708622f41b 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -10,17 +10,17 @@ #include #include #include -#include #include -#include +#include #include +#include #define MT6397_RTC_BASE 0xe000 #define MT6397_RTC_SIZE 0x3e -#define MT6323_CID_CODE 0x23 -#define MT6391_CID_CODE 0x91 -#define MT6397_CID_CODE 0x97 +#define MT6323_CHIP_ID 0x23 +#define MT6391_CHIP_ID 0x91 +#define MT6397_CHIP_ID 0x97 static const struct resource mt6397_rtc_resources[] = { { @@ -290,7 +290,7 @@ static int mt6397_probe(struct platform_device *pdev) return pmic->irq; switch (id & 0xff) { - case MT6323_CID_CODE: + case MT6323_CHIP_ID: pmic->int_con[0] = MT6323_INT_CON0; pmic->int_con[1] = MT6323_INT_CON1; pmic->int_status[0] = MT6323_INT_STATUS0; @@ -304,8 +304,8 @@ static int mt6397_probe(struct platform_device *pdev) 0, pmic->irq_domain); break; - case MT6397_CID_CODE: - case MT6391_CID_CODE: + case MT6391_CHIP_ID: + case MT6397_CHIP_ID: pmic->int_con[0] = MT6397_INT_CON0; pmic->int_con[1] = MT6397_INT_CON1; pmic->int_status[0] = MT6397_INT_STATUS0; From a4872e80ce7d2a1844328176dbf279d0a2b89bdb Mon Sep 17 00:00:00 2001 From: Hsin-Hsiung Wang Date: Mon, 5 Aug 2019 13:21:50 +0800 Subject: [PATCH 0106/2880] mfd: mt6397: Extract IRQ related code from core driver In order to support different types of irq design, we decide to add separate irq drivers for different design and keep mt6397 mfd core simple and reusable to all generations of PMICs so far. Signed-off-by: Hsin-Hsiung Wang Signed-off-by: Lee Jones --- drivers/mfd/Makefile | 3 +- drivers/mfd/mt6397-core.c | 146 -------------------------- drivers/mfd/mt6397-irq.c | 181 ++++++++++++++++++++++++++++++++ include/linux/mfd/mt6397/core.h | 9 ++ 4 files changed, 192 insertions(+), 147 deletions(-) create mode 100644 drivers/mfd/mt6397-irq.c diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 446d5df7cacb..7b6a6aa4fe42 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -240,7 +240,8 @@ obj-$(CONFIG_INTEL_SOC_PMIC) += intel-soc-pmic.o obj-$(CONFIG_INTEL_SOC_PMIC_BXTWC) += intel_soc_pmic_bxtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC) += intel_soc_pmic_chtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI) += intel_soc_pmic_chtdc_ti.o -obj-$(CONFIG_MFD_MT6397) += mt6397-core.o +mt6397-objs := mt6397-core.o mt6397-irq.o +obj-$(CONFIG_MFD_MT6397) += mt6397.o obj-$(CONFIG_MFD_ALTERA_A10SR) += altera-a10sr.o obj-$(CONFIG_MFD_ALTERA_SYSMGR) += altera-sysmgr.o diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index c0708622f41b..93c888153b77 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -18,10 +18,6 @@ #define MT6397_RTC_BASE 0xe000 #define MT6397_RTC_SIZE 0x3e -#define MT6323_CHIP_ID 0x23 -#define MT6391_CHIP_ID 0x91 -#define MT6397_CHIP_ID 0x97 - static const struct resource mt6397_rtc_resources[] = { { .start = MT6397_RTC_BASE, @@ -86,148 +82,6 @@ static const struct mfd_cell mt6397_devs[] = { } }; -static void mt6397_irq_lock(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - - mutex_lock(&mt6397->irqlock); -} - -static void mt6397_irq_sync_unlock(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - - regmap_write(mt6397->regmap, mt6397->int_con[0], - mt6397->irq_masks_cur[0]); - regmap_write(mt6397->regmap, mt6397->int_con[1], - mt6397->irq_masks_cur[1]); - - mutex_unlock(&mt6397->irqlock); -} - -static void mt6397_irq_disable(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - int shift = data->hwirq & 0xf; - int reg = data->hwirq >> 4; - - mt6397->irq_masks_cur[reg] &= ~BIT(shift); -} - -static void mt6397_irq_enable(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - int shift = data->hwirq & 0xf; - int reg = data->hwirq >> 4; - - mt6397->irq_masks_cur[reg] |= BIT(shift); -} - -#ifdef CONFIG_PM_SLEEP -static int mt6397_irq_set_wake(struct irq_data *irq_data, unsigned int on) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(irq_data); - int shift = irq_data->hwirq & 0xf; - int reg = irq_data->hwirq >> 4; - - if (on) - mt6397->wake_mask[reg] |= BIT(shift); - else - mt6397->wake_mask[reg] &= ~BIT(shift); - - return 0; -} -#else -#define mt6397_irq_set_wake NULL -#endif - -static struct irq_chip mt6397_irq_chip = { - .name = "mt6397-irq", - .irq_bus_lock = mt6397_irq_lock, - .irq_bus_sync_unlock = mt6397_irq_sync_unlock, - .irq_enable = mt6397_irq_enable, - .irq_disable = mt6397_irq_disable, - .irq_set_wake = mt6397_irq_set_wake, -}; - -static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg, - int irqbase) -{ - unsigned int status; - int i, irq, ret; - - ret = regmap_read(mt6397->regmap, reg, &status); - if (ret) { - dev_err(mt6397->dev, "Failed to read irq status: %d\n", ret); - return; - } - - for (i = 0; i < 16; i++) { - if (status & BIT(i)) { - irq = irq_find_mapping(mt6397->irq_domain, irqbase + i); - if (irq) - handle_nested_irq(irq); - } - } - - regmap_write(mt6397->regmap, reg, status); -} - -static irqreturn_t mt6397_irq_thread(int irq, void *data) -{ - struct mt6397_chip *mt6397 = data; - - mt6397_irq_handle_reg(mt6397, mt6397->int_status[0], 0); - mt6397_irq_handle_reg(mt6397, mt6397->int_status[1], 16); - - return IRQ_HANDLED; -} - -static int mt6397_irq_domain_map(struct irq_domain *d, unsigned int irq, - irq_hw_number_t hw) -{ - struct mt6397_chip *mt6397 = d->host_data; - - irq_set_chip_data(irq, mt6397); - irq_set_chip_and_handler(irq, &mt6397_irq_chip, handle_level_irq); - irq_set_nested_thread(irq, 1); - irq_set_noprobe(irq); - - return 0; -} - -static const struct irq_domain_ops mt6397_irq_domain_ops = { - .map = mt6397_irq_domain_map, -}; - -static int mt6397_irq_init(struct mt6397_chip *mt6397) -{ - int ret; - - mutex_init(&mt6397->irqlock); - - /* Mask all interrupt sources */ - regmap_write(mt6397->regmap, mt6397->int_con[0], 0x0); - regmap_write(mt6397->regmap, mt6397->int_con[1], 0x0); - - mt6397->irq_domain = irq_domain_add_linear(mt6397->dev->of_node, - MT6397_IRQ_NR, &mt6397_irq_domain_ops, mt6397); - if (!mt6397->irq_domain) { - dev_err(mt6397->dev, "could not create irq domain\n"); - return -ENOMEM; - } - - ret = devm_request_threaded_irq(mt6397->dev, mt6397->irq, NULL, - mt6397_irq_thread, IRQF_ONESHOT, "mt6397-pmic", mt6397); - if (ret) { - dev_err(mt6397->dev, "failed to register irq=%d; err: %d\n", - mt6397->irq, ret); - return ret; - } - - return 0; -} - #ifdef CONFIG_PM_SLEEP static int mt6397_irq_suspend(struct device *dev) { diff --git a/drivers/mfd/mt6397-irq.c b/drivers/mfd/mt6397-irq.c new file mode 100644 index 000000000000..b2d3ce1f3115 --- /dev/null +++ b/drivers/mfd/mt6397-irq.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Copyright (c) 2019 MediaTek Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void mt6397_irq_lock(struct irq_data *data) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); + + mutex_lock(&mt6397->irqlock); +} + +static void mt6397_irq_sync_unlock(struct irq_data *data) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); + + regmap_write(mt6397->regmap, mt6397->int_con[0], + mt6397->irq_masks_cur[0]); + regmap_write(mt6397->regmap, mt6397->int_con[1], + mt6397->irq_masks_cur[1]); + + mutex_unlock(&mt6397->irqlock); +} + +static void mt6397_irq_disable(struct irq_data *data) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); + int shift = data->hwirq & 0xf; + int reg = data->hwirq >> 4; + + mt6397->irq_masks_cur[reg] &= ~BIT(shift); +} + +static void mt6397_irq_enable(struct irq_data *data) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); + int shift = data->hwirq & 0xf; + int reg = data->hwirq >> 4; + + mt6397->irq_masks_cur[reg] |= BIT(shift); +} + +#ifdef CONFIG_PM_SLEEP +static int mt6397_irq_set_wake(struct irq_data *irq_data, unsigned int on) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(irq_data); + int shift = irq_data->hwirq & 0xf; + int reg = irq_data->hwirq >> 4; + + if (on) + mt6397->wake_mask[reg] |= BIT(shift); + else + mt6397->wake_mask[reg] &= ~BIT(shift); + + return 0; +} +#else +#define mt6397_irq_set_wake NULL +#endif + +static struct irq_chip mt6397_irq_chip = { + .name = "mt6397-irq", + .irq_bus_lock = mt6397_irq_lock, + .irq_bus_sync_unlock = mt6397_irq_sync_unlock, + .irq_enable = mt6397_irq_enable, + .irq_disable = mt6397_irq_disable, + .irq_set_wake = mt6397_irq_set_wake, +}; + +static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg, + int irqbase) +{ + unsigned int status; + int i, irq, ret; + + ret = regmap_read(mt6397->regmap, reg, &status); + if (ret) { + dev_err(mt6397->dev, "Failed to read irq status: %d\n", ret); + return; + } + + for (i = 0; i < 16; i++) { + if (status & BIT(i)) { + irq = irq_find_mapping(mt6397->irq_domain, irqbase + i); + if (irq) + handle_nested_irq(irq); + } + } + + regmap_write(mt6397->regmap, reg, status); +} + +static irqreturn_t mt6397_irq_thread(int irq, void *data) +{ + struct mt6397_chip *mt6397 = data; + + mt6397_irq_handle_reg(mt6397, mt6397->int_status[0], 0); + mt6397_irq_handle_reg(mt6397, mt6397->int_status[1], 16); + + return IRQ_HANDLED; +} + +static int mt6397_irq_domain_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hw) +{ + struct mt6397_chip *mt6397 = d->host_data; + + irq_set_chip_data(irq, mt6397); + irq_set_chip_and_handler(irq, &mt6397_irq_chip, handle_level_irq); + irq_set_nested_thread(irq, 1); + irq_set_noprobe(irq); + + return 0; +} + +static const struct irq_domain_ops mt6397_irq_domain_ops = { + .map = mt6397_irq_domain_map, +}; + +int mt6397_irq_init(struct mt6397_chip *chip) +{ + int ret; + + mutex_init(&chip->irqlock); + + switch (chip->chip_id) { + case MT6323_CHIP_ID: + chip->int_con[0] = MT6323_INT_CON0; + chip->int_con[1] = MT6323_INT_CON1; + chip->int_status[0] = MT6323_INT_STATUS0; + chip->int_status[1] = MT6323_INT_STATUS1; + break; + + case MT6391_CHIP_ID: + case MT6397_CHIP_ID: + chip->int_con[0] = MT6397_INT_CON0; + chip->int_con[1] = MT6397_INT_CON1; + chip->int_status[0] = MT6397_INT_STATUS0; + chip->int_status[1] = MT6397_INT_STATUS1; + break; + + default: + dev_err(chip->dev, "unsupported chip: 0x%x\n", chip->chip_id); + return -ENODEV; + } + + /* Mask all interrupt sources */ + regmap_write(chip->regmap, chip->int_con[0], 0x0); + regmap_write(chip->regmap, chip->int_con[1], 0x0); + + chip->irq_domain = irq_domain_add_linear(chip->dev->of_node, + MT6397_IRQ_NR, + &mt6397_irq_domain_ops, + chip); + if (!chip->irq_domain) { + dev_err(chip->dev, "could not create irq domain\n"); + return -ENOMEM; + } + + ret = devm_request_threaded_irq(chip->dev, chip->irq, NULL, + mt6397_irq_thread, IRQF_ONESHOT, + "mt6397-pmic", chip); + if (ret) { + dev_err(chip->dev, "failed to register irq=%d; err: %d\n", + chip->irq, ret); + return ret; + } + + return 0; +} diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h index 25a95e72179b..9320c2a1e3e6 100644 --- a/include/linux/mfd/mt6397/core.h +++ b/include/linux/mfd/mt6397/core.h @@ -7,6 +7,12 @@ #ifndef __MFD_MT6397_CORE_H__ #define __MFD_MT6397_CORE_H__ +enum chip_id { + MT6323_CHIP_ID = 0x23, + MT6391_CHIP_ID = 0x91, + MT6397_CHIP_ID = 0x97, +}; + enum mt6397_irq_numbers { MT6397_IRQ_SPKL_AB = 0, MT6397_IRQ_SPKR_AB, @@ -54,6 +60,9 @@ struct mt6397_chip { u16 irq_masks_cache[2]; u16 int_con[2]; u16 int_status[2]; + u16 chip_id; }; +int mt6397_irq_init(struct mt6397_chip *chip); + #endif /* __MFD_MT6397_CORE_H__ */ From 533ca1feed98b0bf024779a14760694c7cb4d431 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 2 Aug 2019 22:50:20 +0000 Subject: [PATCH 0107/2880] PCI: hv: Avoid use of hv_pci_dev->pci_slot after freeing it The slot must be removed before the pci_dev is removed, otherwise a panic can happen due to use-after-free. Fixes: 15becc2b56c6 ("PCI: hv: Add hv_pci_remove_slots() when we unload the driver") Signed-off-by: Dexuan Cui Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 40b625458afa..2b53976cd9f9 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2701,8 +2701,8 @@ static int hv_pci_remove(struct hv_device *hdev) /* Remove the bus from PCI's point of view. */ pci_lock_rescan_remove(); pci_stop_root_bus(hbus->pci_bus); - pci_remove_root_bus(hbus->pci_bus); hv_pci_remove_slots(hbus); + pci_remove_root_bus(hbus->pci_bus); pci_unlock_rescan_remove(); hbus->state = hv_pcibus_removed; } From 448d5a55759a2e60208b75ceed4bd88cd4f5a963 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:15 +0530 Subject: [PATCH 0108/2880] PCI: Add #defines for some of PCIe spec r4.0 features Add #defines only for the Data Link Feature and Physical Layer 16.0 GT/s features as defined in PCIe spec r4.0, sec 7.7.4 for Data Link Feature and sec 7.7.5 for Physical Layer 16.0 GT/s. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Acked-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f28e562d7ca8..d28d0319d932 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -713,7 +713,9 @@ #define PCI_EXT_CAP_ID_DPC 0x1D /* Downstream Port Containment */ #define PCI_EXT_CAP_ID_L1SS 0x1E /* L1 PM Substates */ #define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ -#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PTM +#define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ +#define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ +#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_16GT #define PCI_EXT_CAP_DSN_SIZEOF 12 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40 @@ -1053,4 +1055,14 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +/* Data Link Feature */ +#define PCI_DLF_CAP 0x04 /* Capabilities Register */ +#define PCI_DLF_EXCHANGE_ENABLE 0x80000000 /* Data Link Feature Exchange Enable */ + +/* Physical Layer 16.0 GT/s */ +#define PCI_PL_16GT_LE_CTRL 0x20 /* Lane Equalization Control Register */ +#define PCI_PL_16GT_LE_CTRL_DSP_TX_PRESET_MASK 0x0000000F +#define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK 0x000000F0 +#define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT 4 + #endif /* LINUX_PCI_REGS_H */ From 8c7e96d3fe75bb41fadf23e7a494f37d1cd9906c Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:16 +0530 Subject: [PATCH 0109/2880] PCI: Disable MSI for Tegra root ports Tegra PCIe rootports don't generate MSI interrupts for PME and AER events. Since PCIe spec (Ref: r4.0 sec 7.7.1.2 and 7.7.2.2) doesn't support using a mix of INTx and MSI/MSI-X, MSI needs to be disabled to avoid root ports service drivers registering their respective ISRs with MSI interrupt and to let only INTx be used for all events. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding --- drivers/pci/quirks.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 208aacf39329..168782c5d23b 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2592,6 +2592,59 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_15, nvenet_msi_disable); +/* + * PCIe spec r4.0 sec 7.7.1.2 and sec 7.7.2.2 say that if MSI/MSI-X is enabled, + * then the device can't use INTx interrupts. Tegra's PCIe root ports don't + * generate MSI interrupts for PME and AER events instead only INTx interrupts + * are generated. Though Tegra's PCIe root ports can generate MSI interrupts + * for other events, since PCIe specificiation doesn't support using a mix of + * INTx and MSI/MSI-X, it is required to disable MSI interrupts to avoid port + * service drivers registering their respective ISRs for MSIs. + */ +static void pci_quirk_nvidia_tegra_disable_rp_msi(struct pci_dev *dev) +{ + dev->no_msi = 1; +} +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad0, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad1, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad2, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf0, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf1, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1c, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1d, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e12, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e13, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0fae, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0faf, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x10e5, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x10e6, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); + /* * Some versions of the MCP55 bridge from Nvidia have a legacy IRQ routing * config register. This register controls the routing of legacy From 3924bc2fd1b607faa5cb0bc0655e9853656de31a Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:17 +0530 Subject: [PATCH 0110/2880] PCI: dwc: Group DBI registers writes requiring unlocking Some of DesignWare core's DBI registers (a.k.a configuration space registers) are write-protected with a lock without enabling which they are read-only by default. These write-protected registers are implementation specific. Tegra194's BAR-0 register which is at offset 0x10 in the configuration space is an example. Current implementation in dw_pcie_setup_rc() API attempts to unlock those write-protected registers whenever they are updated and lock them back again for writing. Group all write-protected registers writes so that locking and unlocking is performed once to avoid bloating the code with multiple unlock/lock sequences for all those write-protected registers. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Acked-by: Jingoo Han --- drivers/pci/controller/dwc/pcie-designware-host.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index f93252d0da5b..d3156446ff27 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -628,6 +628,12 @@ void dw_pcie_setup_rc(struct pcie_port *pp) u32 val, ctrl, num_ctrls; struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + /* + * Enable DBI read-only registers for writing/updating configuration. + * Write permission gets disabled towards the end of this function. + */ + dw_pcie_dbi_ro_wr_en(pci); + dw_pcie_setup(pci); if (!pp->ops->msi_host_init) { @@ -650,12 +656,10 @@ void dw_pcie_setup_rc(struct pcie_port *pp) dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_1, 0x00000000); /* Setup interrupt pins */ - dw_pcie_dbi_ro_wr_en(pci); val = dw_pcie_readl_dbi(pci, PCI_INTERRUPT_LINE); val &= 0xffff00ff; val |= 0x00000100; dw_pcie_writel_dbi(pci, PCI_INTERRUPT_LINE, val); - dw_pcie_dbi_ro_wr_dis(pci); /* Setup bus numbers */ val = dw_pcie_readl_dbi(pci, PCI_PRIMARY_BUS); @@ -687,15 +691,13 @@ void dw_pcie_setup_rc(struct pcie_port *pp) dw_pcie_wr_own_conf(pp, PCI_BASE_ADDRESS_0, 4, 0); - /* Enable write permission for the DBI read-only register */ - dw_pcie_dbi_ro_wr_en(pci); /* Program correct class for RC */ dw_pcie_wr_own_conf(pp, PCI_CLASS_DEVICE, 2, PCI_CLASS_BRIDGE_PCI); - /* Better disable write permission right after the update */ - dw_pcie_dbi_ro_wr_dis(pci); dw_pcie_rd_own_conf(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, 4, &val); val |= PORT_LOGIC_SPEED_CHANGE; dw_pcie_wr_own_conf(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, 4, val); + + dw_pcie_dbi_ro_wr_dis(pci); } EXPORT_SYMBOL_GPL(dw_pcie_setup_rc); From 7a6854f6874ff416afa6552706a0011418f07888 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:18 +0530 Subject: [PATCH 0111/2880] PCI: dwc: Move config space capability search API Move PCIe config space capability search API to common DesignWare file as this can be used by both host and EP mode drivers. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Acked-by: Gustavo Pimentel --- .../pci/controller/dwc/pcie-designware-ep.c | 37 +----------------- drivers/pci/controller/dwc/pcie-designware.c | 39 +++++++++++++++++++ drivers/pci/controller/dwc/pcie-designware.h | 2 + 3 files changed, 43 insertions(+), 35 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 2bf5a35c0570..65f479250087 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -40,39 +40,6 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar) __dw_pcie_ep_reset_bar(pci, bar, 0); } -static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie *pci, u8 cap_ptr, - u8 cap) -{ - u8 cap_id, next_cap_ptr; - u16 reg; - - if (!cap_ptr) - return 0; - - reg = dw_pcie_readw_dbi(pci, cap_ptr); - cap_id = (reg & 0x00ff); - - if (cap_id > PCI_CAP_ID_MAX) - return 0; - - if (cap_id == cap) - return cap_ptr; - - next_cap_ptr = (reg & 0xff00) >> 8; - return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap); -} - -static u8 dw_pcie_ep_find_capability(struct dw_pcie *pci, u8 cap) -{ - u8 next_cap_ptr; - u16 reg; - - reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST); - next_cap_ptr = (reg & 0x00ff); - - return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap); -} - static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no, struct pci_epf_header *hdr) { @@ -612,9 +579,9 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) dev_err(dev, "Failed to reserve memory for MSI/MSI-X\n"); return -ENOMEM; } - ep->msi_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSI); + ep->msi_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSI); - ep->msix_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSIX); + ep->msix_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSIX); offset = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_REBAR); if (offset) { diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 7d25102c304c..7818b4febb08 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -14,6 +14,45 @@ #include "pcie-designware.h" +/* + * These interfaces resemble the pci_find_*capability() interfaces, but these + * are for configuring host controllers, which are bridges *to* PCI devices but + * are not PCI devices themselves. + */ +static u8 __dw_pcie_find_next_cap(struct dw_pcie *pci, u8 cap_ptr, + u8 cap) +{ + u8 cap_id, next_cap_ptr; + u16 reg; + + if (!cap_ptr) + return 0; + + reg = dw_pcie_readw_dbi(pci, cap_ptr); + cap_id = (reg & 0x00ff); + + if (cap_id > PCI_CAP_ID_MAX) + return 0; + + if (cap_id == cap) + return cap_ptr; + + next_cap_ptr = (reg & 0xff00) >> 8; + return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap); +} + +u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap) +{ + u8 next_cap_ptr; + u16 reg; + + reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST); + next_cap_ptr = (reg & 0x00ff); + + return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap); +} +EXPORT_SYMBOL_GPL(dw_pcie_find_capability); + int dw_pcie_read(void __iomem *addr, int size, u32 *val) { if (!IS_ALIGNED((uintptr_t)addr, size)) { diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index ffed084a0b4f..d8c66a6827dc 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -251,6 +251,8 @@ struct dw_pcie { #define to_dw_pcie_from_ep(endpoint) \ container_of((endpoint), struct dw_pcie, ep) +u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap); + int dw_pcie_read(void __iomem *addr, int size, u32 *val); int dw_pcie_write(void __iomem *addr, int size, u32 val); From 5b0841fa653f6c156500ecfe7e996837884363c4 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:19 +0530 Subject: [PATCH 0112/2880] PCI: dwc: Add extended configuration space capability search API Add extended configuration space capability search API using struct dw_pcie* pointer. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Acked-by: Gustavo Pimentel Acked-by: Thierry Reding --- drivers/pci/controller/dwc/pcie-designware.c | 41 ++++++++++++++++++++ drivers/pci/controller/dwc/pcie-designware.h | 1 + 2 files changed, 42 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 7818b4febb08..181449e342f1 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -53,6 +53,47 @@ u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap) } EXPORT_SYMBOL_GPL(dw_pcie_find_capability); +static u16 dw_pcie_find_next_ext_capability(struct dw_pcie *pci, u16 start, + u8 cap) +{ + u32 header; + int ttl; + int pos = PCI_CFG_SPACE_SIZE; + + /* minimum 8 bytes per capability */ + ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8; + + if (start) + pos = start; + + header = dw_pcie_readl_dbi(pci, pos); + /* + * If we have no capabilities, this is indicated by cap ID, + * cap version and next pointer all being 0. + */ + if (header == 0) + return 0; + + while (ttl-- > 0) { + if (PCI_EXT_CAP_ID(header) == cap && pos != start) + return pos; + + pos = PCI_EXT_CAP_NEXT(header); + if (pos < PCI_CFG_SPACE_SIZE) + break; + + header = dw_pcie_readl_dbi(pci, pos); + } + + return 0; +} + +u16 dw_pcie_find_ext_capability(struct dw_pcie *pci, u8 cap) +{ + return dw_pcie_find_next_ext_capability(pci, 0, cap); +} +EXPORT_SYMBOL_GPL(dw_pcie_find_ext_capability); + int dw_pcie_read(void __iomem *addr, int size, u32 *val) { if (!IS_ALIGNED((uintptr_t)addr, size)) { diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index d8c66a6827dc..11c223471416 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -252,6 +252,7 @@ struct dw_pcie { container_of((endpoint), struct dw_pcie, ep) u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap); +u16 dw_pcie_find_ext_capability(struct dw_pcie *pci, u8 cap); int dw_pcie_read(void __iomem *addr, int size, u32 *val); int dw_pcie_write(void __iomem *addr, int size, u32 val); From feee519ae55c8859a50eca0da83b1ef4ea45f65d Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:20 +0530 Subject: [PATCH 0113/2880] PCI: dwc: Export dw_pcie_wait_for_link() API Export the dw_pcie_wait_for_link() function to be able to build drivers using it as loadable modules. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-designware.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 181449e342f1..1d87e823de21 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -460,6 +460,7 @@ int dw_pcie_wait_for_link(struct dw_pcie *pci) return -ETIMEDOUT; } +EXPORT_SYMBOL_GPL(dw_pcie_wait_for_link); int dw_pcie_link_up(struct dw_pcie *pci) { From f4e84a5d9db7dfea96832157bc461177e87a47dd Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:21 +0530 Subject: [PATCH 0114/2880] dt-bindings: PCI: designware: Add binding for CDM register check Add support to enable CDM (Configuration Dependent Module) registers check for any data corruption. CDM registers include standard PCIe configuration space registers, Port Logic registers and iATU and DMA registers. Refer Section S.4 of Synopsys DesignWare Cores PCI Express Controller Databook Version 4.90a. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Reviewed-by: Rob Herring --- Documentation/devicetree/bindings/pci/designware-pcie.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt b/Documentation/devicetree/bindings/pci/designware-pcie.txt index 5561a1c060d0..3fba04da6a59 100644 --- a/Documentation/devicetree/bindings/pci/designware-pcie.txt +++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt @@ -34,6 +34,11 @@ Optional properties: - clock-names: Must include the following entries: - "pcie" - "pcie_bus" +- snps,enable-cdm-check: This is a boolean property and if present enables + automatic checking of CDM (Configuration Dependent Module) registers + for data corruption. CDM registers include standard PCIe configuration + space registers, Port Logic registers, DMA and iATU (internal Address + Translation Unit) registers. RC mode: - num-viewport: number of view ports configured in hardware. If a platform does not specify it, the driver assumes 2. From 07f123def73e07e398537be7a5b43a244cb0cbf3 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:22 +0530 Subject: [PATCH 0115/2880] PCI: dwc: Add support to enable CDM register check Add support to enable CDM (Configuration Dependent Module) register check for any data corruption based on the DT property 'snps,enable-cdm-check'. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Acked-by: Gustavo Pimentel --- drivers/pci/controller/dwc/pcie-designware.c | 7 +++++++ drivers/pci/controller/dwc/pcie-designware.h | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 1d87e823de21..59eaeeb21dbe 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -547,4 +547,11 @@ void dw_pcie_setup(struct dw_pcie *pci) break; } dw_pcie_writel_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL, val); + + if (of_property_read_bool(np, "snps,enable-cdm-check")) { + val = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS); + val |= PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS | + PCIE_PL_CHK_REG_CHK_REG_START; + dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, val); + } } diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 11c223471416..5a18e94e52c8 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -86,6 +86,15 @@ #define PCIE_MISC_CONTROL_1_OFF 0x8BC #define PCIE_DBI_RO_WR_EN BIT(0) +#define PCIE_PL_CHK_REG_CONTROL_STATUS 0xB20 +#define PCIE_PL_CHK_REG_CHK_REG_START BIT(0) +#define PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS BIT(1) +#define PCIE_PL_CHK_REG_CHK_REG_COMPARISON_ERROR BIT(16) +#define PCIE_PL_CHK_REG_CHK_REG_LOGIC_ERROR BIT(17) +#define PCIE_PL_CHK_REG_CHK_REG_COMPLETE BIT(18) + +#define PCIE_PL_CHK_REG_ERR_ADDR 0xB28 + /* * iATU Unroll-specific register definitions * From 4.80 core version the address translation will be made by unroll From 7b31a72e7195aebd620d504f28b97ad8398e95ec Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:23 +0530 Subject: [PATCH 0116/2880] dt-bindings: Add PCIe supports-clkreq property Some host controllers need to know the existence of clkreq signal routing to downstream devices to be able to advertise low power features like ASPM L1 substates. Without clkreq signal routing being present, enabling ASPM L1 substates might lead to downstream devices being disconnected from the bus. Hence a new device tree property 'supports-clkreq' is added to make such host controllers aware of clkreq signal routing to downstream devices. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Thierry Reding --- Documentation/devicetree/bindings/pci/pci.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/pci.txt b/Documentation/devicetree/bindings/pci/pci.txt index 2a5d91024059..29bcbd88f457 100644 --- a/Documentation/devicetree/bindings/pci/pci.txt +++ b/Documentation/devicetree/bindings/pci/pci.txt @@ -27,6 +27,11 @@ driver implementation may support the following properties: - reset-gpios: If present this property specifies PERST# GPIO. Host drivers can parse the GPIO and apply fundamental reset to endpoints. +- supports-clkreq: + If present this property specifies that CLKREQ signal routing exists from + root port to downstream device and host bridge drivers can do programming + which depends on CLKREQ signal existence. For example, programming root port + not to advertise ASPM L1 Sub-States support if there is no CLKREQ signal. PCI-PCI Bridge properties ------------------------- From 42c253ecaa53c0a2fee431018dd18d2fa797580f Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:24 +0530 Subject: [PATCH 0117/2880] dt-bindings: PCI: tegra: Add device tree support for Tegra194 Add support for Tegra194 PCIe controllers. These controllers are based on Synopsys DesignWare core IP. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thierry Reding --- .../bindings/pci/nvidia,tegra194-pcie.txt | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt new file mode 100644 index 000000000000..674e5adb2895 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt @@ -0,0 +1,155 @@ +NVIDIA Tegra PCIe controller (Synopsys DesignWare Core based) + +This PCIe host controller is based on the Synopsis Designware PCIe IP +and thus inherits all the common properties defined in designware-pcie.txt. + +Required properties: +- compatible: For Tegra19x, must contain "nvidia,tegra194-pcie". +- device_type: Must be "pci" +- power-domains: A phandle to the node that controls power to the respective + PCIe controller and a specifier name for the PCIe controller. Following are + the specifiers for the different PCIe controllers + TEGRA194_POWER_DOMAIN_PCIEX8B: C0 + TEGRA194_POWER_DOMAIN_PCIEX1A: C1 + TEGRA194_POWER_DOMAIN_PCIEX1A: C2 + TEGRA194_POWER_DOMAIN_PCIEX1A: C3 + TEGRA194_POWER_DOMAIN_PCIEX4A: C4 + TEGRA194_POWER_DOMAIN_PCIEX8A: C5 + these specifiers are defined in + "include/dt-bindings/power/tegra194-powergate.h" file. +- reg: A list of physical base address and length pairs for each set of + controller registers. Must contain an entry for each entry in the reg-names + property. +- reg-names: Must include the following entries: + "appl": Controller's application logic registers + "config": As per the definition in designware-pcie.txt + "atu_dma": iATU and DMA registers. This is where the iATU (internal Address + Translation Unit) registers of the PCIe core are made available + for SW access. + "dbi": The aperture where root port's own configuration registers are + available +- interrupts: A list of interrupt outputs of the controller. Must contain an + entry for each entry in the interrupt-names property. +- interrupt-names: Must include the following entries: + "intr": The Tegra interrupt that is asserted for controller interrupts + "msi": The Tegra interrupt that is asserted when an MSI is received +- bus-range: Range of bus numbers associated with this controller +- #address-cells: Address representation for root ports (must be 3) + - cell 0 specifies the bus and device numbers of the root port: + [23:16]: bus number + [15:11]: device number + - cell 1 denotes the upper 32 address bits and should be 0 + - cell 2 contains the lower 32 address bits and is used to translate to the + CPU address space +- #size-cells: Size representation for root ports (must be 2) +- ranges: Describes the translation of addresses for root ports and standard + PCI regions. The entries must be 7 cells each, where the first three cells + correspond to the address as described for the #address-cells property + above, the fourth and fifth cells are for the physical CPU address to + translate to and the sixth and seventh cells are as described for the + #size-cells property above. + - Entries setup the mapping for the standard I/O, memory and + prefetchable PCI regions. The first cell determines the type of region + that is setup: + - 0x81000000: I/O memory region + - 0x82000000: non-prefetchable memory region + - 0xc2000000: prefetchable memory region + Please refer to the standard PCI bus binding document for a more detailed + explanation. +- #interrupt-cells: Size representation for interrupts (must be 1) +- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties + Please refer to the standard PCI bus binding document for a more detailed + explanation. +- clocks: Must contain an entry for each entry in clock-names. + See ../clocks/clock-bindings.txt for details. +- clock-names: Must include the following entries: + - core +- resets: Must contain an entry for each entry in reset-names. + See ../reset/reset.txt for details. +- reset-names: Must include the following entries: + - apb + - core +- phys: Must contain a phandle to P2U PHY for each entry in phy-names. +- phy-names: Must include an entry for each active lane. + "p2u-N": where N ranges from 0 to one less than the total number of lanes +- nvidia,bpmp: Must contain a pair of phandle to BPMP controller node followed + by controller-id. Following are the controller ids for each controller. + 0: C0 + 1: C1 + 2: C2 + 3: C3 + 4: C4 + 5: C5 +- vddio-pex-ctl-supply: Regulator supply for PCIe side band signals + +Optional properties: +- supports-clkreq: Refer to Documentation/devicetree/bindings/pci/pci.txt +- nvidia,update-fc-fixup: This is a boolean property and needs to be present to + improve performance when a platform is designed in such a way that it + satisfies at least one of the following conditions thereby enabling root + port to exchange optimum number of FC (Flow Control) credits with + downstream devices + 1. If C0/C4/C5 run at x1/x2 link widths (irrespective of speed and MPS) + 2. If C0/C1/C2/C3/C4/C5 operate at their respective max link widths and + a) speed is Gen-2 and MPS is 256B + b) speed is >= Gen-3 with any MPS +- nvidia,aspm-cmrt-us: Common Mode Restore Time for proper operation of ASPM + to be specified in microseconds +- nvidia,aspm-pwr-on-t-us: Power On time for proper operation of ASPM to be + specified in microseconds +- nvidia,aspm-l0s-entrance-latency-us: ASPM L0s entrance latency to be + specified in microseconds + +Examples: +========= + +Tegra194: +-------- + + pcie@14180000 { + compatible = "nvidia,tegra194-pcie", "snps,dw-pcie"; + power-domains = <&bpmp TEGRA194_POWER_DOMAIN_PCIEX8B>; + reg = <0x00 0x14180000 0x0 0x00020000 /* appl registers (128K) */ + 0x00 0x38000000 0x0 0x00040000 /* configuration space (256K) */ + 0x00 0x38040000 0x0 0x00040000>; /* iATU_DMA reg space (256K) */ + reg-names = "appl", "config", "atu_dma"; + + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + num-lanes = <8>; + linux,pci-domain = <0>; + + clocks = <&bpmp TEGRA194_CLK_PEX0_CORE_0>; + clock-names = "core"; + + resets = <&bpmp TEGRA194_RESET_PEX0_CORE_0_APB>, + <&bpmp TEGRA194_RESET_PEX0_CORE_0>; + reset-names = "apb", "core"; + + interrupts = , /* controller interrupt */ + ; /* MSI interrupt */ + interrupt-names = "intr", "msi"; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>; + + nvidia,bpmp = <&bpmp 0>; + + supports-clkreq; + nvidia,aspm-cmrt-us = <60>; + nvidia,aspm-pwr-on-t-us = <20>; + nvidia,aspm-l0s-entrance-latency-us = <3>; + + bus-range = <0x0 0xff>; + ranges = <0x81000000 0x0 0x38100000 0x0 0x38100000 0x0 0x00100000 /* downstream I/O (1MB) */ + 0x82000000 0x0 0x38200000 0x0 0x38200000 0x0 0x01E00000 /* non-prefetchable memory (30MB) */ + 0xc2000000 0x18 0x00000000 0x18 0x00000000 0x4 0x00000000>; /* prefetchable memory (16GB) */ + + vddio-pex-ctl-supply = <&vdd_1v8ao>; + + phys = <&p2u_hsio_2>, <&p2u_hsio_3>, <&p2u_hsio_4>, + <&p2u_hsio_5>; + phy-names = "p2u-0", "p2u-1", "p2u-2", "p2u-3"; + }; From 813c6fbcb7756e96f96c5931d8830970ff50e3b0 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:25 +0530 Subject: [PATCH 0118/2880] dt-bindings: PHY: P2U: Add Tegra194 P2U block Add support for Tegra194 P2U (PIPE to UPHY) module block which is a glue module instantiated once for each PCIe lane between Synopsys DesignWare core based PCIe IP and Universal PHY block. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thierry Reding Acked-by: Kishon Vijay Abraham I --- .../bindings/phy/phy-tegra194-p2u.txt | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt diff --git a/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt b/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt new file mode 100644 index 000000000000..d23ff90baad5 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt @@ -0,0 +1,28 @@ +NVIDIA Tegra194 P2U binding + +Tegra194 has two PHY bricks namely HSIO (High Speed IO) and NVHS (NVIDIA High +Speed) each interfacing with 12 and 8 P2U instances respectively. +A P2U instance is a glue logic between Synopsys DesignWare Core PCIe IP's PIPE +interface and PHY of HSIO/NVHS bricks. Each P2U instance represents one PCIe +lane. + +Required properties: +- compatible: For Tegra19x, must contain "nvidia,tegra194-p2u". +- reg: Should be the physical address space and length of respective each P2U + instance. +- reg-names: Must include the entry "ctl". + +Required properties for PHY port node: +- #phy-cells: Defined by generic PHY bindings. Must be 0. + +Refer to phy/phy-bindings.txt for the generic PHY binding properties. + +Example: + +p2u_hsio_0: phy@3e10000 { + compatible = "nvidia,tegra194-p2u"; + reg = <0x03e10000 0x10000>; + reg-names = "ctl"; + + #phy-cells = <0>; +}; From 5dae15b21d36a2e6d7e92c6af39c33dea5c39cc3 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:26 +0530 Subject: [PATCH 0119/2880] phy: tegra: Add PCIe PIPE2UPHY support Synopsys DesignWare core based PCIe controllers in Tegra 194 SoC interface with Universal PHY (UPHY) module through a PIPE2UPHY (P2U) module. For each PCIe lane of a controller, there is a P2U unit instantiated at hardware level. This driver provides support for the programming required for each P2U that is going to be used for a PCIe controller. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I Acked-by: Thierry Reding --- drivers/phy/tegra/Kconfig | 7 ++ drivers/phy/tegra/Makefile | 1 + drivers/phy/tegra/phy-tegra194-p2u.c | 120 +++++++++++++++++++++++++++ 3 files changed, 128 insertions(+) create mode 100644 drivers/phy/tegra/phy-tegra194-p2u.c diff --git a/drivers/phy/tegra/Kconfig b/drivers/phy/tegra/Kconfig index e516967d695b..f9817c3ae85f 100644 --- a/drivers/phy/tegra/Kconfig +++ b/drivers/phy/tegra/Kconfig @@ -7,3 +7,10 @@ config PHY_TEGRA_XUSB To compile this driver as a module, choose M here: the module will be called phy-tegra-xusb. + +config PHY_TEGRA194_P2U + tristate "NVIDIA Tegra194 PIPE2UPHY PHY driver" + depends on ARCH_TEGRA_194_SOC || COMPILE_TEST + select GENERIC_PHY + help + Enable this to support the P2U (PIPE to UPHY) that is part of Tegra 19x SOCs. diff --git a/drivers/phy/tegra/Makefile b/drivers/phy/tegra/Makefile index 64ccaeacb631..320dd389f34d 100644 --- a/drivers/phy/tegra/Makefile +++ b/drivers/phy/tegra/Makefile @@ -6,3 +6,4 @@ phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_124_SOC) += xusb-tegra124.o phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_132_SOC) += xusb-tegra124.o phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_210_SOC) += xusb-tegra210.o phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_186_SOC) += xusb-tegra186.o +obj-$(CONFIG_PHY_TEGRA194_P2U) += phy-tegra194-p2u.o diff --git a/drivers/phy/tegra/phy-tegra194-p2u.c b/drivers/phy/tegra/phy-tegra194-p2u.c new file mode 100644 index 000000000000..7042bed9feaa --- /dev/null +++ b/drivers/phy/tegra/phy-tegra194-p2u.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * P2U (PIPE to UPHY) driver for Tegra T194 SoC + * + * Copyright (C) 2019 NVIDIA Corporation. + * + * Author: Vidya Sagar + */ + +#include +#include +#include +#include +#include +#include + +#define P2U_PERIODIC_EQ_CTRL_GEN3 0xc0 +#define P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN BIT(0) +#define P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN BIT(1) +#define P2U_PERIODIC_EQ_CTRL_GEN4 0xc4 +#define P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN BIT(1) + +#define P2U_RX_DEBOUNCE_TIME 0xa4 +#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK 0xffff +#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL 160 + +struct tegra_p2u { + void __iomem *base; +}; + +static inline void p2u_writel(struct tegra_p2u *phy, const u32 value, + const u32 reg) +{ + writel_relaxed(value, phy->base + reg); +} + +static inline u32 p2u_readl(struct tegra_p2u *phy, const u32 reg) +{ + return readl_relaxed(phy->base + reg); +} + +static int tegra_p2u_power_on(struct phy *x) +{ + struct tegra_p2u *phy = phy_get_drvdata(x); + u32 val; + + val = p2u_readl(phy, P2U_PERIODIC_EQ_CTRL_GEN3); + val &= ~P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN; + val |= P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN; + p2u_writel(phy, val, P2U_PERIODIC_EQ_CTRL_GEN3); + + val = p2u_readl(phy, P2U_PERIODIC_EQ_CTRL_GEN4); + val |= P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN; + p2u_writel(phy, val, P2U_PERIODIC_EQ_CTRL_GEN4); + + val = p2u_readl(phy, P2U_RX_DEBOUNCE_TIME); + val &= ~P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK; + val |= P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL; + p2u_writel(phy, val, P2U_RX_DEBOUNCE_TIME); + + return 0; +} + +static const struct phy_ops ops = { + .power_on = tegra_p2u_power_on, + .owner = THIS_MODULE, +}; + +static int tegra_p2u_probe(struct platform_device *pdev) +{ + struct phy_provider *phy_provider; + struct device *dev = &pdev->dev; + struct phy *generic_phy; + struct tegra_p2u *phy; + struct resource *res; + + phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL); + if (!phy) + return -ENOMEM; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ctl"); + phy->base = devm_ioremap_resource(dev, res); + if (IS_ERR(phy->base)) + return PTR_ERR(phy->base); + + platform_set_drvdata(pdev, phy); + + generic_phy = devm_phy_create(dev, NULL, &ops); + if (IS_ERR(generic_phy)) + return PTR_ERR(generic_phy); + + phy_set_drvdata(generic_phy, phy); + + phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate); + if (IS_ERR(phy_provider)) + return PTR_ERR(phy_provider); + + return 0; +} + +static const struct of_device_id tegra_p2u_id_table[] = { + { + .compatible = "nvidia,tegra194-p2u", + }, + {} +}; +MODULE_DEVICE_TABLE(of, tegra_p2u_id_table); + +static struct platform_driver tegra_p2u_driver = { + .probe = tegra_p2u_probe, + .driver = { + .name = "tegra194-p2u", + .of_match_table = tegra_p2u_id_table, + }, +}; +module_platform_driver(tegra_p2u_driver); + +MODULE_AUTHOR("Vidya Sagar "); +MODULE_DESCRIPTION("NVIDIA Tegra194 PIPE2UPHY PHY driver"); +MODULE_LICENSE("GPL v2"); From 40667d1bd822f7a9d961bac190cd36505de0de4e Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 13 Aug 2019 08:58:41 +0300 Subject: [PATCH 0120/2880] MAINTAINERS: altera-sysmgr: Fix typo in a filepath Fix typo (s/sysgmr/sysmgr/) in the header filepath. Fixes: f36e789a1f8d ("mfd: altera-sysmgr: Add SOCFPGA System Manager") Signed-off-by: Denis Efremov Signed-off-by: Lee Jones --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..99ac7a9921f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -723,7 +723,7 @@ ALTERA SYSTEM MANAGER DRIVER M: Thor Thayer S: Maintained F: drivers/mfd/altera-sysmgr.c -F: include/linux/mfd/altera-sysgmr.h +F: include/linux/mfd/altera-sysmgr.h ALTERA SYSTEM RESOURCE DRIVER FOR ARRIA10 DEVKIT M: Thor Thayer From 5cd690a308e86bcb10b8a0d6d42f153e82f695f6 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 14 Aug 2019 09:24:03 +0200 Subject: [PATCH 0121/2880] mfd: asic3: Include the right header This is a GPIO driver, use the appropriate header rather than the legacy header. Signed-off-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/asic3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index 83b18c998d6f..a6bd2134cea2 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include From fd5d16531a39322c3d7433d9f8a36203c9aaeddc Mon Sep 17 00:00:00 2001 From: Xiaowei Bao Date: Wed, 14 Aug 2019 10:03:29 +0800 Subject: [PATCH 0122/2880] PCI: layerscape: Add the bar_fixed_64bit property to the endpoint driver The layerscape PCIe controller have 4 BARs. BAR0 and BAR1 are 32bit, BAR2 and BAR4 are 64bit and that's a fixed hardware configuration. Set the bar_fixed_64bit variable accordingly. Signed-off-by: Xiaowei Bao [lorenzo.pieralisi@arm.com: commit log] Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I --- drivers/pci/controller/dwc/pci-layerscape-ep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index be61d96cc95e..ca9aa4501e7e 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -44,6 +44,7 @@ static const struct pci_epc_features ls_pcie_epc_features = { .linkup_notifier = false, .msi_capable = true, .msix_capable = false, + .bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4), }; static const struct pci_epc_features* From b5b24617987f522abb5c42851ae505557f2c84f8 Mon Sep 17 00:00:00 2001 From: Xiaowei Bao Date: Wed, 14 Aug 2019 10:03:30 +0800 Subject: [PATCH 0123/2880] PCI: layerscape: Add CONFIG_PCI_LAYERSCAPE_EP to build EP/RC separately Add CONFIG_PCI_LAYERSCAPE_EP so that endpoint and host controller drivers can be built separately. Signed-off-by: Xiaowei Bao Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/Kconfig | 20 ++++++++++++++++++-- drivers/pci/controller/dwc/Makefile | 3 ++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 6ea778ae4877..869c645dd135 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -131,13 +131,29 @@ config PCI_KEYSTONE_EP DesignWare core functions to implement the driver. config PCI_LAYERSCAPE - bool "Freescale Layerscape PCIe controller" + bool "Freescale Layerscape PCIe controller - Host mode" depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST) depends on PCI_MSI_IRQ_DOMAIN select MFD_SYSCON select PCIE_DW_HOST help - Say Y here if you want PCIe controller support on Layerscape SoCs. + Say Y here if you want to enable PCIe controller support on Layerscape + SoCs to work in Host mode. + This controller can work either as EP or RC. The RCW[HOST_AGT_PEX] + determines which PCIe controller works in EP mode and which PCIe + controller works in RC mode. + +config PCI_LAYERSCAPE_EP + bool "Freescale Layerscape PCIe controller - Endpoint mode" + depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST) + depends on PCI_ENDPOINT + select PCIE_DW_EP + help + Say Y here if you want to enable PCIe controller support on Layerscape + SoCs to work in Endpoint mode. + This controller can work either as EP or RC. The RCW[HOST_AGT_PEX] + determines which PCIe controller works in EP mode and which PCIe + controller works in RC mode. config PCI_HISI depends on OF && (ARM64 || COMPILE_TEST) diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index b085dfd4fab7..824fde7ae750 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o obj-$(CONFIG_PCI_IMX6) += pci-imx6.o obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o -obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o pci-layerscape-ep.o +obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o +obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o From af80559b4d9cd481c63505edbe425ff6bad4ab8d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 9 Aug 2019 17:40:47 +0200 Subject: [PATCH 0124/2880] i2c: replace i2c_new_secondary_device with an ERR_PTR variant In the general move to have i2c_new_*_device functions which return ERR_PTR instead of NULL, this patch converts i2c_new_secondary_device(). There are only few users, so this patch converts the I2C core and all users in one go. The function gets renamed to i2c_new_ancillary_device() so out-of-tree users will get a build failure to understand they need to adapt their error checking code. Signed-off-by: Wolfram Sang Reviewed-by: Kieran Bingham # adv748x Reviewed-by: Laurent Pinchart # adv7511 + adv7604 Reviewed-by: Hans Verkuil # adv7604 Signed-off-by: Wolfram Sang --- drivers/gpu/drm/bridge/adv7511/adv7511_drv.c | 18 ++++++++-------- drivers/i2c/i2c-core-base.c | 10 ++++----- drivers/media/i2c/adv748x/adv748x-core.c | 6 +++--- drivers/media/i2c/adv7604.c | 22 +++++++++++--------- include/linux/i2c.h | 2 +- 5 files changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c index f6d2681f6927..9e13e466e72c 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c @@ -981,10 +981,10 @@ static int adv7511_init_cec_regmap(struct adv7511 *adv) { int ret; - adv->i2c_cec = i2c_new_secondary_device(adv->i2c_main, "cec", + adv->i2c_cec = i2c_new_ancillary_device(adv->i2c_main, "cec", ADV7511_CEC_I2C_ADDR_DEFAULT); - if (!adv->i2c_cec) - return -EINVAL; + if (IS_ERR(adv->i2c_cec)) + return PTR_ERR(adv->i2c_cec); i2c_set_clientdata(adv->i2c_cec, adv); adv->regmap_cec = devm_regmap_init_i2c(adv->i2c_cec, @@ -1165,20 +1165,20 @@ static int adv7511_probe(struct i2c_client *i2c, const struct i2c_device_id *id) adv7511_packet_disable(adv7511, 0xffff); - adv7511->i2c_edid = i2c_new_secondary_device(i2c, "edid", + adv7511->i2c_edid = i2c_new_ancillary_device(i2c, "edid", ADV7511_EDID_I2C_ADDR_DEFAULT); - if (!adv7511->i2c_edid) { - ret = -EINVAL; + if (IS_ERR(adv7511->i2c_edid)) { + ret = PTR_ERR(adv7511->i2c_edid); goto uninit_regulators; } regmap_write(adv7511->regmap, ADV7511_REG_EDID_I2C_ADDR, adv7511->i2c_edid->addr << 1); - adv7511->i2c_packet = i2c_new_secondary_device(i2c, "packet", + adv7511->i2c_packet = i2c_new_ancillary_device(i2c, "packet", ADV7511_PACKET_I2C_ADDR_DEFAULT); - if (!adv7511->i2c_packet) { - ret = -EINVAL; + if (IS_ERR(adv7511->i2c_packet)) { + ret = PTR_ERR(adv7511->i2c_packet); goto err_i2c_unregister_edid; } diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index b4d84bd475da..ca10525e6d94 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -964,7 +964,7 @@ struct i2c_client *devm_i2c_new_dummy_device(struct device *dev, EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device); /** - * i2c_new_secondary_device - Helper to get the instantiated secondary address + * i2c_new_ancillary_device - Helper to get the instantiated secondary address * and create the associated device * @client: Handle to the primary client * @name: Handle to specify which secondary address to get @@ -983,9 +983,9 @@ EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device); * cell whose "reg-names" value matches the slave name. * * This returns the new i2c client, which should be saved for later use with - * i2c_unregister_device(); or NULL to indicate an error. + * i2c_unregister_device(); or an ERR_PTR to describe the error. */ -struct i2c_client *i2c_new_secondary_device(struct i2c_client *client, +struct i2c_client *i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr) { @@ -1000,9 +1000,9 @@ struct i2c_client *i2c_new_secondary_device(struct i2c_client *client, } dev_dbg(&client->adapter->dev, "Address for %s : 0x%x\n", name, addr); - return i2c_new_dummy(client->adapter, addr); + return i2c_new_dummy_device(client->adapter, addr); } -EXPORT_SYMBOL_GPL(i2c_new_secondary_device); +EXPORT_SYMBOL_GPL(i2c_new_ancillary_device); /* ------------------------------------------------------------------------- */ diff --git a/drivers/media/i2c/adv748x/adv748x-core.c b/drivers/media/i2c/adv748x/adv748x-core.c index f57cd77a32fa..2567de2b0037 100644 --- a/drivers/media/i2c/adv748x/adv748x-core.c +++ b/drivers/media/i2c/adv748x/adv748x-core.c @@ -183,14 +183,14 @@ static int adv748x_initialise_clients(struct adv748x_state *state) int ret; for (i = ADV748X_PAGE_DPLL; i < ADV748X_PAGE_MAX; ++i) { - state->i2c_clients[i] = i2c_new_secondary_device( + state->i2c_clients[i] = i2c_new_ancillary_device( state->client, adv748x_default_addresses[i].name, adv748x_default_addresses[i].default_addr); - if (state->i2c_clients[i] == NULL) { + if (IS_ERR(state->i2c_clients[i])) { adv_err(state, "failed to create i2c client %u\n", i); - return -ENOMEM; + return PTR_ERR(state->i2c_clients[i]); } ret = adv748x_configure_regmap(state, i); diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c index 28a84bf9f8a9..2dedd6ebb236 100644 --- a/drivers/media/i2c/adv7604.c +++ b/drivers/media/i2c/adv7604.c @@ -2862,10 +2862,8 @@ static void adv76xx_unregister_clients(struct adv76xx_state *state) { unsigned int i; - for (i = 1; i < ARRAY_SIZE(state->i2c_clients); ++i) { - if (state->i2c_clients[i]) - i2c_unregister_device(state->i2c_clients[i]); - } + for (i = 1; i < ARRAY_SIZE(state->i2c_clients); ++i) + i2c_unregister_device(state->i2c_clients[i]); } static struct i2c_client *adv76xx_dummy_client(struct v4l2_subdev *sd, @@ -2878,14 +2876,14 @@ static struct i2c_client *adv76xx_dummy_client(struct v4l2_subdev *sd, struct i2c_client *new_client; if (pdata && pdata->i2c_addresses[page]) - new_client = i2c_new_dummy(client->adapter, + new_client = i2c_new_dummy_device(client->adapter, pdata->i2c_addresses[page]); else - new_client = i2c_new_secondary_device(client, + new_client = i2c_new_ancillary_device(client, adv76xx_default_addresses[page].name, adv76xx_default_addresses[page].default_addr); - if (new_client) + if (!IS_ERR(new_client)) io_write(sd, io_reg, new_client->addr << 1); return new_client; @@ -3516,15 +3514,19 @@ static int adv76xx_probe(struct i2c_client *client, } for (i = 1; i < ADV76XX_PAGE_MAX; ++i) { + struct i2c_client *dummy_client; + if (!(BIT(i) & state->info->page_mask)) continue; - state->i2c_clients[i] = adv76xx_dummy_client(sd, i); - if (!state->i2c_clients[i]) { - err = -EINVAL; + dummy_client = adv76xx_dummy_client(sd, i); + if (IS_ERR(dummy_client)) { + err = PTR_ERR(dummy_client); v4l2_err(sd, "failed to create i2c client %u\n", i); goto err_i2c; } + + state->i2c_clients[i] = dummy_client; } INIT_DELAYED_WORK(&state->delayed_work_enable_hotplug, diff --git a/include/linux/i2c.h b/include/linux/i2c.h index fa5552c2307b..ebbe024dd9e0 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -473,7 +473,7 @@ extern struct i2c_client * devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address); extern struct i2c_client * -i2c_new_secondary_device(struct i2c_client *client, +i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr); From 4e4521f76ff96f83561555f76c50d25314320d10 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 9 Aug 2019 14:30:02 -0700 Subject: [PATCH 0125/2880] dt-bindings: i2c: rcar: Rename bindings documentation file Rename the bindings documentation file for R-Car I2C controller from i2c-rcar.txt to renesas,i2c.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/{i2c-rcar.txt => renesas,i2c.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-rcar.txt => renesas,i2c.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt b/Documentation/devicetree/bindings/i2c/renesas,i2c.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-rcar.txt rename to Documentation/devicetree/bindings/i2c/renesas,i2c.txt diff --git a/MAINTAINERS b/MAINTAINERS index b49db5ad0b64..a14ce44c33b3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13650,7 +13650,7 @@ F: drivers/iio/adc/rcar-gyroadc.c RENESAS R-CAR I2C DRIVERS M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-rcar.txt +F: Documentation/devicetree/bindings/i2c/renesas,i2c.txt F: Documentation/devicetree/bindings/i2c/renesas,iic.txt F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c From 747bee3574040246be18e2e9db709a7cb058b35b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 9 Aug 2019 14:30:04 -0700 Subject: [PATCH 0126/2880] dt-bindings: i2c: i2c-emev2: Rename bindings documentation file Rename the bindings documentation file for Renesas EMEV2 IIC controller from i2c-emev2.txt to renesas,iic-emev2.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../bindings/i2c/{i2c-emev2.txt => renesas,iic-emev2.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-emev2.txt => renesas,iic-emev2.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-emev2.txt b/Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-emev2.txt rename to Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt diff --git a/MAINTAINERS b/MAINTAINERS index a14ce44c33b3..f89f27a2d09a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13628,7 +13628,7 @@ F: drivers/clk/renesas/ RENESAS EMEV2 I2C DRIVER M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-emev2.txt +F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt F: drivers/i2c/busses/i2c-emev2.c RENESAS ETHERNET DRIVERS From 232219b9a464c2479c98aa589acb1bd3383ae9d6 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 13 Aug 2019 12:03:01 +0200 Subject: [PATCH 0127/2880] i2c-cht-wc: Fix lockdep warning When the kernel is build with lockdep support and the i2c-cht-wc driver is used, the following warning is shown: [ 66.674334] ====================================================== [ 66.674337] WARNING: possible circular locking dependency detected [ 66.674340] 5.3.0-rc4+ #83 Not tainted [ 66.674342] ------------------------------------------------------ [ 66.674345] systemd-udevd/1232 is trying to acquire lock: [ 66.674349] 00000000a74dab07 (intel_soc_pmic_chtwc:167:(&cht_wc_regmap_cfg)->lock){+.+.}, at: regmap_write+0x31/0x70 [ 66.674360] but task is already holding lock: [ 66.674362] 00000000d44a85b7 (i2c_register_adapter){+.+.}, at: i2c_smbus_xfer+0x49/0xf0 [ 66.674370] which lock already depends on the new lock. [ 66.674371] the existing dependency chain (in reverse order) is: [ 66.674374] -> #1 (i2c_register_adapter){+.+.}: [ 66.674381] rt_mutex_lock_nested+0x46/0x60 [ 66.674384] i2c_smbus_xfer+0x49/0xf0 [ 66.674387] i2c_smbus_read_byte_data+0x45/0x70 [ 66.674391] cht_wc_byte_reg_read+0x35/0x50 [ 66.674394] _regmap_read+0x63/0x1a0 [ 66.674396] _regmap_update_bits+0xa8/0xe0 [ 66.674399] regmap_update_bits_base+0x63/0xa0 [ 66.674403] regmap_irq_update_bits.isra.0+0x3b/0x50 [ 66.674406] regmap_add_irq_chip+0x592/0x7a0 [ 66.674409] devm_regmap_add_irq_chip+0x89/0xed [ 66.674412] cht_wc_probe+0x102/0x158 [ 66.674415] i2c_device_probe+0x95/0x250 [ 66.674419] really_probe+0xf3/0x380 [ 66.674422] driver_probe_device+0x59/0xd0 [ 66.674425] device_driver_attach+0x53/0x60 [ 66.674428] __driver_attach+0x92/0x150 [ 66.674431] bus_for_each_dev+0x7d/0xc0 [ 66.674434] bus_add_driver+0x14d/0x1f0 [ 66.674437] driver_register+0x6d/0xb0 [ 66.674440] i2c_register_driver+0x45/0x80 [ 66.674445] do_one_initcall+0x60/0x2f4 [ 66.674450] kernel_init_freeable+0x20d/0x2b4 [ 66.674453] kernel_init+0xa/0x10c [ 66.674457] ret_from_fork+0x3a/0x50 [ 66.674459] -> #0 (intel_soc_pmic_chtwc:167:(&cht_wc_regmap_cfg)->lock){+.+.}: [ 66.674465] __lock_acquire+0xe07/0x1930 [ 66.674468] lock_acquire+0x9d/0x1a0 [ 66.674472] __mutex_lock+0xa8/0x9a0 [ 66.674474] regmap_write+0x31/0x70 [ 66.674480] cht_wc_i2c_adap_smbus_xfer+0x72/0x240 [i2c_cht_wc] [ 66.674483] __i2c_smbus_xfer+0x1a3/0x640 [ 66.674486] i2c_smbus_xfer+0x67/0xf0 [ 66.674489] i2c_smbus_read_byte_data+0x45/0x70 [ 66.674494] bq24190_probe+0x26b/0x410 [bq24190_charger] [ 66.674497] i2c_device_probe+0x189/0x250 [ 66.674500] really_probe+0xf3/0x380 [ 66.674503] driver_probe_device+0x59/0xd0 [ 66.674506] device_driver_attach+0x53/0x60 [ 66.674509] __driver_attach+0x92/0x150 [ 66.674512] bus_for_each_dev+0x7d/0xc0 [ 66.674515] bus_add_driver+0x14d/0x1f0 [ 66.674518] driver_register+0x6d/0xb0 [ 66.674521] i2c_register_driver+0x45/0x80 [ 66.674524] do_one_initcall+0x60/0x2f4 [ 66.674528] do_init_module+0x5c/0x230 [ 66.674531] load_module+0x2707/0x2a20 [ 66.674534] __do_sys_init_module+0x188/0x1b0 [ 66.674537] do_syscall_64+0x5c/0xb0 [ 66.674541] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 66.674543] other info that might help us debug this: [ 66.674545] Possible unsafe locking scenario: [ 66.674547] CPU0 CPU1 [ 66.674548] ---- ---- [ 66.674550] lock(i2c_register_adapter); [ 66.674553] lock(intel_soc_pmic_chtwc:167:(&cht_wc_regmap_cfg)->lock); [ 66.674556] lock(i2c_register_adapter); [ 66.674559] lock(intel_soc_pmic_chtwc:167:(&cht_wc_regmap_cfg)->lock); [ 66.674561] *** DEADLOCK *** The problem is that the CHT Whiskey Cove PMIC's builtin i2c-adapter is itself a part of an i2c-client (the PMIC). This means that transfers done through it take adapter->bus_lock twice, once for the parent i2c-adapter and once for its own bus_lock. Lockdep does not like this nested locking. To make lockdep happy in the case of busses with muxes, the i2c-core's i2c_adapter_lock_bus function calls: rt_mutex_lock_nested(&adapter->bus_lock, i2c_adapter_depth(adapter)); But i2c_adapter_depth only works when the direct parent of the adapter is another adapter, as it is only meant for muxes. In this case there is an i2c-client and MFD instantiated platform_device in the parent->child chain between the 2 devices. This commit overrides the default i2c_lock_operations, passing a hardcoded depth of 1 to rt_mutex_lock_nested, making lockdep happy. Note that if there were to be a mux attached to the i2c-wc-cht adapter, this would break things again since the i2c-mux code expects the root-adapter to have a locking depth of 0. But the i2c-wc-cht adapter always has only 1 client directly attached in the form of the charger IC paired with the CHT Whiskey Cove PMIC. Signed-off-by: Hans de Goede Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-cht-wc.c | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c index 66af44bfa67d..f6546de66fbc 100644 --- a/drivers/i2c/busses/i2c-cht-wc.c +++ b/drivers/i2c/busses/i2c-cht-wc.c @@ -178,6 +178,51 @@ static const struct i2c_algorithm cht_wc_i2c_adap_algo = { .smbus_xfer = cht_wc_i2c_adap_smbus_xfer, }; +/* + * We are an i2c-adapter which itself is part of an i2c-client. This means that + * transfers done through us take adapter->bus_lock twice, once for our parent + * i2c-adapter and once to take our own bus_lock. Lockdep does not like this + * nested locking, to make lockdep happy in the case of busses with muxes, the + * i2c-core's i2c_adapter_lock_bus function calls: + * rt_mutex_lock_nested(&adapter->bus_lock, i2c_adapter_depth(adapter)); + * + * But i2c_adapter_depth only works when the direct parent of the adapter is + * another adapter, as it is only meant for muxes. In our case there is an + * i2c-client and MFD instantiated platform_device in the parent->child chain + * between the 2 devices. + * + * So we override the default i2c_lock_operations and pass a hardcoded + * depth of 1 to rt_mutex_lock_nested, to make lockdep happy. + * + * Note that if there were to be a mux attached to our adapter, this would + * break things again since the i2c-mux code expects the root-adapter to have + * a locking depth of 0. But we always have only 1 client directly attached + * in the form of the Charger IC paired with the CHT Whiskey Cove PMIC. + */ +static void cht_wc_i2c_adap_lock_bus(struct i2c_adapter *adapter, + unsigned int flags) +{ + rt_mutex_lock_nested(&adapter->bus_lock, 1); +} + +static int cht_wc_i2c_adap_trylock_bus(struct i2c_adapter *adapter, + unsigned int flags) +{ + return rt_mutex_trylock(&adapter->bus_lock); +} + +static void cht_wc_i2c_adap_unlock_bus(struct i2c_adapter *adapter, + unsigned int flags) +{ + rt_mutex_unlock(&adapter->bus_lock); +} + +static const struct i2c_lock_operations cht_wc_i2c_adap_lock_ops = { + .lock_bus = cht_wc_i2c_adap_lock_bus, + .trylock_bus = cht_wc_i2c_adap_trylock_bus, + .unlock_bus = cht_wc_i2c_adap_unlock_bus, +}; + /**** irqchip for the client connected to the extchgr i2c adapter ****/ static void cht_wc_i2c_irq_lock(struct irq_data *data) { @@ -286,6 +331,7 @@ static int cht_wc_i2c_adap_i2c_probe(struct platform_device *pdev) adap->adapter.owner = THIS_MODULE; adap->adapter.class = I2C_CLASS_HWMON; adap->adapter.algo = &cht_wc_i2c_adap_algo; + adap->adapter.lock_ops = &cht_wc_i2c_adap_lock_ops; strlcpy(adap->adapter.name, "PMIC I2C Adapter", sizeof(adap->adapter.name)); adap->adapter.dev.parent = &pdev->dev; From f58ba5e3f6863ea4486952698898848a6db726c2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 12 Jul 2019 08:53:19 -0700 Subject: [PATCH 0128/2880] PCI: pci-hyperv: Fix build errors on non-SYSFS config Fix build errors when building almost-allmodconfig but with SYSFS not set (not enabled). Fixes these build errors: ERROR: "pci_destroy_slot" [drivers/pci/controller/pci-hyperv.ko] undefined! ERROR: "pci_create_slot" [drivers/pci/controller/pci-hyperv.ko] undefined! drivers/pci/slot.o is only built when SYSFS is enabled, so pci-hyperv.o has an implicit dependency on SYSFS. Make that explicit. Also, depending on X86 && X86_64 is not needed, so just change that to depend on X86_64. Fixes: a15f2c08c708 ("PCI: hv: support reporting serial number as slot information") Signed-off-by: Randy Dunlap Signed-off-by: Lorenzo Pieralisi Reviewed-by: Haiyang Zhang Cc: Matthew Wilcox Cc: Jake Oshins Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Stephen Hemminger Cc: Sasha Levin Cc: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Cc: linux-hyperv@vger.kernel.org Cc: Dexuan Cui --- drivers/pci/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2ab92409210a..297bf928d652 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -181,7 +181,7 @@ config PCI_LABEL config PCI_HYPERV tristate "Hyper-V PCI Frontend" - depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64 + depends on X86_64 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && SYSFS help The PCI device frontend driver allows the kernel to import arbitrary PCI devices from a PCI backend to support PCI driver domains. From 075af61c19cdc32b94fd697c610c9a07fa21866e Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Fri, 26 Jul 2019 16:40:07 +0200 Subject: [PATCH 0129/2880] PCI: imx6: Limit DBI register length Define the length of the DBI registers and limit config space to its length. This makes sure that the kernel does not access registers beyond that point, avoiding the following abort on a i.MX 6Quad: # cat /sys/devices/soc0/soc/1ffc000.pcie/pci0000\:00/0000\:00\:00.0/config [ 100.021433] Unhandled fault: imprecise external abort (0x1406) at 0xb6ea7000 ... [ 100.056423] PC is at dw_pcie_read+0x50/0x84 [ 100.060790] LR is at dw_pcie_rd_own_conf+0x44/0x48 ... Signed-off-by: Stefan Agner Signed-off-by: Lorenzo Pieralisi Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 33 +++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 9b5cb5b70389..8b8efa3063f5 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -57,6 +57,7 @@ enum imx6_pcie_variants { struct imx6_pcie_drvdata { enum imx6_pcie_variants variant; u32 flags; + int dbi_length; }; struct imx6_pcie { @@ -1212,6 +1213,7 @@ static const struct imx6_pcie_drvdata drvdata[] = { .variant = IMX6Q, .flags = IMX6_PCIE_FLAG_IMX6_PHY | IMX6_PCIE_FLAG_IMX6_SPEED_CHANGE, + .dbi_length = 0x200, }, [IMX6SX] = { .variant = IMX6SX, @@ -1254,6 +1256,37 @@ static struct platform_driver imx6_pcie_driver = { .shutdown = imx6_pcie_shutdown, }; +static void imx6_pcie_quirk(struct pci_dev *dev) +{ + struct pci_bus *bus = dev->bus; + struct pcie_port *pp = bus->sysdata; + + /* Bus parent is the PCI bridge, its parent is this platform driver */ + if (!bus->dev.parent || !bus->dev.parent->parent) + return; + + /* Make sure we only quirk devices associated with this driver */ + if (bus->dev.parent->parent->driver != &imx6_pcie_driver.driver) + return; + + if (bus->number == pp->root_bus_nr) { + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct imx6_pcie *imx6_pcie = to_imx6_pcie(pci); + + /* + * Limit config length to avoid the kernel reading beyond + * the register set and causing an abort on i.MX 6Quad + */ + if (imx6_pcie->drvdata->dbi_length) { + dev->cfg_size = imx6_pcie->drvdata->dbi_length; + dev_info(&dev->dev, "Limiting cfg_size to %d\n", + dev->cfg_size); + } + } +} +DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_SYNOPSYS, 0xabcd, + PCI_CLASS_BRIDGE_PCI, 8, imx6_pcie_quirk); + static int __init imx6_pcie_init(void) { #ifdef CONFIG_ARM From 5336c17928cc464845ff765ce45b368c22f848e0 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 15 Aug 2019 16:24:56 +0800 Subject: [PATCH 0130/2880] csky: Fixup ioremap function losing Implement the following apis to meet usage in different scenarios. - ioremap (NonCache + StrongOrder) - ioremap_nocache (NonCache + StrongOrder) - ioremap_wc (NonCache + WeakOrder ) - ioremap_cache ( Cache + WeakOrder ) Also change flag VM_ALLOC to VM_IOREMAP in get_vm_area_caller. Signed-off-by: Guo Ren Cc: Arnd Bergmann Cc: Christoph Hellwig --- arch/csky/include/asm/io.h | 23 ++++++++++++----------- arch/csky/mm/ioremap.c | 23 +++++++++++++++++------ 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/arch/csky/include/asm/io.h b/arch/csky/include/asm/io.h index c1dfa9c10e36..80d071e2567f 100644 --- a/arch/csky/include/asm/io.h +++ b/arch/csky/include/asm/io.h @@ -4,17 +4,10 @@ #ifndef __ASM_CSKY_IO_H #define __ASM_CSKY_IO_H -#include +#include #include #include -extern void __iomem *ioremap(phys_addr_t offset, size_t size); - -extern void iounmap(void *addr); - -extern int remap_area_pages(unsigned long address, phys_addr_t phys_addr, - size_t size, unsigned long flags); - /* * I/O memory access primitives. Reads are ordered relative to any * following Normal memory access. Writes are ordered relative to any prior @@ -40,9 +33,17 @@ extern int remap_area_pages(unsigned long address, phys_addr_t phys_addr, #define writel(v,c) ({ wmb(); writel_relaxed((v),(c)); mb(); }) #endif -#define ioremap_nocache(phy, sz) ioremap(phy, sz) -#define ioremap_wc ioremap_nocache -#define ioremap_wt ioremap_nocache +/* + * I/O memory mapping functions. + */ +extern void __iomem *ioremap_cache(phys_addr_t addr, size_t size); +extern void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot); +extern void iounmap(void *addr); + +#define ioremap(addr, size) __ioremap((addr), (size), pgprot_noncached(PAGE_KERNEL)) +#define ioremap_wc(addr, size) __ioremap((addr), (size), pgprot_writecombine(PAGE_KERNEL)) +#define ioremap_nocache(addr, size) ioremap((addr), (size)) +#define ioremap_cache ioremap_cache #include diff --git a/arch/csky/mm/ioremap.c b/arch/csky/mm/ioremap.c index 48531115fd9d..e13cd3497628 100644 --- a/arch/csky/mm/ioremap.c +++ b/arch/csky/mm/ioremap.c @@ -8,12 +8,12 @@ #include -void __iomem *ioremap(phys_addr_t addr, size_t size) +static void __iomem *__ioremap_caller(phys_addr_t addr, size_t size, + pgprot_t prot, void *caller) { phys_addr_t last_addr; unsigned long offset, vaddr; struct vm_struct *area; - pgprot_t prot; last_addr = addr + size - 1; if (!size || last_addr < addr) @@ -23,14 +23,12 @@ void __iomem *ioremap(phys_addr_t addr, size_t size) addr &= PAGE_MASK; size = PAGE_ALIGN(size + offset); - area = get_vm_area_caller(size, VM_ALLOC, __builtin_return_address(0)); + area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) return NULL; vaddr = (unsigned long)area->addr; - prot = pgprot_noncached(PAGE_KERNEL); - if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { free_vm_area(area); return NULL; @@ -38,7 +36,20 @@ void __iomem *ioremap(phys_addr_t addr, size_t size) return (void __iomem *)(vaddr + offset); } -EXPORT_SYMBOL(ioremap); + +void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot) +{ + return __ioremap_caller(phys_addr, size, prot, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(__ioremap); + +void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) +{ + return __ioremap_caller(phys_addr, size, PAGE_KERNEL, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_cache); void iounmap(void __iomem *addr) { From a6e6fe6549f609f5c00c91e988da9d25f8ae8604 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:35 -0600 Subject: [PATCH 0131/2880] PCI/P2PDMA: Introduce private pagemap structure Move the PCI bus offset from the generic dev_pagemap structure to a specific pci_p2pdma_pagemap structure. This structure will grow in subsequent patches. Link: https://lore.kernel.org/r/20190730163545.4915-2-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-2-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 26 ++++++++++++++++++++------ include/linux/memremap.h | 1 - 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 234476226529..03e9c887bdfb 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -25,6 +25,16 @@ struct pci_p2pdma { bool p2pmem_published; }; +struct pci_p2pdma_pagemap { + struct dev_pagemap pgmap; + u64 bus_offset; +}; + +static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) +{ + return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap); +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -135,6 +145,7 @@ out: int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { + struct pci_p2pdma_pagemap *p2p_pgmap; struct dev_pagemap *pgmap; void *addr; int error; @@ -157,14 +168,17 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, return error; } - pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL); - if (!pgmap) + p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL); + if (!p2p_pgmap) return -ENOMEM; + + pgmap = &p2p_pgmap->pgmap; pgmap->res.start = pci_resource_start(pdev, bar) + offset; pgmap->res.end = pgmap->res.start + size - 1; pgmap->res.flags = pci_resource_flags(pdev, bar); pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; - pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) - + + p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); addr = devm_memremap_pages(&pdev->dev, pgmap); @@ -720,7 +734,7 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish); int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir) { - struct dev_pagemap *pgmap; + struct pci_p2pdma_pagemap *p2p_pgmap; struct scatterlist *s; phys_addr_t paddr; int i; @@ -736,10 +750,10 @@ int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return 0; for_each_sg(sg, s, nents, i) { - pgmap = sg_page(s)->pgmap; + p2p_pgmap = to_p2p_pgmap(sg_page(s)->pgmap); paddr = sg_phys(s); - s->dma_address = paddr - pgmap->pci_p2pdma_bus_offset; + s->dma_address = paddr - p2p_pgmap->bus_offset; sg_dma_len(s) = s->length; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8a5b2a19945..b459518ce475 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -112,7 +112,6 @@ struct dev_pagemap { struct device *dev; enum memory_type type; unsigned int flags; - u64 pci_p2pdma_bus_offset; const struct dev_pagemap_ops *ops; }; From 0afea3814358c97e2964710ba16fbb0b54ceda0e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:36 -0600 Subject: [PATCH 0132/2880] PCI/P2PDMA: Add provider's pci_dev to pci_p2pdma_pagemap struct The provider will be needed to figure out how to map a device. Link: https://lore.kernel.org/r/20190730163545.4915-3-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-3-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 03e9c887bdfb..93fbda14f4a9 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -27,6 +27,7 @@ struct pci_p2pdma { struct pci_p2pdma_pagemap { struct dev_pagemap pgmap; + struct pci_dev *provider; u64 bus_offset; }; @@ -178,6 +179,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->res.flags = pci_resource_flags(pdev, bar); pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; + p2p_pgmap->provider = pdev; p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); From 72583084e3fe333c27bc37527efe455d27eaf0b1 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:37 -0600 Subject: [PATCH 0133/2880] PCI/P2PDMA: Add constants for map type results to upstream_bridge_distance() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add constant flags to indicate how two devices will be mapped or if they are unsupported. upstream_bridge_distance() will now return the mapping type and the distance in a passed-by-reference argument. This helps annotate the code better, but the main reason is so we can use the information to store the required mapping method in an xarray. Link: https://lore.kernel.org/r/20190730163545.4915-4-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-4-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 95 +++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 93fbda14f4a9..8f0688201aec 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -20,6 +20,13 @@ #include #include +enum pci_p2pdma_map_type { + PCI_P2PDMA_MAP_UNKNOWN = 0, + PCI_P2PDMA_MAP_NOT_SUPPORTED, + PCI_P2PDMA_MAP_BUS_ADDR, + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; + struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; @@ -313,34 +320,33 @@ static bool root_complex_whitelist(struct pci_dev *dev) * port of the switch, to the common upstream port, back up to the second * downstream port and then to Device B. * - * Any two devices that don't have a common upstream bridge will return -1. - * In this way devices on separate PCIe root ports will be rejected, which - * is what we want for peer-to-peer seeing each PCIe root port defines a - * separate hierarchy domain and there's no way to determine whether the root - * complex supports forwarding between them. + * Any two devices that cannot communicate using p2pdma will return + * PCI_P2PDMA_MAP_NOT_SUPPORTED. * - * In the case where two devices are connected to different PCIe switches, - * this function will still return a positive distance as long as both - * switches eventually have a common upstream bridge. Note this covers - * the case of using multiple PCIe switches to achieve a desired level of - * fan-out from a root port. The exact distance will be a function of the - * number of switches between Device A and Device B. + * Any two devices that have a data path that goes through the host bridge + * will consult a whitelist. If the host bridges are on the whitelist, + * this function will return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE. * - * If a bridge which has any ACS redirection bits set is in the path - * then this functions will return -2. This is so we reject any - * cases where the TLPs are forwarded up into the root complex. - * In this case, a list of all infringing bridge addresses will be - * populated in acs_list (assuming it's non-null) for printk purposes. + * If either bridge is not on the whitelist this function returns + * PCI_P2PDMA_MAP_NOT_SUPPORTED. + * + * If a bridge which has any ACS redirection bits set is in the path, + * acs_redirects will be set to true. In this case, a list of all infringing + * bridge addresses will be populated in acs_list (assuming it's non-null) + * for printk purposes. */ -static int upstream_bridge_distance(struct pci_dev *provider, - struct pci_dev *client, - struct seq_buf *acs_list) +static enum pci_p2pdma_map_type +upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, + int *dist, bool *acs_redirects, struct seq_buf *acs_list) { struct pci_dev *a = provider, *b = client, *bb; int dist_a = 0; int dist_b = 0; int acs_cnt = 0; + if (acs_redirects) + *acs_redirects = false; + /* * Note, we don't need to take references to devices returned by * pci_upstream_bridge() seeing we hold a reference to a child @@ -369,15 +375,18 @@ static int upstream_bridge_distance(struct pci_dev *provider, dist_a++; } + if (dist) + *dist = dist_a + dist_b; + /* * Allow the connection if both devices are on a whitelisted root * complex, but add an arbitrary large value to the distance. */ if (root_complex_whitelist(provider) && root_complex_whitelist(client)) - return 0x1000 + dist_a + dist_b; + return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; - return -1; + return PCI_P2PDMA_MAP_NOT_SUPPORTED; check_b_path_acs: bb = b; @@ -394,33 +403,44 @@ check_b_path_acs: bb = pci_upstream_bridge(bb); } - if (acs_cnt) - return -2; + if (dist) + *dist = dist_a + dist_b; - return dist_a + dist_b; + if (acs_cnt) { + if (acs_redirects) + *acs_redirects = true; + + return PCI_P2PDMA_MAP_NOT_SUPPORTED; + } + + return PCI_P2PDMA_MAP_BUS_ADDR; } -static int upstream_bridge_distance_warn(struct pci_dev *provider, - struct pci_dev *client) +static enum pci_p2pdma_map_type +upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client, + int *dist) { struct seq_buf acs_list; + bool acs_redirects; int ret; seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); if (!acs_list.buffer) return -ENOMEM; - ret = upstream_bridge_distance(provider, client, &acs_list); - if (ret == -2) { - pci_warn(client, "cannot be used for peer-to-peer DMA as ACS redirect is set between the client and provider (%s)\n", + ret = upstream_bridge_distance(provider, client, dist, &acs_redirects, + &acs_list); + if (acs_redirects) { + pci_warn(client, "ACS redirect is set between the client and provider (%s)\n", pci_name(provider)); /* Drop final semicolon */ acs_list.buffer[acs_list.len-1] = 0; pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n", acs_list.buffer); + } - } else if (ret < 0) { - pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge\n", + if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) { + pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge or whitelisted host bridge\n", pci_name(provider)); } @@ -452,7 +472,8 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, { bool not_supported = false; struct pci_dev *pci_client; - int distance = 0; + int total_dist = 0; + int distance; int i, ret; if (num_clients == 0) @@ -477,26 +498,26 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, if (verbose) ret = upstream_bridge_distance_warn(provider, - pci_client); + pci_client, &distance); else ret = upstream_bridge_distance(provider, pci_client, - NULL); + &distance, NULL, NULL); pci_dev_put(pci_client); - if (ret < 0) + if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) not_supported = true; if (not_supported && !verbose) break; - distance += ret; + total_dist += distance; } if (not_supported) return -1; - return distance; + return total_dist; } EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many); From c6bfaeb573a6caae4b96885b8bad63ee57495c77 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:38 -0600 Subject: [PATCH 0134/2880] PCI/P2PDMA: Factor out __upstream_bridge_distance() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a prep patch to create a second level helper. There are no functional changes. The root complex whitelist code will be moved into this function in a subsequent patch. Link: https://lore.kernel.org/r/20190730163545.4915-5-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-5-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 88 ++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 8f0688201aec..600ba6a7aa11 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -296,47 +296,8 @@ static bool root_complex_whitelist(struct pci_dev *dev) return false; } -/* - * Find the distance through the nearest common upstream bridge between - * two PCI devices. - * - * If the two devices are the same device then 0 will be returned. - * - * If there are two virtual functions of the same device behind the same - * bridge port then 2 will be returned (one step down to the PCIe switch, - * then one step back to the same device). - * - * In the case where two devices are connected to the same PCIe switch, the - * value 4 will be returned. This corresponds to the following PCI tree: - * - * -+ Root Port - * \+ Switch Upstream Port - * +-+ Switch Downstream Port - * + \- Device A - * \-+ Switch Downstream Port - * \- Device B - * - * The distance is 4 because we traverse from Device A through the downstream - * port of the switch, to the common upstream port, back up to the second - * downstream port and then to Device B. - * - * Any two devices that cannot communicate using p2pdma will return - * PCI_P2PDMA_MAP_NOT_SUPPORTED. - * - * Any two devices that have a data path that goes through the host bridge - * will consult a whitelist. If the host bridges are on the whitelist, - * this function will return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE. - * - * If either bridge is not on the whitelist this function returns - * PCI_P2PDMA_MAP_NOT_SUPPORTED. - * - * If a bridge which has any ACS redirection bits set is in the path, - * acs_redirects will be set to true. In this case, a list of all infringing - * bridge addresses will be populated in acs_list (assuming it's non-null) - * for printk purposes. - */ static enum pci_p2pdma_map_type -upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, +__upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, int *dist, bool *acs_redirects, struct seq_buf *acs_list) { struct pci_dev *a = provider, *b = client, *bb; @@ -416,6 +377,53 @@ check_b_path_acs: return PCI_P2PDMA_MAP_BUS_ADDR; } +/* + * Find the distance through the nearest common upstream bridge between + * two PCI devices. + * + * If the two devices are the same device then 0 will be returned. + * + * If there are two virtual functions of the same device behind the same + * bridge port then 2 will be returned (one step down to the PCIe switch, + * then one step back to the same device). + * + * In the case where two devices are connected to the same PCIe switch, the + * value 4 will be returned. This corresponds to the following PCI tree: + * + * -+ Root Port + * \+ Switch Upstream Port + * +-+ Switch Downstream Port + * + \- Device A + * \-+ Switch Downstream Port + * \- Device B + * + * The distance is 4 because we traverse from Device A through the downstream + * port of the switch, to the common upstream port, back up to the second + * downstream port and then to Device B. + * + * Any two devices that cannot communicate using p2pdma will return + * PCI_P2PDMA_MAP_NOT_SUPPORTED. + * + * Any two devices that have a data path that goes through the host bridge + * will consult a whitelist. If the host bridges are on the whitelist, + * this function will return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE. + * + * If either bridge is not on the whitelist this function returns + * PCI_P2PDMA_MAP_NOT_SUPPORTED. + * + * If a bridge which has any ACS redirection bits set is in the path, + * acs_redirects will be set to true. In this case, a list of all infringing + * bridge addresses will be populated in acs_list (assuming it's non-null) + * for printk purposes. + */ +static enum pci_p2pdma_map_type +upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, + int *dist, bool *acs_redirects, struct seq_buf *acs_list) +{ + return __upstream_bridge_distance(provider, client, dist, + acs_redirects, acs_list); +} + static enum pci_p2pdma_map_type upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client, int *dist) From e2cbfbf78968db3e3a71cb989af17f4f6448bf49 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:39 -0600 Subject: [PATCH 0135/2880] PCI/P2PDMA: Apply host bridge whitelist for ACS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a P2PDMA transfer is rejected due to ACS being set, we can also check the whitelist and allow the transactions. Do this by pushing the whitelist check into the upstream_bridge_distance() function. Link: https://lore.kernel.org/r/20190730163545.4915-6-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-6-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 600ba6a7aa11..f7f7e5862bab 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -339,15 +339,7 @@ __upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, if (dist) *dist = dist_a + dist_b; - /* - * Allow the connection if both devices are on a whitelisted root - * complex, but add an arbitrary large value to the distance. - */ - if (root_complex_whitelist(provider) && - root_complex_whitelist(client)) - return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; - - return PCI_P2PDMA_MAP_NOT_SUPPORTED; + return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; check_b_path_acs: bb = b; @@ -371,7 +363,7 @@ check_b_path_acs: if (acs_redirects) *acs_redirects = true; - return PCI_P2PDMA_MAP_NOT_SUPPORTED; + return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; } return PCI_P2PDMA_MAP_BUS_ADDR; @@ -420,8 +412,18 @@ static enum pci_p2pdma_map_type upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, int *dist, bool *acs_redirects, struct seq_buf *acs_list) { - return __upstream_bridge_distance(provider, client, dist, - acs_redirects, acs_list); + enum pci_p2pdma_map_type map_type; + + map_type = __upstream_bridge_distance(provider, client, dist, + acs_redirects, acs_list); + + if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) { + if (!root_complex_whitelist(provider) || + !root_complex_whitelist(client)) + return PCI_P2PDMA_MAP_NOT_SUPPORTED; + } + + return map_type; } static enum pci_p2pdma_map_type From 2c84d818aee976390c055a417045a85c23d39662 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:40 -0600 Subject: [PATCH 0136/2880] PCI/P2PDMA: Factor out host_bridge_whitelist() Push both PCI devices into the whitelist checking function seeing some hardware will require us ensuring they are on the same host bridge. At the same time we rename root_complex_whitelist() to host_bridge_whitelist() to match the terminology used in the code. Link: https://lore.kernel.org/r/20190730163545.4915-7-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-7-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index f7f7e5862bab..4b9f0903b340 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -269,19 +269,11 @@ static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev) seq_buf_printf(buf, "%s;", pci_name(pdev)); } -/* - * If we can't find a common upstream bridge take a look at the root - * complex and compare it to a whitelist of known good hardware. - */ -static bool root_complex_whitelist(struct pci_dev *dev) +static bool __host_bridge_whitelist(struct pci_host_bridge *host) { - struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0)); unsigned short vendor, device; - if (iommu_present(dev->dev.bus)) - return false; - if (!root) return false; @@ -296,6 +288,24 @@ static bool root_complex_whitelist(struct pci_dev *dev) return false; } +/* + * If we can't find a common upstream bridge take a look at the root + * complex and compare it to a whitelist of known good hardware. + */ +static bool host_bridge_whitelist(struct pci_dev *a, struct pci_dev *b) +{ + struct pci_host_bridge *host_a = pci_find_host_bridge(a->bus); + struct pci_host_bridge *host_b = pci_find_host_bridge(b->bus); + + if (iommu_present(a->dev.bus) || iommu_present(b->dev.bus)) + return false; + + if (__host_bridge_whitelist(host_a) && __host_bridge_whitelist(host_b)) + return true; + + return false; +} + static enum pci_p2pdma_map_type __upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, int *dist, bool *acs_redirects, struct seq_buf *acs_list) @@ -418,8 +428,7 @@ upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, acs_redirects, acs_list); if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) { - if (!root_complex_whitelist(provider) || - !root_complex_whitelist(client)) + if (!host_bridge_whitelist(provider, client)) return PCI_P2PDMA_MAP_NOT_SUPPORTED; } From 494d63b0d5d0a481f6ac23bc61e94647ab9c4390 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:41 -0600 Subject: [PATCH 0137/2880] PCI/P2PDMA: Whitelist some Intel host bridges Intel devices do not have good support for P2P requests that span different host bridges as the transactions will cross the QPI/UPI bus and this does not perform well. Therefore, enable support for these devices only if the host bridges match. Add Intel devices that have been tested and are known to work. There are likely many others out there that will need to be tested and added. Link: https://lore.kernel.org/r/20190730163545.4915-8-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-8-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 4b9f0903b340..2c4a8e92ed64 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -269,9 +269,30 @@ static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev) seq_buf_printf(buf, "%s;", pci_name(pdev)); } -static bool __host_bridge_whitelist(struct pci_host_bridge *host) +static const struct pci_p2pdma_whitelist_entry { + unsigned short vendor; + unsigned short device; + enum { + REQ_SAME_HOST_BRIDGE = 1 << 0, + } flags; +} pci_p2pdma_whitelist[] = { + /* AMD ZEN */ + {PCI_VENDOR_ID_AMD, 0x1450, 0}, + + /* Intel Xeon E5/Core i7 */ + {PCI_VENDOR_ID_INTEL, 0x3c00, REQ_SAME_HOST_BRIDGE}, + {PCI_VENDOR_ID_INTEL, 0x3c01, REQ_SAME_HOST_BRIDGE}, + /* Intel Xeon E7 v3/Xeon E5 v3/Core i7 */ + {PCI_VENDOR_ID_INTEL, 0x2f00, REQ_SAME_HOST_BRIDGE}, + {PCI_VENDOR_ID_INTEL, 0x2f01, REQ_SAME_HOST_BRIDGE}, + {} +}; + +static bool __host_bridge_whitelist(struct pci_host_bridge *host, + bool same_host_bridge) { struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0)); + const struct pci_p2pdma_whitelist_entry *entry; unsigned short vendor, device; if (!root) @@ -281,9 +302,14 @@ static bool __host_bridge_whitelist(struct pci_host_bridge *host) device = root->device; pci_dev_put(root); - /* AMD ZEN host bridges can do peer to peer */ - if (vendor == PCI_VENDOR_ID_AMD && device == 0x1450) + for (entry = pci_p2pdma_whitelist; entry->vendor; entry++) { + if (vendor != entry->vendor || device != entry->device) + continue; + if (entry->flags & REQ_SAME_HOST_BRIDGE && !same_host_bridge) + return false; + return true; + } return false; } @@ -300,7 +326,11 @@ static bool host_bridge_whitelist(struct pci_dev *a, struct pci_dev *b) if (iommu_present(a->dev.bus) || iommu_present(b->dev.bus)) return false; - if (__host_bridge_whitelist(host_a) && __host_bridge_whitelist(host_b)) + if (host_a == host_b) + return __host_bridge_whitelist(host_a, true); + + if (__host_bridge_whitelist(host_a, false) && + __host_bridge_whitelist(host_b, false)) return true; return false; From 2b9f4bb2a4fb77da4862f9ddf5209de2bcdaa0c0 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:42 -0600 Subject: [PATCH 0138/2880] PCI/P2PDMA: Add attrs argument to pci_p2pdma_map_sg() This is to match the dma_map_sg() API which this function will have to call in an future patch. Add a pci_p2pdma_map_sg_attrs() function and helper to call it with no attributes just like the dma_map_sg() function. Link: https://lore.kernel.org/r/20190730163545.4915-9-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-9-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- drivers/pci/p2pdma.c | 7 ++++--- include/linux/pci-p2pdma.h | 15 +++++++++++---- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bb970ca82517..7747712054cd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -832,8 +832,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out; if (is_pci_p2pdma_page(sg_page(iod->sg))) - nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents, - rq_dma_dir(req)); + nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg, + iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); else nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 2c4a8e92ed64..94fbacbcbbd0 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -798,13 +798,14 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish); * @sg: scatter list to map * @nents: elements in the scatterlist * @dir: DMA direction + * @attrs: DMA attributes passed to dma_map_sg() (if called) * * Scatterlists mapped with this function should not be unmapped in any way. * * Returns the number of SG entries mapped or 0 on error. */ -int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir) +int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) { struct pci_p2pdma_pagemap *p2p_pgmap; struct scatterlist *s; @@ -831,7 +832,7 @@ int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return nents; } -EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg); +EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); /** * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index bca9bc3e5be7..7fd51954f93a 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -30,8 +30,8 @@ struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev, unsigned int *nents, u32 length); void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl); void pci_p2pmem_publish(struct pci_dev *pdev, bool publish); -int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir); +int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, @@ -81,8 +81,9 @@ static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev, static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) { } -static inline int pci_p2pdma_map_sg(struct device *dev, - struct scatterlist *sg, int nents, enum dma_data_direction dir) +static inline int pci_p2pdma_map_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) { return 0; } @@ -111,4 +112,10 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } +static inline int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + return pci_p2pdma_map_sg_attrs(dev, sg, nents, dir, 0); +} + #endif /* _LINUX_PCI_P2P_H */ From 7f73eac3a7137eabfb0c005c7ba55eb7994b9673 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:43 -0600 Subject: [PATCH 0139/2880] PCI/P2PDMA: Introduce pci_p2pdma_unmap_sg() Add pci_p2pdma_unmap_sg() to the two places that call pci_p2pdma_map_sg(). This is a prep patch to introduce correct mappings for p2pdma transactions that go through the root complex. Link: https://lore.kernel.org/r/20190730163545.4915-10-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-10-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/infiniband/core/rw.c | 6 ++++-- drivers/nvme/host/pci.c | 6 ++++-- drivers/pci/p2pdma.c | 18 +++++++++++++++++- include/linux/pci-p2pdma.h | 13 +++++++++++++ 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index dce06108c8c3..5337393d4dfe 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -583,8 +583,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, break; } - /* P2PDMA contexts do not need to be unmapped */ - if (!is_pci_p2pdma_page(sg_page(sg))) + if (is_pci_p2pdma_page(sg_page(sg))) + pci_p2pdma_unmap_sg(qp->pd->device->dma_device, sg, + sg_cnt, dir); + else ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7747712054cd..2348b15f6bd0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -547,8 +547,10 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) WARN_ON_ONCE(!iod->nents); - /* P2PDMA requests do not need to be unmapped */ - if (!is_pci_p2pdma_page(sg_page(iod->sg))) + if (is_pci_p2pdma_page(sg_page(iod->sg))) + pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents, + rq_dma_dir(req)); + else dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req)); diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 94fbacbcbbd0..1eec7a5ec27e 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -800,7 +800,8 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish); * @dir: DMA direction * @attrs: DMA attributes passed to dma_map_sg() (if called) * - * Scatterlists mapped with this function should not be unmapped in any way. + * Scatterlists mapped with this function should be unmapped using + * pci_p2pdma_unmap_sg_attrs(). * * Returns the number of SG entries mapped or 0 on error. */ @@ -834,6 +835,21 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, } EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); +/** + * pci_p2pdma_unmap_sg - unmap a PCI peer-to-peer scatterlist that was + * mapped with pci_p2pdma_map_sg() + * @dev: device doing the DMA request + * @sg: scatter list to map + * @nents: number of elements returned by pci_p2pdma_map_sg() + * @dir: DMA direction + * @attrs: DMA attributes passed to dma_unmap_sg() (if called) + */ +void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ +} +EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs); + /** * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store * to enable p2pdma diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 7fd51954f93a..8318a97c9c61 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -32,6 +32,8 @@ void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl); void pci_p2pmem_publish(struct pci_dev *pdev, bool publish); int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); +void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, @@ -87,6 +89,11 @@ static inline int pci_p2pdma_map_sg_attrs(struct device *dev, { return 0; } +static inline void pci_p2pdma_unmap_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} static inline int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma) { @@ -118,4 +125,10 @@ static inline int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, return pci_p2pdma_map_sg_attrs(dev, sg, nents, dir, 0); } +static inline void pci_p2pdma_unmap_sg(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir) +{ + pci_p2pdma_unmap_sg_attrs(dev, sg, nents, dir, 0); +} + #endif /* _LINUX_PCI_P2P_H */ From 142c242e6f12196f8064beb09198838611e029db Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:44 -0600 Subject: [PATCH 0140/2880] PCI/P2PDMA: Factor out __pci_p2pdma_map_sg() Factor out the bus-only mapping into its own static function. No functional changes. The original pci_p2pdma_map_sg_attrs() will be used to decide whether this is an appropriate way to map. Link: https://lore.kernel.org/r/20190730163545.4915-11-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-11-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 53 +++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 1eec7a5ec27e..771b45605853 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -792,6 +792,33 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); +static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap, + struct device *dev, struct scatterlist *sg, int nents) +{ + struct scatterlist *s; + phys_addr_t paddr; + int i; + + /* + * p2pdma mappings are not compatible with devices that use + * dma_virt_ops. If the upper layers do the right thing + * this should never happen because it will be prevented + * by the check in pci_p2pdma_distance_many() + */ + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_DMA_VIRT_OPS) && + dev->dma_ops == &dma_virt_ops)) + return 0; + + for_each_sg(sg, s, nents, i) { + paddr = sg_phys(s); + + s->dma_address = paddr - p2p_pgmap->bus_offset; + sg_dma_len(s) = s->length; + } + + return nents; +} + /** * pci_p2pdma_map_sg - map a PCI peer-to-peer scatterlist for DMA * @dev: device doing the DMA request @@ -808,30 +835,10 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish); int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { - struct pci_p2pdma_pagemap *p2p_pgmap; - struct scatterlist *s; - phys_addr_t paddr; - int i; + struct pci_p2pdma_pagemap *p2p_pgmap = + to_p2p_pgmap(sg_page(sg)->pgmap); - /* - * p2pdma mappings are not compatible with devices that use - * dma_virt_ops. If the upper layers do the right thing - * this should never happen because it will be prevented - * by the check in pci_p2pdma_distance_many() - */ - if (WARN_ON_ONCE(IS_ENABLED(CONFIG_DMA_VIRT_OPS) && - dev->dma_ops == &dma_virt_ops)) - return 0; - - for_each_sg(sg, s, nents, i) { - p2p_pgmap = to_p2p_pgmap(sg_page(s)->pgmap); - paddr = sg_phys(s); - - s->dma_address = paddr - p2p_pgmap->bus_offset; - sg_dma_len(s) = s->length; - } - - return nents; + return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents); } EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); From 110203bee05f33637ccb44f046f14edaea94bb4c Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:45 -0600 Subject: [PATCH 0141/2880] PCI/P2PDMA: Store mapping method in an xarray When upstream_bridge_distance() is called, store the method required to map the DMA transfers in an xarray so it can be looked up efficiently on the hot path in pci_p2pdma_map_sg(). Link: https://lore.kernel.org/r/20190730163545.4915-12-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-12-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 771b45605853..db8224ff6e80 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -19,6 +19,7 @@ #include #include #include +#include enum pci_p2pdma_map_type { PCI_P2PDMA_MAP_UNKNOWN = 0, @@ -30,6 +31,7 @@ enum pci_p2pdma_map_type { struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; + struct xarray map_types; }; struct pci_p2pdma_pagemap { @@ -105,6 +107,7 @@ static void pci_p2pdma_release(void *data) gen_pool_destroy(p2pdma->pool); sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); + xa_destroy(&p2pdma->map_types); } static int pci_p2pdma_setup(struct pci_dev *pdev) @@ -116,6 +119,8 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) if (!p2p) return -ENOMEM; + xa_init(&p2p->map_types); + p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); if (!p2p->pool) goto out; @@ -409,6 +414,12 @@ check_b_path_acs: return PCI_P2PDMA_MAP_BUS_ADDR; } +static unsigned long map_types_idx(struct pci_dev *client) +{ + return (pci_domain_nr(client->bus) << 16) | + (client->bus->number << 8) | client->devfn; +} + /* * Find the distance through the nearest common upstream bridge between * two PCI devices. @@ -459,9 +470,13 @@ upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) { if (!host_bridge_whitelist(provider, client)) - return PCI_P2PDMA_MAP_NOT_SUPPORTED; + map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED; } + if (provider->p2pdma) + xa_store(&provider->p2pdma->map_types, map_types_idx(client), + xa_mk_value(map_type), GFP_KERNEL); + return map_type; } From 5d52e1abcd47957f5f4cae26659768655e3ac9f5 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:46 -0600 Subject: [PATCH 0142/2880] PCI/P2PDMA: dma_map() requests that traverse the host bridge Any requests that traverse the host bridge will need to be mapped into the IOMMU, so call dma_map_sg() inside pci_p2pdma_map_sg() when appropriate. Similarly, call dma_unmap_sg() inside pci_p2pdma_unmap_sg(). Link: https://lore.kernel.org/r/20190730163545.4915-13-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-13-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index db8224ff6e80..bca1ffc7075e 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -807,6 +807,16 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); +static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct pci_dev *provider, + struct pci_dev *client) +{ + if (!provider->p2pdma) + return PCI_P2PDMA_MAP_NOT_SUPPORTED; + + return xa_to_value(xa_load(&provider->p2pdma->map_types, + map_types_idx(client))); +} + static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap, struct device *dev, struct scatterlist *sg, int nents) { @@ -852,8 +862,22 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, { struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(sg_page(sg)->pgmap); + struct pci_dev *client; - return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents); + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return 0; + + client = to_pci_dev(dev); + + switch (pci_p2pdma_map_type(p2p_pgmap->provider, client)) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + return dma_map_sg_attrs(dev, sg, nents, dir, attrs); + case PCI_P2PDMA_MAP_BUS_ADDR: + return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents); + default: + WARN_ON_ONCE(1); + return 0; + } } EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); @@ -869,6 +893,20 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { + struct pci_p2pdma_pagemap *p2p_pgmap = + to_p2p_pgmap(sg_page(sg)->pgmap); + enum pci_p2pdma_map_type map_type; + struct pci_dev *client; + + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + client = to_pci_dev(dev); + + map_type = pci_p2pdma_map_type(p2p_pgmap->provider, client); + + if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + dma_unmap_sg_attrs(dev, sg, nents, dir, attrs); } EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs); From 5daf40b039249ef98d49610d14293dca3464b91f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:47 -0600 Subject: [PATCH 0143/2880] PCI/P2PDMA: Allow IOMMU for host bridge whitelist Now that we map the requests correctly we can remove the iommu_present() restriction. Link: https://lore.kernel.org/r/20190730163545.4915-14-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-14-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index bca1ffc7075e..d8c824097d26 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -18,7 +18,6 @@ #include #include #include -#include #include enum pci_p2pdma_map_type { @@ -328,9 +327,6 @@ static bool host_bridge_whitelist(struct pci_dev *a, struct pci_dev *b) struct pci_host_bridge *host_a = pci_find_host_bridge(a->bus); struct pci_host_bridge *host_b = pci_find_host_bridge(b->bus); - if (iommu_present(a->dev.bus) || iommu_present(b->dev.bus)) - return false; - if (host_a == host_b) return __host_bridge_whitelist(host_a, true); From 27235b15e4197b3a1014cf0ca0b08dbbf61a692b Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:48 -0600 Subject: [PATCH 0144/2880] PCI/P2PDMA: Update pci_p2pdma_distance_many() documentation The comment describing pci_p2pdma_distance_many() still referred to the devices being behind the same root port. This no longer applies so reword the documentation. Link: https://lore.kernel.org/r/20190730163545.4915-15-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-15-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index d8c824097d26..0608aae72ccc 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -517,15 +517,14 @@ upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client, * @num_clients: number of clients in the array * @verbose: if true, print warnings for devices when we return -1 * - * Returns -1 if any of the clients are not compatible (behind the same - * root port as the provider), otherwise returns a positive number where - * a lower number is the preferable choice. (If there's one client - * that's the same as the provider it will return 0, which is best choice). + * Returns -1 if any of the clients are not compatible, otherwise returns a + * positive number where a lower number is the preferable choice. (If there's + * one client that's the same as the provider it will return 0, which is best + * choice). * - * For now, "compatible" means the provider and the clients are all behind - * the same PCI root port. This cuts out cases that may work but is safest - * for the user. Future work can expand this to white-list root complexes that - * can safely forward between each ports. + * "compatible" means the provider and the clients are either all behind + * the same PCI root port or the host bridges connected to each of the devices + * are listed in the 'pci_p2pdma_whitelist'. */ int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, int num_clients, bool verbose) From 10fa8acf0fa6ed79ddb662488addb3ba71e9db60 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 30 Jul 2019 19:06:38 -0400 Subject: [PATCH 0145/2880] nfsd: Remove unnecessary NULL checks "cb" is never actually NULL in these functions. On a quick skim of the history, they seem to have been there from the beginning. I'm not sure if they originally served a purpose. Reported-by: Jia-Ju Bai Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 397eb7820929..524111420b48 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -512,11 +512,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb != NULL) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } @@ -604,11 +602,10 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ @@ -663,11 +660,10 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); } @@ -759,11 +755,10 @@ static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status); } /* From fdf4f2fb8e8990c131b2b1a5a9c03681bb16e87a Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 22 Jul 2019 18:03:02 -0700 Subject: [PATCH 0146/2880] drivers: thermal: processor_thermal_device: Export sysfs interface for TCC offset This change exports an interface to read tcc offset and allow writing if the platform is not locked. Refer to Intel SDM for details on the MSR: MSR_TEMPERATURE_TARGET. Here TCC Activation Offset (R/W) bits allow temperature offset in degrees in relation to TjMAX. This change will be useful for improving performance from user space for some platforms, if the current offset is not optimal. Signed-off-by: Srinivas Pandruvada Tested-by: Benjamin Berg Signed-off-by: Zhang Rui --- .../processor_thermal_device.c | 91 ++++++++++++++++++- 1 file changed, 87 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c index d3446acf9bbd..97333fc4be42 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c @@ -137,6 +137,72 @@ static const struct attribute_group power_limit_attribute_group = { .name = "power_limits" }; +static ssize_t tcc_offset_degree_celsius_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val; + int err; + + err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val); + if (err) + return err; + + val = (val >> 24) & 0xff; + return sprintf(buf, "%d\n", (int)val); +} + +static int tcc_offset_update(int tcc) +{ + u64 val; + int err; + + if (!tcc) + return -EINVAL; + + err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val); + if (err) + return err; + + val &= ~GENMASK_ULL(31, 24); + val |= (tcc & 0xff) << 24; + + err = wrmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, val); + if (err) + return err; + + return 0; +} + +static int tcc_offset_save; + +static ssize_t tcc_offset_degree_celsius_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + u64 val; + int tcc, err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, &val); + if (err) + return err; + + if (!(val & BIT(30))) + return -EACCES; + + if (kstrtoint(buf, 0, &tcc)) + return -EINVAL; + + err = tcc_offset_update(tcc); + if (err) + return err; + + tcc_offset_save = tcc; + + return count; +} + +static DEVICE_ATTR_RW(tcc_offset_degree_celsius); + static int stored_tjmax; /* since it is fixed, we can have local storage */ static int get_tjmax(void) @@ -332,6 +398,7 @@ static void proc_thermal_remove(struct proc_thermal_device *proc_priv) acpi_remove_notify_handler(proc_priv->adev->handle, ACPI_DEVICE_NOTIFY, proc_thermal_notify); int340x_thermal_zone_remove(proc_priv->int340x_zone); + sysfs_remove_file(&proc_priv->dev->kobj, &dev_attr_tcc_offset_degree_celsius.attr); sysfs_remove_group(&proc_priv->dev->kobj, &power_limit_attribute_group); } @@ -355,8 +422,15 @@ static int int3401_add(struct platform_device *pdev) dev_info(&pdev->dev, "Creating sysfs group for PROC_THERMAL_PLATFORM_DEV\n"); - return sysfs_create_group(&pdev->dev.kobj, - &power_limit_attribute_group); + ret = sysfs_create_file(&pdev->dev.kobj, &dev_attr_tcc_offset_degree_celsius.attr); + if (ret) + return ret; + + ret = sysfs_create_group(&pdev->dev.kobj, &power_limit_attribute_group); + if (ret) + sysfs_remove_file(&pdev->dev.kobj, &dev_attr_tcc_offset_degree_celsius.attr); + + return ret; } static int int3401_remove(struct platform_device *pdev) @@ -588,8 +662,15 @@ static int proc_thermal_pci_probe(struct pci_dev *pdev, dev_info(&pdev->dev, "Creating sysfs group for PROC_THERMAL_PCI\n"); - return sysfs_create_group(&pdev->dev.kobj, - &power_limit_attribute_group); + ret = sysfs_create_file(&pdev->dev.kobj, &dev_attr_tcc_offset_degree_celsius.attr); + if (ret) + return ret; + + ret = sysfs_create_group(&pdev->dev.kobj, &power_limit_attribute_group); + if (ret) + sysfs_remove_file(&pdev->dev.kobj, &dev_attr_tcc_offset_degree_celsius.attr); + + return ret; } static void proc_thermal_pci_remove(struct pci_dev *pdev) @@ -615,6 +696,8 @@ static int proc_thermal_resume(struct device *dev) proc_dev = dev_get_drvdata(dev); proc_thermal_read_ppcc(proc_dev); + tcc_offset_update(tcc_offset_save); + return 0; } #else From 4ff96fb52c6964ad42e0a878be8f86a2e8052ddd Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Fri, 19 Jul 2019 14:28:39 +0200 Subject: [PATCH 0147/2880] livepatch: Nullify obj->mod in klp_module_coming()'s error path klp_module_coming() is called for every module appearing in the system. It sets obj->mod to a patched module for klp_object obj. Unfortunately it leaves it set even if an error happens later in the function and the patched module is not allowed to be loaded. klp_is_object_loaded() uses obj->mod variable and could currently give a wrong return value. The bug is probably harmless as of now. Signed-off-by: Miroslav Benes Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c4ce08f43bd6..ab4a4606d19b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1175,6 +1175,7 @@ err: pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", patch->mod->name, obj->mod->name, obj->mod->name); mod->klp_alive = false; + obj->mod = NULL; klp_cleanup_module_patches_limited(mod, patch); mutex_unlock(&klp_mutex); From d6dfe43ec6062beea5ba1172b957e74a13c95b86 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 16 Aug 2019 17:48:36 -0400 Subject: [PATCH 0148/2880] svcrdma: Remove svc_rdma_wq Clean up: the system workqueue will work just as well. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma.c | 7 ------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 ++- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 981f0d726ad4..edb39900fe04 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -200,7 +200,6 @@ extern struct svc_xprt_class svc_rdma_bc_class; #endif /* svc_rdma.c */ -extern struct workqueue_struct *svc_rdma_wq; extern int svc_rdma_init(void); extern void svc_rdma_cleanup(void); diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index abdb3004a1e3..97bca509a391 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -73,8 +73,6 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -struct workqueue_struct *svc_rdma_wq; - /* * This function implements reading and resetting an atomic_t stat * variable through read/write to a proc file. Any write to the file @@ -230,7 +228,6 @@ static struct ctl_table svcrdma_root_table[] = { void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); - destroy_workqueue(svc_rdma_wq); if (svcrdma_table_header) { unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; @@ -246,10 +243,6 @@ int svc_rdma_init(void) dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); - svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); - if (!svc_rdma_wq) - return -ENOMEM; - if (!svcrdma_table_header) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 3fe665152d95..18d6eb3686e7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -630,8 +630,9 @@ static void svc_rdma_free(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); - queue_work(svc_rdma_wq, &rdma->sc_work); + schedule_work(&rdma->sc_work); } static int svc_rdma_has_wspace(struct svc_xprt *xprt) From 4866073e6ddf03066c925d3237903d7f4ca68982 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 16 Aug 2019 17:49:38 -0400 Subject: [PATCH 0149/2880] svcrdma: Use llist for managing cache of recv_ctxts Use a wait-free mechanism for managing the svc_rdma_recv_ctxts free list. Subsequently, sc_recv_lock can be eliminated. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 5 +++-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 24 ++++++++++-------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 +-- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index edb39900fe04..40f65888dd38 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -42,6 +42,7 @@ #ifndef SVC_RDMA_H #define SVC_RDMA_H +#include #include #include #include @@ -107,8 +108,7 @@ struct svcxprt_rdma { struct list_head sc_read_complete_q; struct work_struct sc_work; - spinlock_t sc_recv_lock; - struct list_head sc_recv_ctxts; + struct llist_head sc_recv_ctxts; }; /* sc_flags */ #define RDMAXPRT_CONN_PENDING 3 @@ -125,6 +125,7 @@ enum { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD struct svc_rdma_recv_ctxt { + struct llist_node rc_node; struct list_head rc_list; struct ib_recv_wr rc_recv_wr; struct ib_cqe rc_cqe; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 65e2fb9aac65..96bccd398469 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -172,9 +172,10 @@ static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma, void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) { - list_del(&ctxt->rc_list); + while ((node = llist_del_first(&rdma->sc_recv_ctxts))) { + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); svc_rdma_recv_ctxt_destroy(rdma, ctxt); } } @@ -183,21 +184,18 @@ static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + struct llist_node *node; - spin_lock(&rdma->sc_recv_lock); - ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts); - if (!ctxt) + node = llist_del_first(&rdma->sc_recv_ctxts); + if (!node) goto out_empty; - list_del(&ctxt->rc_list); - spin_unlock(&rdma->sc_recv_lock); + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); out: ctxt->rc_page_count = 0; return ctxt; out_empty: - spin_unlock(&rdma->sc_recv_lock); - ctxt = svc_rdma_recv_ctxt_alloc(rdma); if (!ctxt) return NULL; @@ -218,11 +216,9 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, for (i = 0; i < ctxt->rc_page_count; i++) put_page(ctxt->rc_pages[i]); - if (!ctxt->rc_temp) { - spin_lock(&rdma->sc_recv_lock); - list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); - spin_unlock(&rdma->sc_recv_lock); - } else + if (!ctxt->rc_temp) + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); + else svc_rdma_recv_ctxt_destroy(rdma, ctxt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 18d6eb3686e7..4182d569b5cf 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -140,14 +140,13 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts); + init_llist_head(&cma_xprt->sc_recv_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_send_lock); - spin_lock_init(&cma_xprt->sc_recv_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); /* From f69d6d8eef7807f8d937b81da24bebd2e926e4d2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:44 -0400 Subject: [PATCH 0150/2880] sunrpc: add a new cache_detail operation for when a cache is flushed When the exports table is changed, exportfs will usually write a new time to the "flush" file in the nfsd.export cache procfile. This tells the kernel to flush any entries that are older than that value. This gives us a mechanism to tell whether an unexport might have occurred. Add a new ->flush cache_detail operation that is called after flushing the cache whenever someone writes to a "flush" file. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/cache.h | 1 + net/sunrpc/cache.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index f7d086b77a21..f8603724fbee 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -87,6 +87,7 @@ struct cache_detail { int has_died); struct cache_head * (*alloc)(void); + void (*flush)(void); int (*match)(struct cache_head *orig, struct cache_head *new); void (*init)(struct cache_head *orig, struct cache_head *new); void (*update)(struct cache_head *orig, struct cache_head *new); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index a6a6190ad37a..a349094f6fb7 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1524,6 +1524,9 @@ static ssize_t write_flush(struct file *file, const char __user *buf, cd->nextcheck = now; cache_flush(); + if (cd->flush) + cd->flush(); + *ppos += count; return count; } From 18f6622ebbdea56a83f8e553c159ce2d62d3ad0c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:45 -0400 Subject: [PATCH 0151/2880] locks: create a new notifier chain for lease attempts With the new file caching infrastructure in nfsd, we can end up holding files open for an indefinite period of time, even when they are still idle. This may prevent the kernel from handing out leases on the file, which is something we don't want to block. Fix this by running a SRCU notifier call chain whenever on any lease attempt. nfsd can then purge the cache for that inode before returning. Since SRCU is only conditionally compiled in, we must only define the new chain if it's enabled, and users of the chain must ensure that SRCU is enabled. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/locks.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 5 ++++ 2 files changed, 66 insertions(+) diff --git a/fs/locks.c b/fs/locks.c index 686eae21daf6..1913481bfbf7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1990,6 +1990,64 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(generic_setlease); +#if IS_ENABLED(CONFIG_SRCU) +/* + * Kernel subsystems can register to be notified on any attempt to set + * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd + * to close files that it may have cached when there is an attempt to set a + * conflicting lease. + */ +static struct srcu_notifier_head lease_notifier_chain; + +static inline void +lease_notifier_chain_init(void) +{ + srcu_init_notifier_head(&lease_notifier_chain); +} + +static inline void +setlease_notifier(long arg, struct file_lock *lease) +{ + if (arg != F_UNLCK) + srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); +} + +int lease_register_notifier(struct notifier_block *nb) +{ + return srcu_notifier_chain_register(&lease_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(lease_register_notifier); + +void lease_unregister_notifier(struct notifier_block *nb) +{ + srcu_notifier_chain_unregister(&lease_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +#else /* !IS_ENABLED(CONFIG_SRCU) */ +static inline void +lease_notifier_chain_init(void) +{ +} + +static inline void +setlease_notifier(long arg, struct file_lock *lease) +{ +} + +int lease_register_notifier(struct notifier_block *nb) +{ + return 0; +} +EXPORT_SYMBOL_GPL(lease_register_notifier); + +void lease_unregister_notifier(struct notifier_block *nb) +{ +} +EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +#endif /* IS_ENABLED(CONFIG_SRCU) */ + /** * vfs_setlease - sets a lease on an open file * @filp: file pointer @@ -2010,6 +2068,8 @@ EXPORT_SYMBOL(generic_setlease); int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { + if (lease) + setlease_notifier(arg, *lease); if (filp->f_op->setlease) return filp->f_op->setlease(filp, arg, lease, priv); else @@ -2923,6 +2983,7 @@ static int __init filelock_init(void) INIT_HLIST_HEAD(&fll->hlist); } + lease_notifier_chain_init(); return 0; } core_initcall(filelock_init); diff --git a/include/linux/fs.h b/include/linux/fs.h index 56b8e358af5c..0f106c7f4bb9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1155,6 +1155,11 @@ extern void lease_get_mtime(struct inode *, struct timespec64 *time); extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); extern int vfs_setlease(struct file *, long, struct file_lock **, void **); extern int lease_modify(struct file_lock *, int, struct list_head *); + +struct notifier_block; +extern int lease_register_notifier(struct notifier_block *); +extern void lease_unregister_notifier(struct notifier_block *); + struct files_struct; extern void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); From b72679ee89a0a0ecd26f7b6fcae96cdaababff94 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 Aug 2019 14:18:46 -0400 Subject: [PATCH 0152/2880] notify: export symbols for use by the knfsd file cache The knfsd file cache will need to detect when files are unlinked, so that it can close the associated cached files. Export a minimal set of notifier functions to allow it to do so. Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/notify/fsnotify.h | 2 -- fs/notify/group.c | 2 ++ fs/notify/mark.c | 6 ++++++ include/linux/fsnotify_backend.h | 2 ++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 5a00121fb219..f3462828a0e2 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -54,8 +54,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(&sb->s_fsnotify_marks); } -/* Wait until all marks queued for destruction are destroyed */ -extern void fsnotify_wait_marks_destroyed(void); /* * update the dentry->d_flags of all of inode's children to indicate if inode cares diff --git a/fs/notify/group.c b/fs/notify/group.c index 0391190305cc..133f723aca07 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -108,6 +108,7 @@ void fsnotify_put_group(struct fsnotify_group *group) if (refcount_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } +EXPORT_SYMBOL_GPL(fsnotify_put_group); /* * Create a new fsnotify_group and hold a reference for the group returned. @@ -137,6 +138,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } +EXPORT_SYMBOL_GPL(fsnotify_alloc_group); int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 99ddd126f6f0..1d96216dffd1 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -276,6 +276,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) queue_delayed_work(system_unbound_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } +EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object @@ -430,6 +431,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); fsnotify_free_mark(mark); } +EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. @@ -685,6 +687,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, mutex_unlock(&group->mark_mutex); return ret; } +EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found @@ -711,6 +714,7 @@ struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, spin_unlock(&conn->lock); return NULL; } +EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, @@ -809,6 +813,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, mark->group = group; WRITE_ONCE(mark->connector, NULL); } +EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before @@ -837,3 +842,4 @@ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } +EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 2de3b2ddd19a..1915bdba2fad 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -475,6 +475,8 @@ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); +/* Wait until all marks queued for destruction are destroyed */ +extern void fsnotify_wait_marks_destroyed(void); /* run all the marks in a group, and clear all of the marks attached to given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type); /* run all the marks in a group, and clear all of the vfsmount marks */ From 7239a40ca8bfd88dc5d2f66a14882054fe8e3b92 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 Aug 2019 14:18:47 -0400 Subject: [PATCH 0153/2880] vfs: Export flush_delayed_fput for use by knfsd. Allow knfsd to flush the delayed fput list so that it can ensure the cached struct file is closed before it is unlinked. Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/file_table.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/file_table.c b/fs/file_table.c index b07b53f24ff5..30d55c9a1744 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -327,6 +327,7 @@ void flush_delayed_fput(void) { delayed_fput(NULL); } +EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); From 65294c1f2c5e72b15b76e16c8c8cfd9359fc9f6f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:48 -0400 Subject: [PATCH 0154/2880] nfsd: add a new struct file caching facility to nfsd Currently, NFSv2/3 reads and writes have to open a file, do the read or write and then close it again for each RPC. This is highly inefficient, especially when the underlying filesystem has a relatively slow open routine. This patch adds a new open file cache to knfsd. Rather than doing an open for each RPC, the read/write handlers can call into this cache to see if there is one already there for the correct filehandle and NFS_MAY_READ/WRITE flags. If there isn't an entry, then we create a new one and attempt to perform the open. If there is, then we wait until the entry is fully instantiated and return it if it is at the end of the wait. If it's not, then we attempt to take over construction. Since the main goal is to speed up NFSv2/3 I/O, we don't want to close these files on last put of these objects. We need to keep them around for a little while since we never know when the next READ/WRITE will come in. Cache entries have a hardcoded 1s timeout, and we have a recurring workqueue job that walks the cache and purges any entries that have expired. Signed-off-by: Jeff Layton Signed-off-by: Weston Andros Adamson Signed-off-by: Richard Sharpe Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/Kconfig | 1 + fs/nfsd/Makefile | 3 +- fs/nfsd/export.c | 13 + fs/nfsd/filecache.c | 885 ++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/filecache.h | 60 +++ fs/nfsd/nfssvc.c | 9 +- fs/nfsd/trace.h | 140 +++++++ fs/nfsd/vfs.c | 65 ++-- fs/nfsd/vfs.h | 3 + 9 files changed, 1155 insertions(+), 24 deletions(-) create mode 100644 fs/nfsd/filecache.c create mode 100644 fs/nfsd/filecache.h diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index d25f6bbe7006..bff8456220e0 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -3,6 +3,7 @@ config NFSD tristate "NFS server support" depends on INET depends on FILE_LOCKING + depends on FSNOTIFY select LOCKD select SUNRPC select EXPORTFS diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 2bfb58eefad1..6a40b1afe703 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -11,7 +11,8 @@ obj-$(CONFIG_NFSD) += nfsd.o nfsd-y += trace.o nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ - export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o + export.o auth.o lockd.o nfscache.o nfsxdr.o \ + stats.o filecache.o nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index baa01956a5b3..052fac64b578 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -22,6 +22,7 @@ #include "nfsfh.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -232,6 +233,17 @@ static struct cache_head *expkey_alloc(void) return NULL; } +static void expkey_flush(void) +{ + /* + * Take the nfsd_mutex here to ensure that the file cache is not + * destroyed while we're in the middle of flushing. + */ + mutex_lock(&nfsd_mutex); + nfsd_file_cache_purge(); + mutex_unlock(&nfsd_mutex); +} + static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, @@ -244,6 +256,7 @@ static const struct cache_detail svc_expkey_cache_template = { .init = expkey_init, .update = expkey_update, .alloc = expkey_alloc, + .flush = expkey_flush, }; static int diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c new file mode 100644 index 000000000000..a2fcb251d2f6 --- /dev/null +++ b/fs/nfsd/filecache.c @@ -0,0 +1,885 @@ +/* + * Open file cache. + * + * (c) 2015 - Jeff Layton + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vfs.h" +#include "nfsd.h" +#include "nfsfh.h" +#include "filecache.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_FH + +/* FIXME: dynamically size this for the machine somehow? */ +#define NFSD_FILE_HASH_BITS 12 +#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS) +#define NFSD_LAUNDRETTE_DELAY (2 * HZ) + +#define NFSD_FILE_LRU_RESCAN (0) +#define NFSD_FILE_SHUTDOWN (1) +#define NFSD_FILE_LRU_THRESHOLD (4096UL) +#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2) + +/* We only care about NFSD_MAY_READ/WRITE for this cache */ +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) + +struct nfsd_fcache_bucket { + struct hlist_head nfb_head; + spinlock_t nfb_lock; + unsigned int nfb_count; + unsigned int nfb_maxcount; +}; + +static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); + +static struct kmem_cache *nfsd_file_slab; +static struct kmem_cache *nfsd_file_mark_slab; +static struct nfsd_fcache_bucket *nfsd_file_hashtbl; +static struct list_lru nfsd_file_lru; +static long nfsd_file_lru_flags; +static struct fsnotify_group *nfsd_file_fsnotify_group; +static atomic_long_t nfsd_filecache_count; +static struct delayed_work nfsd_filecache_laundrette; + +enum nfsd_file_laundrette_ctl { + NFSD_FILE_LAUNDRETTE_NOFLUSH = 0, + NFSD_FILE_LAUNDRETTE_MAY_FLUSH +}; + +static void +nfsd_file_schedule_laundrette(enum nfsd_file_laundrette_ctl ctl) +{ + long count = atomic_long_read(&nfsd_filecache_count); + + if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags)) + return; + + /* Be more aggressive about scanning if over the threshold */ + if (count > NFSD_FILE_LRU_THRESHOLD) + mod_delayed_work(system_wq, &nfsd_filecache_laundrette, 0); + else + schedule_delayed_work(&nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); + + if (ctl == NFSD_FILE_LAUNDRETTE_NOFLUSH) + return; + + /* ...and don't delay flushing if we're out of control */ + if (count >= NFSD_FILE_LRU_LIMIT) + flush_delayed_work(&nfsd_filecache_laundrette); +} + +static void +nfsd_file_slab_free(struct rcu_head *rcu) +{ + struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu); + + put_cred(nf->nf_cred); + kmem_cache_free(nfsd_file_slab, nf); +} + +static void +nfsd_file_mark_free(struct fsnotify_mark *mark) +{ + struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark, + nfm_mark); + + kmem_cache_free(nfsd_file_mark_slab, nfm); +} + +static struct nfsd_file_mark * +nfsd_file_mark_get(struct nfsd_file_mark *nfm) +{ + if (!atomic_inc_not_zero(&nfm->nfm_ref)) + return NULL; + return nfm; +} + +static void +nfsd_file_mark_put(struct nfsd_file_mark *nfm) +{ + if (atomic_dec_and_test(&nfm->nfm_ref)) { + + fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group); + fsnotify_put_mark(&nfm->nfm_mark); + } +} + +static struct nfsd_file_mark * +nfsd_file_mark_find_or_create(struct nfsd_file *nf) +{ + int err; + struct fsnotify_mark *mark; + struct nfsd_file_mark *nfm = NULL, *new; + struct inode *inode = nf->nf_inode; + + do { + mutex_lock(&nfsd_file_fsnotify_group->mark_mutex); + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, + nfsd_file_fsnotify_group); + if (mark) { + nfm = nfsd_file_mark_get(container_of(mark, + struct nfsd_file_mark, + nfm_mark)); + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + fsnotify_put_mark(mark); + if (likely(nfm)) + break; + } else + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + + /* allocate a new nfm */ + new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL); + if (!new) + return NULL; + fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group); + new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF; + atomic_set(&new->nfm_ref, 1); + + err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0); + + /* + * If the add was successful, then return the object. + * Otherwise, we need to put the reference we hold on the + * nfm_mark. The fsnotify code will take a reference and put + * it on failure, so we can't just free it directly. It's also + * not safe to call fsnotify_destroy_mark on it as the + * mark->group will be NULL. Thus, we can't let the nfm_ref + * counter drive the destruction at this point. + */ + if (likely(!err)) + nfm = new; + else + fsnotify_put_mark(&new->nfm_mark); + } while (unlikely(err == -EEXIST)); + + return nfm; +} + +static struct nfsd_file * +nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval) +{ + struct nfsd_file *nf; + + nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); + if (nf) { + INIT_HLIST_NODE(&nf->nf_node); + INIT_LIST_HEAD(&nf->nf_lru); + nf->nf_file = NULL; + nf->nf_cred = get_current_cred(); + nf->nf_flags = 0; + nf->nf_inode = inode; + nf->nf_hashval = hashval; + atomic_set(&nf->nf_ref, 1); + nf->nf_may = may & NFSD_FILE_MAY_MASK; + if (may & NFSD_MAY_NOT_BREAK_LEASE) { + if (may & NFSD_MAY_WRITE) + __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags); + if (may & NFSD_MAY_READ) + __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + } + nf->nf_mark = NULL; + trace_nfsd_file_alloc(nf); + } + return nf; +} + +static bool +nfsd_file_free(struct nfsd_file *nf) +{ + bool flush = false; + + trace_nfsd_file_put_final(nf); + if (nf->nf_mark) + nfsd_file_mark_put(nf->nf_mark); + if (nf->nf_file) { + get_file(nf->nf_file); + filp_close(nf->nf_file, NULL); + fput(nf->nf_file); + flush = true; + } + call_rcu(&nf->nf_rcu, nfsd_file_slab_free); + return flush; +} + +static void +nfsd_file_do_unhash(struct nfsd_file *nf) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash(nf); + + --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; + hlist_del_rcu(&nf->nf_node); + if (!list_empty(&nf->nf_lru)) + list_lru_del(&nfsd_file_lru, &nf->nf_lru); + atomic_long_dec(&nfsd_filecache_count); +} + +static bool +nfsd_file_unhash(struct nfsd_file *nf) +{ + if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_do_unhash(nf); + return true; + } + return false; +} + +/* + * Return true if the file was unhashed. + */ +static bool +nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash_and_release_locked(nf); + if (!nfsd_file_unhash(nf)) + return false; + /* keep final reference for nfsd_file_lru_dispose */ + if (atomic_add_unless(&nf->nf_ref, -1, 1)) + return true; + + list_add(&nf->nf_lru, dispose); + return true; +} + +static int +nfsd_file_put_noref(struct nfsd_file *nf) +{ + int count; + trace_nfsd_file_put(nf); + + count = atomic_dec_return(&nf->nf_ref); + if (!count) { + WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags)); + nfsd_file_free(nf); + } + return count; +} + +void +nfsd_file_put(struct nfsd_file *nf) +{ + bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; + + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + if (nfsd_file_put_noref(nf) == 1 && is_hashed) + nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH); +} + +struct nfsd_file * +nfsd_file_get(struct nfsd_file *nf) +{ + if (likely(atomic_inc_not_zero(&nf->nf_ref))) + return nf; + return NULL; +} + +static void +nfsd_file_dispose_list(struct list_head *dispose) +{ + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + nfsd_file_put_noref(nf); + } +} + +static void +nfsd_file_dispose_list_sync(struct list_head *dispose) +{ + bool flush = false; + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + if (!atomic_dec_and_test(&nf->nf_ref)) + continue; + if (nfsd_file_free(nf)) + flush = true; + } + if (flush) + flush_delayed_fput(); +} + +/* + * Note this can deadlock with nfsd_file_cache_purge. + */ +static enum lru_status +nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, + spinlock_t *lock, void *arg) + __releases(lock) + __acquires(lock) +{ + struct list_head *head = arg; + struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); + + /* + * Do a lockless refcount check. The hashtable holds one reference, so + * we look to see if anything else has a reference, or if any have + * been put since the shrinker last ran. Those don't get unhashed and + * released. + * + * Note that in the put path, we set the flag and then decrement the + * counter. Here we check the counter and then test and clear the flag. + * That order is deliberate to ensure that we can do this locklessly. + */ + if (atomic_read(&nf->nf_ref) > 1) + goto out_skip; + if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) + goto out_rescan; + + if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) + goto out_skip; + + list_lru_isolate_move(lru, &nf->nf_lru, head); + return LRU_REMOVED; +out_rescan: + set_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags); +out_skip: + return LRU_SKIP; +} + +static void +nfsd_file_lru_dispose(struct list_head *head) +{ + while(!list_empty(head)) { + struct nfsd_file *nf = list_first_entry(head, + struct nfsd_file, nf_lru); + list_del_init(&nf->nf_lru); + spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_do_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_put_noref(nf); + } +} + +static unsigned long +nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc) +{ + return list_lru_count(&nfsd_file_lru); +} + +static unsigned long +nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) +{ + LIST_HEAD(head); + unsigned long ret; + + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &head); + nfsd_file_lru_dispose(&head); + return ret; +} + +static struct shrinker nfsd_file_shrinker = { + .scan_objects = nfsd_file_lru_scan, + .count_objects = nfsd_file_lru_count, + .seeks = 1, +}; + +static void +__nfsd_file_close_inode(struct inode *inode, unsigned int hashval, + struct list_head *dispose) +{ + struct nfsd_file *nf; + struct hlist_node *tmp; + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) { + if (inode == nf->nf_inode) + nfsd_file_unhash_and_release_locked(nf, dispose); + } + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. Also ensure that any of the + * fputs also have their final __fput done as well. + */ +void +nfsd_file_close_inode_sync(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list_sync(&dispose); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. + */ +static void +nfsd_file_close_inode(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list(&dispose); +} + +/** + * nfsd_file_delayed_close - close unused nfsd_files + * @work: dummy + * + * Walk the LRU list and close any entries that have not been used since + * the last scan. + * + * Note this can deadlock with nfsd_file_cache_purge. + */ +static void +nfsd_file_delayed_close(struct work_struct *work) +{ + LIST_HEAD(head); + + list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX); + + if (test_and_clear_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags)) + nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_NOFLUSH); + + if (!list_empty(&head)) { + nfsd_file_lru_dispose(&head); + flush_delayed_fput(); + } +} + +static int +nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, + void *data) +{ + struct file_lock *fl = data; + + /* Only close files for F_SETLEASE leases */ + if (fl->fl_flags & FL_LEASE) + nfsd_file_close_inode_sync(file_inode(fl->fl_file)); + return 0; +} + +static struct notifier_block nfsd_file_lease_notifier = { + .notifier_call = nfsd_file_lease_notifier_call, +}; + +static int +nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + u32 mask, const void *data, int data_type, + const struct qstr *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) +{ + trace_nfsd_file_fsnotify_handle_event(inode, mask); + + /* Should be no marks on non-regular files */ + if (!S_ISREG(inode->i_mode)) { + WARN_ON_ONCE(1); + return 0; + } + + /* don't close files if this was not the last link */ + if (mask & FS_ATTRIB) { + if (inode->i_nlink) + return 0; + } + + nfsd_file_close_inode(inode); + return 0; +} + + +static const struct fsnotify_ops nfsd_file_fsnotify_ops = { + .handle_event = nfsd_file_fsnotify_handle_event, + .free_mark = nfsd_file_mark_free, +}; + +int +nfsd_file_cache_init(void) +{ + int ret = -ENOMEM; + unsigned int i; + + clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + if (nfsd_file_hashtbl) + return 0; + + nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE, + sizeof(*nfsd_file_hashtbl), GFP_KERNEL); + if (!nfsd_file_hashtbl) { + pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n"); + goto out_err; + } + + nfsd_file_slab = kmem_cache_create("nfsd_file", + sizeof(struct nfsd_file), 0, 0, NULL); + if (!nfsd_file_slab) { + pr_err("nfsd: unable to create nfsd_file_slab\n"); + goto out_err; + } + + nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark", + sizeof(struct nfsd_file_mark), 0, 0, NULL); + if (!nfsd_file_mark_slab) { + pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); + goto out_err; + } + + + ret = list_lru_init(&nfsd_file_lru); + if (ret) { + pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret); + goto out_err; + } + + ret = register_shrinker(&nfsd_file_shrinker); + if (ret) { + pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); + goto out_lru; + } + + ret = lease_register_notifier(&nfsd_file_lease_notifier); + if (ret) { + pr_err("nfsd: unable to register lease notifier: %d\n", ret); + goto out_shrinker; + } + + nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops); + if (IS_ERR(nfsd_file_fsnotify_group)) { + pr_err("nfsd: unable to create fsnotify group: %ld\n", + PTR_ERR(nfsd_file_fsnotify_group)); + nfsd_file_fsnotify_group = NULL; + goto out_notifier; + } + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head); + spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock); + } + + INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_delayed_close); +out: + return ret; +out_notifier: + lease_unregister_notifier(&nfsd_file_lease_notifier); +out_shrinker: + unregister_shrinker(&nfsd_file_shrinker); +out_lru: + list_lru_destroy(&nfsd_file_lru); +out_err: + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; + goto out; +} + +/* + * Note this can deadlock with nfsd_file_lru_cb. + */ +void +nfsd_file_cache_purge(void) +{ + unsigned int i; + struct nfsd_file *nf; + LIST_HEAD(dispose); + bool del; + + if (!nfsd_file_hashtbl) + return; + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + spin_lock(&nfsd_file_hashtbl[i].nfb_lock); + while(!hlist_empty(&nfsd_file_hashtbl[i].nfb_head)) { + nf = hlist_entry(nfsd_file_hashtbl[i].nfb_head.first, + struct nfsd_file, nf_node); + del = nfsd_file_unhash_and_release_locked(nf, &dispose); + + /* + * Deadlock detected! Something marked this entry as + * unhased, but hasn't removed it from the hash list. + */ + WARN_ON_ONCE(!del); + } + spin_unlock(&nfsd_file_hashtbl[i].nfb_lock); + nfsd_file_dispose_list(&dispose); + } +} + +void +nfsd_file_cache_shutdown(void) +{ + LIST_HEAD(dispose); + + set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + lease_unregister_notifier(&nfsd_file_lease_notifier); + unregister_shrinker(&nfsd_file_shrinker); + /* + * make sure all callers of nfsd_file_lru_cb are done before + * calling nfsd_file_cache_purge + */ + cancel_delayed_work_sync(&nfsd_filecache_laundrette); + nfsd_file_cache_purge(); + list_lru_destroy(&nfsd_file_lru); + rcu_barrier(); + fsnotify_put_group(nfsd_file_fsnotify_group); + nfsd_file_fsnotify_group = NULL; + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + fsnotify_wait_marks_destroyed(); + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; +} + +static bool +nfsd_match_cred(const struct cred *c1, const struct cred *c2) +{ + int i; + + if (!uid_eq(c1->fsuid, c2->fsuid)) + return false; + if (!gid_eq(c1->fsgid, c2->fsgid)) + return false; + if (c1->group_info == NULL || c2->group_info == NULL) + return c1->group_info == c2->group_info; + if (c1->group_info->ngroups != c2->group_info->ngroups) + return false; + for (i = 0; i < c1->group_info->ngroups; i++) { + if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i])) + return false; + } + return true; +} + +static struct nfsd_file * +nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, + unsigned int hashval) +{ + struct nfsd_file *nf; + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; + + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node) { + if ((need & nf->nf_may) != need) + continue; + if (nf->nf_inode != inode) + continue; + if (!nfsd_match_cred(nf->nf_cred, current_cred())) + continue; + if (nfsd_file_get(nf) != NULL) + return nf; + } + return NULL; +} + +/** + * nfsd_file_is_cached - are there any cached open files for this fh? + * @inode: inode of the file to check + * + * Scan the hashtable for open files that match this fh. Returns true if there + * are any, and false if not. + */ +bool +nfsd_file_is_cached(struct inode *inode) +{ + bool ret = false; + struct nfsd_file *nf; + unsigned int hashval; + + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); + + rcu_read_lock(); + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node) { + if (inode == nf->nf_inode) { + ret = true; + break; + } + } + rcu_read_unlock(); + trace_nfsd_file_is_cached(inode, hashval, (int)ret); + return ret; +} + +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + __be32 status; + struct nfsd_file *nf, *new; + struct inode *inode; + unsigned int hashval; + + /* FIXME: skip this if fh_dentry is already set? */ + status = fh_verify(rqstp, fhp, S_IFREG, + may_flags|NFSD_MAY_OWNER_OVERRIDE); + if (status != nfs_ok) + return status; + + inode = d_inode(fhp->fh_dentry); + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); +retry: + rcu_read_lock(); + nf = nfsd_file_find_locked(inode, may_flags, hashval); + rcu_read_unlock(); + if (nf) + goto wait_for_construction; + + new = nfsd_file_alloc(inode, may_flags, hashval); + if (!new) { + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, + NULL, nfserr_jukebox); + return nfserr_jukebox; + } + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + nf = nfsd_file_find_locked(inode, may_flags, hashval); + if (nf == NULL) + goto open_file; + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + nfsd_file_slab_free(&new->nf_rcu); + +wait_for_construction: + wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); + + /* Did construction of this file fail? */ + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_put_noref(nf); + goto retry; + } + + this_cpu_inc(nfsd_file_cache_hits); + + if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) { + bool write = (may_flags & NFSD_MAY_WRITE); + + if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) || + (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) { + status = nfserrno(nfsd_open_break_lease( + file_inode(nf->nf_file), may_flags)); + if (status == nfs_ok) { + clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + if (write) + clear_bit(NFSD_FILE_BREAK_WRITE, + &nf->nf_flags); + } + } + } +out: + if (status == nfs_ok) { + *pnf = nf; + } else { + nfsd_file_put(nf); + nf = NULL; + } + + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status); + return status; +open_file: + nf = new; + /* Take reference for the hashtable */ + atomic_inc(&nf->nf_ref); + __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); + __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); + list_lru_add(&nfsd_file_lru, &nf->nf_lru); + hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head); + ++nfsd_file_hashtbl[hashval].nfb_count; + nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, + nfsd_file_hashtbl[hashval].nfb_count); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + atomic_long_inc(&nfsd_filecache_count); + + nf->nf_mark = nfsd_file_mark_find_or_create(nf); + if (nf->nf_mark) + status = nfsd_open_verified(rqstp, fhp, S_IFREG, + may_flags, &nf->nf_file); + else + status = nfserr_jukebox; + /* + * If construction failed, or we raced with a call to unlink() + * then unhash. + */ + if (status != nfs_ok || inode->i_nlink == 0) { + bool do_free; + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + do_free = nfsd_file_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + if (do_free) + nfsd_file_put_noref(nf); + } + clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); + smp_mb__after_atomic(); + wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); + goto out; +} + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. + */ +static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) +{ + unsigned int i, count = 0, longest = 0; + unsigned long hits = 0; + + /* + * No need for spinlocks here since we're not terribly interested in + * accuracy. We do take the nfsd_mutex simply to ensure that we + * don't end up racing with server shutdown + */ + mutex_lock(&nfsd_mutex); + if (nfsd_file_hashtbl) { + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + count += nfsd_file_hashtbl[i].nfb_count; + longest = max(longest, nfsd_file_hashtbl[i].nfb_count); + } + } + mutex_unlock(&nfsd_mutex); + + for_each_possible_cpu(i) + hits += per_cpu(nfsd_file_cache_hits, i); + + seq_printf(m, "total entries: %u\n", count); + seq_printf(m, "longest chain: %u\n", longest); + seq_printf(m, "cache hits: %lu\n", hits); + return 0; +} + +int nfsd_file_cache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_file_cache_stats_show, NULL); +} diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h new file mode 100644 index 000000000000..0c0c67166b87 --- /dev/null +++ b/fs/nfsd/filecache.h @@ -0,0 +1,60 @@ +#ifndef _FS_NFSD_FILECACHE_H +#define _FS_NFSD_FILECACHE_H + +#include + +/* + * This is the fsnotify_mark container that nfsd attaches to the files that it + * is holding open. Note that we have a separate refcount here aside from the + * one in the fsnotify_mark. We only want a single fsnotify_mark attached to + * the inode, and for each nfsd_file to hold a reference to it. + * + * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us + * how to put that reference. If there are still outstanding nfsd_files that + * reference the mark, then we would want to call fsnotify_put_mark on it. + * If there were not, then we'd need to call fsnotify_destroy_mark. Since we + * can't really tell the difference, we use the nfm_mark to keep track of how + * many nfsd_files hold references to the mark. When that counter goes to zero + * then we know to call fsnotify_destroy_mark on it. + */ +struct nfsd_file_mark { + struct fsnotify_mark nfm_mark; + atomic_t nfm_ref; +}; + +/* + * A representation of a file that has been opened by knfsd. These are hashed + * in the hashtable by inode pointer value. Note that this object doesn't + * hold a reference to the inode by itself, so the nf_inode pointer should + * never be dereferenced, only used for comparison. + */ +struct nfsd_file { + struct hlist_node nf_node; + struct list_head nf_lru; + struct rcu_head nf_rcu; + struct file *nf_file; + const struct cred *nf_cred; +#define NFSD_FILE_HASHED (0) +#define NFSD_FILE_PENDING (1) +#define NFSD_FILE_BREAK_READ (2) +#define NFSD_FILE_BREAK_WRITE (3) +#define NFSD_FILE_REFERENCED (4) + unsigned long nf_flags; + struct inode *nf_inode; + unsigned int nf_hashval; + atomic_t nf_ref; + unsigned char nf_may; + struct nfsd_file_mark *nf_mark; +}; + +int nfsd_file_cache_init(void); +void nfsd_file_cache_purge(void); +void nfsd_file_cache_shutdown(void); +void nfsd_file_put(struct nfsd_file *nf); +struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); +void nfsd_file_close_inode_sync(struct inode *inode); +bool nfsd_file_is_cached(struct inode *inode); +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); +int nfsd_file_cache_stats_open(struct inode *, struct file *); +#endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 18d94ea984ba..a6b1eab7b722 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -27,6 +27,7 @@ #include "cache.h" #include "vfs.h" #include "netns.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_SVC @@ -313,6 +314,9 @@ static int nfsd_startup_generic(int nrservs) if (nfsd_users++) return 0; + ret = nfsd_file_cache_init(); + if (ret) + goto dec_users; /* * Readahead param cache - will no-op if it already exists. * (Note therefore results will be suboptimal if number of @@ -320,7 +324,7 @@ static int nfsd_startup_generic(int nrservs) */ ret = nfsd_racache_init(2*nrservs); if (ret) - goto dec_users; + goto out_file_cache; ret = nfs4_state_start(); if (ret) @@ -329,6 +333,8 @@ static int nfsd_startup_generic(int nrservs) out_racache: nfsd_racache_shutdown(); +out_file_cache: + nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; @@ -340,6 +346,7 @@ static void nfsd_shutdown_generic(void) return; nfs4_state_shutdown(); + nfsd_file_cache_shutdown(); nfsd_racache_shutdown(); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 80933e4334d8..ffc78a0e28b2 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -126,6 +126,8 @@ DEFINE_NFSD_ERR_EVENT(read_err); DEFINE_NFSD_ERR_EVENT(write_err); #include "state.h" +#include "filecache.h" +#include "vfs.h" DECLARE_EVENT_CLASS(nfsd_stateid_class, TP_PROTO(stateid_t *stp), @@ -164,6 +166,144 @@ DEFINE_STATEID_EVENT(layout_recall_done); DEFINE_STATEID_EVENT(layout_recall_fail); DEFINE_STATEID_EVENT(layout_recall_release); +#define show_nf_flags(val) \ + __print_flags(val, "|", \ + { 1 << NFSD_FILE_HASHED, "HASHED" }, \ + { 1 << NFSD_FILE_PENDING, "PENDING" }, \ + { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \ + { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \ + { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}) + +/* FIXME: This should probably be fleshed out in the future. */ +#define show_nf_may(val) \ + __print_flags(val, "|", \ + { NFSD_MAY_READ, "READ" }, \ + { NFSD_MAY_WRITE, "WRITE" }, \ + { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }) + +DECLARE_EVENT_CLASS(nfsd_file_class, + TP_PROTO(struct nfsd_file *nf), + TP_ARGS(nf), + TP_STRUCT__entry( + __field(unsigned int, nf_hashval) + __field(void *, nf_inode) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + ), + TP_fast_assign( + __entry->nf_hashval = nf->nf_hashval; + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = atomic_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p", + __entry->nf_hashval, + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), + __entry->nf_file) +) + +#define DEFINE_NFSD_FILE_EVENT(name) \ +DEFINE_EVENT(nfsd_file_class, name, \ + TP_PROTO(struct nfsd_file *nf), \ + TP_ARGS(nf)) + +DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked); + +TRACE_EVENT(nfsd_file_acquire, + TP_PROTO(struct svc_rqst *rqstp, unsigned int hash, + struct inode *inode, unsigned int may_flags, + struct nfsd_file *nf, __be32 status), + + TP_ARGS(rqstp, hash, inode, may_flags, nf, status), + + TP_STRUCT__entry( + __field(__be32, xid) + __field(unsigned int, hash) + __field(void *, inode) + __field(unsigned int, may_flags) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + __field(__be32, status) + ), + + TP_fast_assign( + __entry->xid = rqstp->rq_xid; + __entry->hash = hash; + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->nf_ref = nf ? atomic_read(&nf->nf_ref) : 0; + __entry->nf_flags = nf ? nf->nf_flags : 0; + __entry->nf_may = nf ? nf->nf_may : 0; + __entry->nf_file = nf ? nf->nf_file : NULL; + __entry->status = status; + ), + + TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u", + be32_to_cpu(__entry->xid), __entry->hash, __entry->inode, + show_nf_may(__entry->may_flags), __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), __entry->nf_file, + be32_to_cpu(__entry->status)) +); + +DECLARE_EVENT_CLASS(nfsd_file_search_class, + TP_PROTO(struct inode *inode, unsigned int hash, int found), + TP_ARGS(inode, hash, found), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, hash) + __field(int, found) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->hash = hash; + __entry->found = found; + ), + TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash, + __entry->inode, __entry->found) +); + +#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \ +DEFINE_EVENT(nfsd_file_search_class, name, \ + TP_PROTO(struct inode *inode, unsigned int hash, int found), \ + TP_ARGS(inode, hash, found)) + +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached); + +TRACE_EVENT(nfsd_file_fsnotify_handle_event, + TP_PROTO(struct inode *inode, u32 mask), + TP_ARGS(inode, mask), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, nlink) + __field(umode_t, mode) + __field(u32, mask) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->nlink = inode->i_nlink; + __entry->mode = inode->i_mode; + __entry->mask = mask; + ), + TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, + __entry->nlink, __entry->mode, __entry->mask) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c85783e536d5..5983206ab036 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -699,7 +699,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor } #endif /* CONFIG_NFSD_V3 */ -static int nfsd_open_break_lease(struct inode *inode, int access) +int nfsd_open_break_lease(struct inode *inode, int access) { unsigned int mode; @@ -715,8 +715,8 @@ static int nfsd_open_break_lease(struct inode *inode, int access) * and additional flags. * N.B. After this call fhp needs an fh_put */ -__be32 -nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, +static __be32 +__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { struct path path; @@ -726,25 +726,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, __be32 err; int host_err = 0; - validate_process_creds(); - - /* - * If we get here, then the client has already done an "open", - * and (hopefully) checked permission - so allow OWNER_OVERRIDE - * in case a chmod has now revoked permission. - * - * Arguably we should also allow the owner override for - * directories, but we never have and it doesn't seem to have - * caused anyone a problem. If we were to change this, note - * also that our filldir callbacks would need a variant of - * lookup_one_len that doesn't check permissions. - */ - if (type == S_IFREG) - may_flags |= NFSD_MAY_OWNER_OVERRIDE; - err = fh_verify(rqstp, fhp, type, may_flags); - if (err) - goto out; - path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; inode = d_inode(path.dentry); @@ -798,10 +779,50 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, out_nfserr: err = nfserrno(host_err); out: + return err; +} + +__be32 +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) +{ + __be32 err; + + validate_process_creds(); + /* + * If we get here, then the client has already done an "open", + * and (hopefully) checked permission - so allow OWNER_OVERRIDE + * in case a chmod has now revoked permission. + * + * Arguably we should also allow the owner override for + * directories, but we never have and it doesn't seem to have + * caused anyone a problem. If we were to change this, note + * also that our filldir callbacks would need a variant of + * lookup_one_len that doesn't check permissions. + */ + if (type == S_IFREG) + may_flags |= NFSD_MAY_OWNER_OVERRIDE; + err = fh_verify(rqstp, fhp, type, may_flags); + if (!err) + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); validate_process_creds(); return err; } +__be32 +nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) +{ + __be32 err; + + validate_process_creds(); + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + validate_process_creds(); + return err; +} + + + struct raparms * nfsd_init_raparms(struct file *file) { diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index db351247892d..31fdae34e028 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -75,8 +75,11 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ +int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); +__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, + int, struct file **); struct raparms; __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, From b493523926f9b466db3c440ac64beb93d8068cdf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:49 -0400 Subject: [PATCH 0155/2880] nfsd: hook up nfsd_write to the new nfsd_file cache Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5983206ab036..2f5b52004a18 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -44,6 +44,7 @@ #include "nfsd.h" #include "vfs.h" +#include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP @@ -1104,17 +1105,18 @@ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int stable) { - struct file *file = NULL; - __be32 err = 0; + struct nfsd_file *nf; + __be32 err; trace_nfsd_write_start(rqstp, fhp, offset, *cnt); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf); if (err) goto out; - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); - fput(file); + err = nfsd_vfs_write(rqstp, fhp, nf->nf_file, offset, vec, + vlen, cnt, stable); + nfsd_file_put(nf); out: trace_nfsd_write_done(rqstp, fhp, offset, *cnt); return err; From 48cd7b51258c1a158293cefb97dda988080f5e13 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:50 -0400 Subject: [PATCH 0156/2880] nfsd: hook up nfsd_read to the nfsd_file cache Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2f5b52004a18..8593c6423336 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1071,25 +1071,22 @@ out_nfserr: __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { + struct nfsd_file *nf; struct file *file; - struct raparms *ra; __be32 err; trace_nfsd_read_start(rqstp, fhp, offset, *count); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; - ra = nfsd_init_raparms(file); - + file = nf->nf_file; if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) err = nfsd_splice_read(rqstp, fhp, file, offset, count); else err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); - if (ra) - nfsd_put_raparams(file, ra); - fput(file); + nfsd_file_put(nf); trace_nfsd_read_done(rqstp, fhp, offset, *count); From 5920afa3c85ff38642f652b6e3880e79392fcc89 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:51 -0400 Subject: [PATCH 0157/2880] nfsd: hook nfsd_commit up to the nfsd_file cache Use cached filps if possible instead of opening a new one every time. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8593c6423336..ec254bff1893 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1133,9 +1133,9 @@ __be32 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long count) { - struct file *file; - loff_t end = LLONG_MAX; - __be32 err = nfserr_inval; + struct nfsd_file *nf; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; if (offset < 0) goto out; @@ -1145,12 +1145,12 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - err = nfsd_open(rqstp, fhp, S_IFREG, - NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); + err = nfsd_file_acquire(rqstp, fhp, + NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2 = vfs_fsync_range(file, offset, end, 0); + int err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); if (err2 != -EINVAL) err = nfserrno(err2); @@ -1158,7 +1158,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notsupp; } - fput(file); + nfsd_file_put(nf); out: return err; } From fd4f83fd7dfb1bce2f1af51fcbaf6575f4b9d189 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:52 -0400 Subject: [PATCH 0158/2880] nfsd: convert nfs4_file->fi_fds array to use nfsd_files Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 35 ++++++++++++++++++----------------- fs/nfsd/state.h | 2 +- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7857942c5ca6..b126b86ba644 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -50,6 +50,7 @@ #include "netns.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -433,7 +434,7 @@ static struct file * __nfs4_get_fd(struct nfs4_file *f, int oflag) { if (f->fi_fds[oflag]) - return get_file(f->fi_fds[oflag]); + return get_file(f->fi_fds[oflag]->nf_file); return NULL; } @@ -590,17 +591,17 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) might_lock(&fp->fi_lock); if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { - struct file *f1 = NULL; - struct file *f2 = NULL; + struct nfsd_file *f1 = NULL; + struct nfsd_file *f2 = NULL; swap(f1, fp->fi_fds[oflag]); if (atomic_read(&fp->fi_access[1 - oflag]) == 0) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) - fput(f1); + nfsd_file_put(f1); if (f2) - fput(f2); + nfsd_file_put(f2); } } @@ -2323,9 +2324,9 @@ static void states_stop(struct seq_file *s, void *v) spin_unlock(&clp->cl_lock); } -static void nfs4_show_superblock(struct seq_file *s, struct file *f) +static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { - struct inode *inode = file_inode(f); + struct inode *inode = f->nf_inode; seq_printf(s, "superblock: \"%02x:%02x:%ld\"", MAJOR(inode->i_sb->s_dev), @@ -2343,7 +2344,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; struct nfs4_stateowner *oo; unsigned int access, deny; @@ -2370,7 +2371,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - fput(file); + nfsd_file_put(file); return 0; } @@ -2379,7 +2380,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; struct nfs4_stateowner *oo; ols = openlockstateid(st); @@ -2401,7 +2402,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - fput(file); + nfsd_file_put(file); return 0; } @@ -4651,7 +4652,7 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - struct file *filp = NULL; + struct nfsd_file *nf = NULL; __be32 status; int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); @@ -4687,18 +4688,18 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp); + status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); if (status) goto out_put_access; spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { - fp->fi_fds[oflag] = filp; - filp = NULL; + fp->fi_fds[oflag] = nf; + nf = NULL; } } spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); status = nfsd4_truncate(rqstp, cur_fh, open); if (status) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 5dbd16946e8e..8196bfb74f12 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -506,7 +506,7 @@ struct nfs4_file { }; struct list_head fi_clnt_odstate; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ - struct file * fi_fds[3]; + struct nfsd_file *fi_fds[3]; /* * Each open or lock stateid contributes 0-4 to the counts * below depending on which bits are set in st_access_bitmap: From eb82dd393744107ebc365a53e7813c7c67cb203b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:53 -0400 Subject: [PATCH 0159/2880] nfsd: convert fi_deleg_file and ls_file fields to nfsd_file Have them keep an nfsd_file reference instead of a struct file. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/locks.c | 1 + fs/nfsd/blocklayout.c | 3 +- fs/nfsd/nfs4layouts.c | 12 ++-- fs/nfsd/nfs4state.c | 144 +++++++++++++++++++++--------------------- fs/nfsd/state.h | 6 +- 5 files changed, 85 insertions(+), 81 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 1913481bfbf7..c31674d10567 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -212,6 +212,7 @@ struct file_lock_list_struct { static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); + /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. * It is protected by blocked_lock_lock. diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 66d4c55eb48e..9bbaa671c079 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -15,6 +15,7 @@ #include "blocklayoutxdr.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS @@ -404,7 +405,7 @@ static void nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) { struct nfs4_client *clp = ls->ls_stid.sc_client; - struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev; + struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev; bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), 0, true); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index a79e24b79095..2681c70283ce 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -169,8 +169,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) spin_unlock(&fp->fi_lock); if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); - fput(ls->ls_file); + vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); + nfsd_file_put(ls->ls_file); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); @@ -197,7 +197,7 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) fl->fl_end = OFFSET_MAX; fl->fl_owner = ls; fl->fl_pid = current->tgid; - fl->fl_file = ls->ls_file; + fl->fl_file = ls->ls_file->nf_file; status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); if (status) { @@ -236,13 +236,13 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, NFSPROC4_CLNT_CB_LAYOUT); if (parent->sc_type == NFS4_DELEG_STID) - ls->ls_file = get_file(fp->fi_deleg_file); + ls->ls_file = nfsd_file_get(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { - fput(ls->ls_file); + nfsd_file_put(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; @@ -626,7 +626,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; - argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; + argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = call_usermodehelper(nfsd_recall_failed, argv, envp, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b126b86ba644..137f9b119a54 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -430,18 +430,18 @@ put_nfs4_file(struct nfs4_file *fi) } } -static struct file * +static struct nfsd_file * __nfs4_get_fd(struct nfs4_file *f, int oflag) { if (f->fi_fds[oflag]) - return get_file(f->fi_fds[oflag]->nf_file); + return nfsd_file_get(f->fi_fds[oflag]); return NULL; } -static struct file * +static struct nfsd_file * find_writeable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); @@ -451,10 +451,10 @@ find_writeable_file_locked(struct nfs4_file *f) return ret; } -static struct file * +static struct nfsd_file * find_writeable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_writeable_file_locked(f); @@ -463,9 +463,10 @@ find_writeable_file(struct nfs4_file *f) return ret; } -static struct file *find_readable_file_locked(struct nfs4_file *f) +static struct nfsd_file * +find_readable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); @@ -475,10 +476,10 @@ static struct file *find_readable_file_locked(struct nfs4_file *f) return ret; } -static struct file * +static struct nfsd_file * find_readable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_readable_file_locked(f); @@ -487,10 +488,10 @@ find_readable_file(struct nfs4_file *f) return ret; } -struct file * +struct nfsd_file * find_any_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = __nfs4_get_fd(f, O_RDWR); @@ -934,25 +935,25 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) static void put_deleg_file(struct nfs4_file *fp) { - struct file *filp = NULL; + struct nfsd_file *nf = NULL; spin_lock(&fp->fi_lock); if (--fp->fi_delegees == 0) - swap(filp, fp->fi_deleg_file); + swap(nf, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file *filp = fp->fi_deleg_file; + struct nfsd_file *nf = fp->fi_deleg_file; WARN_ON_ONCE(!fp->fi_delegees); - vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp); + vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } @@ -1290,11 +1291,14 @@ static void nfs4_free_lock_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); - struct file *file; + struct nfsd_file *nf; - file = find_any_file(stp->st_stid.sc_file); - if (file) - filp_close(file, (fl_owner_t)lo); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, (fl_owner_t)lo); + nfsd_file_put(nf); + } nfs4_free_ol_stateid(stid); } @@ -2411,7 +2415,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_delegation *ds; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; ds = delegstateid(st); nf = st->sc_file; @@ -2434,7 +2438,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_layout_stateid *ls; - struct file *file; + struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); file = ls->ls_file; @@ -4768,7 +4772,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, fl->fl_end = OFFSET_MAX; fl->fl_owner = (fl_owner_t)dp; fl->fl_pid = current->tgid; - fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file; + fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; return fl; } @@ -4778,7 +4782,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, { int status = 0; struct nfs4_delegation *dp; - struct file *filp; + struct nfsd_file *nf; struct file_lock *fl; /* @@ -4789,8 +4793,8 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - filp = find_readable_file(fp); - if (!filp) { + nf = find_readable_file(fp); + if (!nf) { /* We should always have a readable file here */ WARN_ON_ONCE(1); return ERR_PTR(-EBADF); @@ -4800,17 +4804,17 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; else if (!fp->fi_deleg_file) { - fp->fi_deleg_file = filp; + fp->fi_deleg_file = nf; /* increment early to prevent fi_deleg_file from being * cleared */ fp->fi_delegees = 1; - filp = NULL; + nf = NULL; } else fp->fi_delegees++; spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (status) return ERR_PTR(status); @@ -4823,7 +4827,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (!fl) goto out_clnt_odstate; - status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); + status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); if (fl) locks_free_lock(fl); if (status) @@ -4843,7 +4847,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, return dp; out_unlock: - vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp); + vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); @@ -5514,7 +5518,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, return nfs_ok; } -static struct file * +static struct nfsd_file * nfs4_find_file(struct nfs4_stid *s, int flags) { if (!s) @@ -5524,7 +5528,7 @@ nfs4_find_file(struct nfs4_stid *s, int flags) case NFS4_DELEG_STID: if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) return NULL; - return get_file(s->sc_file->fi_deleg_file); + return nfsd_file_get(s->sc_file->fi_deleg_file); case NFS4_OPEN_STID: case NFS4_LOCK_STID: if (flags & RD_STATE) @@ -5553,29 +5557,27 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, struct file **filpp, bool *tmp_file, int flags) { int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; - struct file *file; + struct nfsd_file *nf; __be32 status; - file = nfs4_find_file(s, flags); - if (file) { + nf = nfs4_find_file(s, flags); + if (nf) { status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, acc | NFSD_MAY_OWNER_OVERRIDE); - if (status) { - fput(file); - return status; - } - - *filpp = file; + if (status) + goto out; } else { - status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp); + status = nfsd_file_acquire(rqstp, fhp, acc, &nf); if (status) return status; if (tmp_file) *tmp_file = true; } - - return 0; + *filpp = get_file(nf->nf_file); +out: + nfsd_file_put(nf); + return status; } /* @@ -6393,7 +6395,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *lock_stp = NULL; struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; @@ -6475,8 +6477,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Fallthrough */ case NFS4_READ_LT: spin_lock(&fp->fi_lock); - filp = find_readable_file_locked(fp); - if (filp) + nf = find_readable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); fl_type = F_RDLCK; @@ -6487,8 +6489,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Fallthrough */ case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); - filp = find_writeable_file_locked(fp); - if (filp) + nf = find_writeable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); fl_type = F_WRLCK; @@ -6498,7 +6500,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - if (!filp) { + if (!nf) { status = nfserr_openmode; goto out; } @@ -6514,7 +6516,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock->fl_type = fl_type; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; + file_lock->fl_file = nf->nf_file; file_lock->fl_flags = fl_flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; @@ -6536,7 +6538,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_unlock(&nn->blocked_locks_lock); } - err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock); switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); @@ -6571,8 +6573,8 @@ out: } free_blocked_lock(nbl); } - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (lock_stp) { /* Bump seqid manually if the 4.0 replay owner is openowner */ if (cstate->replay_owner && @@ -6699,7 +6701,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_locku *locku = &u->locku; struct nfs4_ol_stateid *stp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct file_lock *file_lock = NULL; __be32 status; int err; @@ -6717,8 +6719,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &stp, nn); if (status) goto out; - filp = find_any_file(stp->st_stid.sc_file); - if (!filp) { + nf = find_any_file(stp->st_stid.sc_file); + if (!nf) { status = nfserr_lock_range; goto put_stateid; } @@ -6726,13 +6728,13 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; - goto fput; + goto put_file; } file_lock->fl_type = F_UNLCK; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; + file_lock->fl_file = nf->nf_file; file_lock->fl_flags = FL_POSIX; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = locku->lu_offset; @@ -6741,14 +6743,14 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, locku->lu_length); nfs4_transform_lock_offset(file_lock); - err = vfs_lock_file(filp, F_SETLK, file_lock, NULL); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL); if (err) { dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); -fput: - fput(filp); +put_file: + nfsd_file_put(nf); put_stateid: mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); @@ -6760,7 +6762,7 @@ out: out_nfserr: status = nfserrno(err); - goto fput; + goto put_file; } /* @@ -6773,17 +6775,17 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct file *filp = find_any_file(fp); + struct nfsd_file *nf = find_any_file(fp); struct inode *inode; struct file_lock_context *flctx; - if (!filp) { + if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); return status; } - inode = locks_inode(filp); + inode = locks_inode(nf->nf_file); flctx = inode->i_flctx; if (flctx && !list_empty_careful(&flctx->flc_posix)) { @@ -6796,7 +6798,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } spin_unlock(&flctx->flc_lock); } - fput(filp); + nfsd_file_put(nf); return status; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 8196bfb74f12..d89d1ade1254 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -516,7 +516,7 @@ struct nfs4_file { */ atomic_t fi_access[2]; u32 fi_share_deny; - struct file *fi_deleg_file; + struct nfsd_file *fi_deleg_file; int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; @@ -565,7 +565,7 @@ struct nfs4_layout_stateid { spinlock_t ls_lock; struct list_head ls_layouts; u32 ls_layout_type; - struct file *ls_file; + struct nfsd_file *ls_file; struct nfsd4_callback ls_recall; stateid_t ls_recall_sid; bool ls_recalled; @@ -657,7 +657,7 @@ static inline void get_nfs4_file(struct nfs4_file *fi) { refcount_inc(&fi->fi_ref); } -struct file *find_any_file(struct nfs4_file *f); +struct nfsd_file *find_any_file(struct nfs4_file *f); /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); From 5c4583b2b78eef4eb33fca9a4598e72e08dd514b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:54 -0400 Subject: [PATCH 0160/2880] nfsd: hook up nfs4_preprocess_stateid_op to the nfsd_file cache Have nfs4_preprocess_stateid_op pass back a nfsd_file instead of a filp. Since we now presume that the struct file will be persistent in most cases, we can stop fiddling with the raparms in the read code. This also means that we don't really care about the rd_tmp_file field anymore. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 83 +++++++++++++++++++++++---------------------- fs/nfsd/nfs4state.c | 24 ++++++------- fs/nfsd/nfs4xdr.c | 14 ++++---- fs/nfsd/state.h | 2 +- fs/nfsd/xdr4.h | 19 +++++------ 5 files changed, 68 insertions(+), 74 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8beda999e134..cb51893ec1cd 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -761,7 +761,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_read *read = &u->read; __be32 status; - read->rd_filp = NULL; + read->rd_nf = NULL; if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; @@ -782,7 +782,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &read->rd_stateid, RD_STATE, - &read->rd_filp, &read->rd_tmp_file); + &read->rd_nf); if (status) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; @@ -798,8 +798,8 @@ out: static void nfsd4_read_release(union nfsd4_op_u *u) { - if (u->read.rd_filp) - fput(u->read.rd_filp); + if (u->read.rd_nf) + nfsd_file_put(u->read.rd_nf); trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, u->read.rd_offset, u->read.rd_length); } @@ -954,7 +954,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, - WR_STATE, NULL, NULL); + WR_STATE, NULL); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -993,7 +993,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_write *write = &u->write; stateid_t *stateid = &write->wr_stateid; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; int nvecs; @@ -1005,7 +1005,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, trace_nfsd_write_start(rqstp, &cstate->current_fh, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - stateid, WR_STATE, &filp, NULL); + stateid, WR_STATE, &nf); if (status) { dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; @@ -1018,10 +1018,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &write->wr_head, write->wr_buflen); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, + status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf->nf_file, write->wr_offset, rqstp->rq_vec, nvecs, &cnt, write->wr_how_written); - fput(filp); + nfsd_file_put(nf); write->wr_bytes_written = cnt; trace_nfsd_write_done(rqstp, &cstate->current_fh, @@ -1031,8 +1031,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static __be32 nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - stateid_t *src_stateid, struct file **src, - stateid_t *dst_stateid, struct file **dst) + stateid_t *src_stateid, struct nfsd_file **src, + stateid_t *dst_stateid, struct nfsd_file **dst) { __be32 status; @@ -1040,22 +1040,22 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_nofilehandle; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, - src_stateid, RD_STATE, src, NULL); + src_stateid, RD_STATE, src); if (status) { dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); goto out; } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - dst_stateid, WR_STATE, dst, NULL); + dst_stateid, WR_STATE, dst); if (status) { dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); goto out_put_src; } /* fix up for NFS-specific error code */ - if (!S_ISREG(file_inode(*src)->i_mode) || - !S_ISREG(file_inode(*dst)->i_mode)) { + if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || + !S_ISREG(file_inode((*dst)->nf_file)->i_mode)) { status = nfserr_wrong_type; goto out_put_dst; } @@ -1063,9 +1063,9 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: return status; out_put_dst: - fput(*dst); + nfsd_file_put(*dst); out_put_src: - fput(*src); + nfsd_file_put(*src); goto out; } @@ -1074,7 +1074,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_clone *clone = &u->clone; - struct file *src, *dst; + struct nfsd_file *src, *dst; __be32 status; status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, @@ -1082,11 +1082,11 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd4_clone_file_range(src, clone->cl_src_pos, - dst, clone->cl_dst_pos, clone->cl_count); + status = nfsd4_clone_file_range(src->nf_file, clone->cl_src_pos, + dst->nf_file, clone->cl_dst_pos, clone->cl_count); - fput(dst); - fput(src); + nfsd_file_put(dst); + nfsd_file_put(src); out: return status; } @@ -1176,8 +1176,9 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) do { if (kthread_should_stop()) break; - bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos, - copy->file_dst, dst_pos, bytes_total); + bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file, + src_pos, copy->nf_dst->nf_file, dst_pos, + bytes_total); if (bytes_copied <= 0) break; bytes_total -= bytes_copied; @@ -1204,8 +1205,8 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) status = nfs_ok; } - fput(copy->file_src); - fput(copy->file_dst); + nfsd_file_put(copy->nf_src); + nfsd_file_put(copy->nf_dst); return status; } @@ -1218,16 +1219,16 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res)); memcpy(&dst->fh, &src->fh, sizeof(src->fh)); dst->cp_clp = src->cp_clp; - dst->file_dst = get_file(src->file_dst); - dst->file_src = get_file(src->file_src); + dst->nf_dst = nfsd_file_get(src->nf_dst); + dst->nf_src = nfsd_file_get(src->nf_src); memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid)); } static void cleanup_async_copy(struct nfsd4_copy *copy) { nfs4_free_cp_state(copy); - fput(copy->file_dst); - fput(copy->file_src); + nfsd_file_put(copy->nf_dst); + nfsd_file_put(copy->nf_src); spin_lock(©->cp_clp->async_lock); list_del(©->copies); spin_unlock(©->cp_clp->async_lock); @@ -1264,8 +1265,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_copy *async_copy = NULL; status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, - ©->file_src, ©->cp_dst_stateid, - ©->file_dst); + ©->nf_src, ©->cp_dst_stateid, + ©->nf_dst); if (status) goto out; @@ -1347,21 +1348,21 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, - WR_STATE, &file, NULL); + WR_STATE, &nf); if (status != nfs_ok) { dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); return status; } - status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, fallocate->falloc_offset, fallocate->falloc_length, flags); - fput(file); + nfsd_file_put(nf); return status; } static __be32 @@ -1406,11 +1407,11 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_seek *seek = &u->seek; int whence; __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, - RD_STATE, &file, NULL); + RD_STATE, &nf); if (status) { dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); return status; @@ -1432,14 +1433,14 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Note: This call does change file->f_pos, but nothing in NFSD * should ever file->f_pos. */ - seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence); + seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence); if (seek->seek_pos < 0) status = nfserrno(seek->seek_pos); - else if (seek->seek_pos >= i_size_read(file_inode(file))) + else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file))) seek->seek_eof = true; out: - fput(file); + nfsd_file_put(nf); return status; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 137f9b119a54..17b1ad4a619f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5554,7 +5554,7 @@ nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags) static __be32 nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, - struct file **filpp, bool *tmp_file, int flags) + struct nfsd_file **nfp, int flags) { int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; struct nfsd_file *nf; @@ -5564,19 +5564,17 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, if (nf) { status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, acc | NFSD_MAY_OWNER_OVERRIDE); - if (status) + if (status) { + nfsd_file_put(nf); goto out; + } } else { status = nfsd_file_acquire(rqstp, fhp, acc, &nf); if (status) return status; - - if (tmp_file) - *tmp_file = true; } - *filpp = get_file(nf->nf_file); + *nfp = nf; out: - nfsd_file_put(nf); return status; } @@ -5586,7 +5584,7 @@ out: __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file) + stateid_t *stateid, int flags, struct nfsd_file **nfp) { struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); @@ -5594,10 +5592,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfs4_stid *s = NULL; __be32 status; - if (filpp) - *filpp = NULL; - if (tmp_file) - *tmp_file = false; + if (nfp) + *nfp = NULL; if (grace_disallows_io(net, ino)) return nfserr_grace; @@ -5634,8 +5630,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, status = nfs4_check_fh(fhp, s); done: - if (!status && filpp) - status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags); + if (status == nfs_ok && nfp) + status = nfs4_check_file(rqstp, fhp, s, nfp, flags); out: if (s) nfs4_put_stid(s); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 442811809f3d..7b03bcca0dfe 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -49,6 +49,7 @@ #include "cache.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include @@ -3574,11 +3575,14 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, { unsigned long maxcount; struct xdr_stream *xdr = &resp->xdr; - struct file *file = read->rd_filp; + struct file *file; int starting_len = xdr->buf->len; - struct raparms *ra = NULL; __be32 *p; + if (nfserr) + return nfserr; + file = read->rd_nf->nf_file; + p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); @@ -3596,18 +3600,12 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, (xdr->buf->buflen - xdr->buf->len)); maxcount = min_t(unsigned long, maxcount, read->rd_length); - if (read->rd_tmp_file) - ra = nfsd_init_raparms(file); - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); else nfserr = nfsd4_encode_readv(resp, read, file, maxcount); - if (ra) - nfsd_put_raparams(file, ra); - if (nfserr) xdr_truncate_encode(xdr, starting_len); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d89d1ade1254..d00c86d05beb 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -616,7 +616,7 @@ struct nfsd4_copy; extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filp, bool *tmp_file); + stateid_t *stateid, int flags, struct nfsd_file **filp); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d64c870f998a..f4737d66ee98 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -273,15 +273,14 @@ struct nfsd4_open_downgrade { struct nfsd4_read { - stateid_t rd_stateid; /* request */ - u64 rd_offset; /* request */ - u32 rd_length; /* request */ - int rd_vlen; - struct file *rd_filp; - bool rd_tmp_file; + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct nfsd_file *rd_nf; - struct svc_rqst *rd_rqstp; /* response */ - struct svc_fh * rd_fhp; /* response */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh *rd_fhp; /* response */ }; struct nfsd4_readdir { @@ -538,8 +537,8 @@ struct nfsd4_copy { struct nfs4_client *cp_clp; - struct file *file_src; - struct file *file_dst; + struct nfsd_file *nf_src; + struct nfsd_file *nf_dst; stateid_t cp_stateid; From 6b556ca2872be223176c1a1e249606248ce0b201 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:55 -0400 Subject: [PATCH 0161/2880] nfsd: have nfsd_test_lock use the nfsd_file cache Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 17b1ad4a619f..906e214dcce8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6605,11 +6605,11 @@ out: */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { - struct file *file; - __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + struct nfsd_file *nf; + __be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (!err) { - err = nfserrno(vfs_test_lock(file, lock)); - fput(file); + err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + nfsd_file_put(nf); } return err; } From 501cb1849f865960501d19d54e6a5af306f9b6fd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:56 -0400 Subject: [PATCH 0162/2880] nfsd: rip out the raparms cache The raparms cache was set up in order to ensure that we carry readahead information forward from one RPC call to the next. In other words, it was set up because each RPC call was forced to open a struct file, then close it, causing the loss of readahead information that is normally cached in that struct file, and used to keep the page cache filled when a user calls read() multiple times on the same file descriptor. Now that we cache the struct file, and reuse it for all the I/O calls to a given file by a given user, we no longer have to keep a separate readahead cache. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 13 +---- fs/nfsd/vfs.c | 149 ----------------------------------------------- fs/nfsd/vfs.h | 6 -- 3 files changed, 1 insertion(+), 167 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a6b1eab7b722..d02712ca2685 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -317,22 +317,12 @@ static int nfsd_startup_generic(int nrservs) ret = nfsd_file_cache_init(); if (ret) goto dec_users; - /* - * Readahead param cache - will no-op if it already exists. - * (Note therefore results will be suboptimal if number of - * threads is modified after nfsd start.) - */ - ret = nfsd_racache_init(2*nrservs); - if (ret) - goto out_file_cache; ret = nfs4_state_start(); if (ret) - goto out_racache; + goto out_file_cache; return 0; -out_racache: - nfsd_racache_shutdown(); out_file_cache: nfsd_file_cache_shutdown(); dec_users: @@ -347,7 +337,6 @@ static void nfsd_shutdown_generic(void) nfs4_state_shutdown(); nfsd_file_cache_shutdown(); - nfsd_racache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ec254bff1893..8e2c8f36eba3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -49,34 +49,6 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP - -/* - * This is a cache of readahead params that help us choose the proper - * readahead strategy. Initially, we set all readahead parameters to 0 - * and let the VFS handle things. - * If you increase the number of cached files very much, you'll need to - * add a hash table here. - */ -struct raparms { - struct raparms *p_next; - unsigned int p_count; - ino_t p_ino; - dev_t p_dev; - int p_set; - struct file_ra_state p_ra; - unsigned int p_hindex; -}; - -struct raparm_hbucket { - struct raparms *pb_head; - spinlock_t pb_lock; -} ____cacheline_aligned_in_smp; - -#define RAPARM_HASH_BITS 4 -#define RAPARM_HASH_SIZE (1<i_sb->s_dev; - ino_t ino = inode->i_ino; - struct raparms *ra, **rap, **frap = NULL; - int depth = 0; - unsigned int hash; - struct raparm_hbucket *rab; - - hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; - rab = &raparm_hash[hash]; - - spin_lock(&rab->pb_lock); - for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { - if (ra->p_ino == ino && ra->p_dev == dev) - goto found; - depth++; - if (ra->p_count == 0) - frap = rap; - } - depth = nfsdstats.ra_size; - if (!frap) { - spin_unlock(&rab->pb_lock); - return NULL; - } - rap = frap; - ra = *frap; - ra->p_dev = dev; - ra->p_ino = ino; - ra->p_set = 0; - ra->p_hindex = hash; -found: - if (rap != &rab->pb_head) { - *rap = ra->p_next; - ra->p_next = rab->pb_head; - rab->pb_head = ra; - } - ra->p_count++; - nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; - spin_unlock(&rab->pb_lock); - - if (ra->p_set) - file->f_ra = ra->p_ra; - return ra; -} - -void nfsd_put_raparams(struct file *file, struct raparms *ra) -{ - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; - - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); -} - /* * Grab and keep cached pages associated with a file in the svc_rqst * so that they can be passed to the network sendmsg/sendpage routines @@ -2094,63 +2005,3 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return err? nfserrno(err) : 0; } - -void -nfsd_racache_shutdown(void) -{ - struct raparms *raparm, *last_raparm; - unsigned int i; - - dprintk("nfsd: freeing readahead buffers.\n"); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - raparm = raparm_hash[i].pb_head; - while(raparm) { - last_raparm = raparm; - raparm = raparm->p_next; - kfree(last_raparm); - } - raparm_hash[i].pb_head = NULL; - } -} -/* - * Initialize readahead param cache - */ -int -nfsd_racache_init(int cache_size) -{ - int i; - int j = 0; - int nperbucket; - struct raparms **raparm = NULL; - - - if (raparm_hash[0].pb_head) - return 0; - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - nperbucket = max(2, nperbucket); - cache_size = nperbucket * RAPARM_HASH_SIZE; - - dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - spin_lock_init(&raparm_hash[i].pb_lock); - - raparm = &raparm_hash[i].pb_head; - for (j = 0; j < nperbucket; j++) { - *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); - if (!*raparm) - goto out_nomem; - raparm = &(*raparm)->p_next; - } - *raparm = NULL; - } - - nfsdstats.ra_size = cache_size; - return 0; - -out_nomem: - dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); - nfsd_racache_shutdown(); - return -ENOMEM; -} diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 31fdae34e028..e0f7792165a6 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -40,8 +40,6 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); /* nfsd/vfs.c */ -int nfsd_racache_init(int); -void nfsd_racache_shutdown(void); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, @@ -80,7 +78,6 @@ __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); __be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -struct raparms; __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count); @@ -118,9 +115,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -struct raparms *nfsd_init_raparms(struct file *file); -void nfsd_put_raparams(struct file *file, struct raparms *ra); - static inline int fh_want_write(struct svc_fh *fh) { int ret; From 7775ec57f4c71309109f9e98cf8854a2ca8c9df1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 18 Aug 2019 14:18:57 -0400 Subject: [PATCH 0163/2880] nfsd: close cached files prior to a REMOVE or RENAME that would replace target It's not uncommon for some workloads to do a bunch of I/O to a file and delete it just afterward. If knfsd has a cached open file however, then the file may still be open when the dentry is unlinked. If the underlying filesystem is nfs, then that could trigger it to do a sillyrename. On a REMOVE or RENAME scan the nfsd_file cache for open files that correspond to the inode, and proactively unhash and put their references. This should prevent any delete-on-last-close activity from occurring, solely due to knfsd's open file cache. This must be done synchronously though so we use the variants that call flush_delayed_fput. There are deadlock possibilities if you call flush_delayed_fput while holding locks, however. In the case of nfsd_rename, we don't even do the lookups of the dentries to be renamed until we've locked for rename. Once we've figured out what the target dentry is for a rename, check to see whether there are cached open files associated with it. If there are, then unwind all of the locking, close them all, and then reattempt the rename. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 62 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8e2c8f36eba3..84e87772c2b8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1590,6 +1590,26 @@ out_nfserr: goto out_unlock; } +static void +nfsd_close_cached_files(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + nfsd_file_close_inode_sync(inode); +} + +static bool +nfsd_has_cached_files(struct dentry *dentry) +{ + bool ret = false; + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + ret = nfsd_file_is_cached(inode); + return ret; +} + /* * Rename a file * N.B. After this call _both_ ffhp and tfhp need an fh_put @@ -1602,6 +1622,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct inode *fdir, *tdir; __be32 err; int host_err; + bool has_cached = false; err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) @@ -1620,6 +1641,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; +retry: host_err = fh_want_write(ffhp); if (host_err) { err = nfserrno(host_err); @@ -1659,11 +1681,16 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); - if (!host_err) { - host_err = commit_metadata(tfhp); - if (!host_err) - host_err = commit_metadata(ffhp); + if (nfsd_has_cached_files(ndentry)) { + has_cached = true; + goto out_dput_old; + } else { + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) + host_err = commit_metadata(ffhp); + } } out_dput_new: dput(ndentry); @@ -1676,12 +1703,26 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, * as that would do the wrong thing if the two directories * were the same, so again we do it by hand. */ - fill_post_wcc(ffhp); - fill_post_wcc(tfhp); + if (!has_cached) { + fill_post_wcc(ffhp); + fill_post_wcc(tfhp); + } unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = false; fh_drop_write(ffhp); + /* + * If the target dentry has cached open files, then we need to try to + * close them prior to doing the rename. Flushing delayed fput + * shouldn't be done with locks held however, so we delay it until this + * point and then reattempt the whole shebang. + */ + if (has_cached) { + has_cached = false; + nfsd_close_cached_files(ndentry); + dput(ndentry); + goto retry; + } out: return err; } @@ -1728,10 +1769,13 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; - if (type != S_IFDIR) + if (type != S_IFDIR) { + nfsd_close_cached_files(rdentry); host_err = vfs_unlink(dirp, rdentry, NULL); - else + } else { host_err = vfs_rmdir(dirp, rdentry); + } + if (!host_err) host_err = commit_metadata(fhp); dput(rdentry); From b96811cd02466c6bb65c17639a96f2a766b7db9c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 Aug 2019 14:18:58 -0400 Subject: [PATCH 0164/2880] nfsd: Fix up some unused variable warnings Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7b03bcca0dfe..e08d890a1915 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1419,7 +1419,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, struct nfsd4_create_session *sess) { DECODE_HEAD; - u32 dummy; READ_BUF(16); COPYMEM(&sess->clientid, 8); @@ -1428,7 +1427,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Fore channel attrs */ READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + p++; /* headerpadsz is always 0 */ sess->fore_channel.maxreq_sz = be32_to_cpup(p++); sess->fore_channel.maxresp_sz = be32_to_cpup(p++); sess->fore_channel.maxresp_cached = be32_to_cpup(p++); @@ -1445,7 +1444,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Back channel attrs */ READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + p++; /* headerpadsz is always 0 */ sess->back_channel.maxreq_sz = be32_to_cpup(p++); sess->back_channel.maxresp_sz = be32_to_cpup(p++); sess->back_channel.maxresp_cached = be32_to_cpup(p++); @@ -1737,7 +1736,6 @@ static __be32 nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) { DECODE_HEAD; - unsigned int tmp; status = nfsd4_decode_stateid(argp, ©->cp_src_stateid); if (status) @@ -1752,7 +1750,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) p = xdr_decode_hyper(p, ©->cp_count); p++; /* ca_consecutive: we always do consecutive copies */ copy->cp_synchronous = be32_to_cpup(p++); - tmp = be32_to_cpup(p); /* Source server list not supported */ + /* tmp = be32_to_cpup(p); Source server list not supported */ DECODE_TAIL; } @@ -3218,9 +3216,8 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ if (!p) return nfserr_resource; encode_cinfo(p, &create->cr_cinfo); - nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0], + return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], create->cr_bmval[1], create->cr_bmval[2]); - return 0; } static __be32 From ed9927533a64ae55fa5adc91bf7383ee12f5710d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 Aug 2019 14:18:59 -0400 Subject: [PATCH 0165/2880] nfsd: Fix the documentation for svcxdr_tmpalloc() Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e08d890a1915..565d2169902c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -212,10 +212,10 @@ static int zero_clientid(clientid_t *clid) /** * svcxdr_tmpalloc - allocate memory to be freed after compound processing * @argp: NFSv4 compound argument structure - * @p: pointer to be freed (with kfree()) + * @len: length of buffer to allocate * - * Marks @p to be freed when processing the compound operation - * described in @argp finishes. + * Allocates a buffer of size @len to be freed when processing the compound + * operation described in @argp finishes. */ static void * svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) From 0ac33e4e9b5e4ccdd43c9d4e77882a1af0e41dec Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 16 Aug 2019 18:06:04 +0200 Subject: [PATCH 0166/2880] selftests: use "$(MAKE)" instead of "make" When doing "make kselftest TARGETS=bpf -j12", bpf progs end up being compiled sequentially and thus slowly. The reason is that parent make (tools/testing/selftests/Makefile) does not share its jobserver with child make (tools/testing/selftests/bpf/Makefile), therefore the latter runs with -j1. Change all instances of "make" to "$(MAKE)", so that the whole make hierarchy runs using a single jobserver. Signed-off-by: Ilya Leoshkevich Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 25b43a8c2b15..c3feccb99ff5 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -126,9 +126,9 @@ endif # in the default INSTALL_HDR_PATH usr/include. khdr: ifeq (1,$(DEFAULT_INSTALL_HDR_PATH)) - make --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install + $(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install else - make --no-builtin-rules INSTALL_HDR_PATH=$$BUILD/usr \ + $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$BUILD/usr \ ARCH=$(ARCH) -C $(top_srcdir) headers_install endif @@ -136,35 +136,35 @@ all: khdr @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ mkdir $$BUILD_TARGET -p; \ - make OUTPUT=$$BUILD_TARGET -C $$TARGET;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET;\ done; run_tests: all @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ - make OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\ done; hotplug: @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ - make OUTPUT=$$BUILD_TARGET -C $$TARGET;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET;\ done; run_hotplug: hotplug @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ - make OUTPUT=$$BUILD_TARGET -C $$TARGET run_full_test;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_full_test;\ done; clean_hotplug: @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ - make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; run_pstore_crash: - make -C pstore run_crash + $(MAKE) -C pstore run_crash # Use $BUILD as the default install root. $BUILD points to the # right output location for the following cases: @@ -184,7 +184,7 @@ ifdef INSTALL_PATH install -m 744 kselftest/prefix.pl $(INSTALL_PATH)/kselftest/ @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ - make OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \ done; @# Ask all targets to emit their test scripts @@ -203,7 +203,7 @@ ifdef INSTALL_PATH echo "[ -w /dev/kmsg ] && echo \"kselftest: Running tests in $$TARGET\" >> /dev/kmsg" >> $(ALL_SCRIPT); \ echo "cd $$TARGET" >> $(ALL_SCRIPT); \ echo -n "run_many" >> $(ALL_SCRIPT); \ - make -s --no-print-directory OUTPUT=$$BUILD_TARGET -C $$TARGET emit_tests >> $(ALL_SCRIPT); \ + $(MAKE) -s --no-print-directory OUTPUT=$$BUILD_TARGET -C $$TARGET emit_tests >> $(ALL_SCRIPT); \ echo "" >> $(ALL_SCRIPT); \ echo "cd \$$ROOT" >> $(ALL_SCRIPT); \ done; @@ -216,7 +216,7 @@ endif clean: @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ - make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; .PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean From e6b1db98cf4d54d9ea59cfcc195f70dc946fdd38 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:37 -0700 Subject: [PATCH 0167/2880] security: Support early LSMs The lockdown module is intended to allow for kernels to be locked down early in boot - sufficiently early that we don't have the ability to kmalloc() yet. Add support for early initialisation of some LSMs, and then add them to the list of names when we do full initialisation later. Early LSMs are initialised in link order and cannot be overridden via boot parameters, and cannot make use of kmalloc() (since the allocator isn't initialised yet). (Fixed by Stephen Rothwell to include a stub to fix builds when !CONFIG_SECURITY) Signed-off-by: Matthew Garrett Acked-by: Kees Cook Acked-by: Casey Schaufler Cc: Stephen Rothwell Signed-off-by: James Morris --- include/asm-generic/vmlinux.lds.h | 8 ++++- include/linux/lsm_hooks.h | 6 ++++ include/linux/security.h | 6 ++++ init/main.c | 1 + security/security.c | 50 ++++++++++++++++++++++++++----- 5 files changed, 62 insertions(+), 9 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 088987e9a3ea..c1807d14daa3 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -208,8 +208,13 @@ __start_lsm_info = .; \ KEEP(*(.lsm_info.init)) \ __end_lsm_info = .; +#define EARLY_LSM_TABLE() . = ALIGN(8); \ + __start_early_lsm_info = .; \ + KEEP(*(.early_lsm_info.init)) \ + __end_early_lsm_info = .; #else #define LSM_TABLE() +#define EARLY_LSM_TABLE() #endif #define ___OF_TABLE(cfg, name) _OF_TABLE_##cfg(name) @@ -609,7 +614,8 @@ ACPI_PROBE_TABLE(irqchip) \ ACPI_PROBE_TABLE(timer) \ EARLYCON_TABLE() \ - LSM_TABLE() + LSM_TABLE() \ + EARLY_LSM_TABLE() #define INIT_TEXT \ *(.init.text .init.text.*) \ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 47f58cfb6a19..b02e8bb6654d 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -2104,12 +2104,18 @@ struct lsm_info { }; extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; +extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; #define DEFINE_LSM(lsm) \ static struct lsm_info __lsm_##lsm \ __used __section(.lsm_info.init) \ __aligned(sizeof(unsigned long)) +#define DEFINE_EARLY_LSM(lsm) \ + static struct lsm_info __early_lsm_##lsm \ + __used __section(.early_lsm_info.init) \ + __aligned(sizeof(unsigned long)) + #ifdef CONFIG_SECURITY_SELINUX_DISABLE /* * Assuring the safety of deleting a security module is up to diff --git a/include/linux/security.h b/include/linux/security.h index 659071c2e57c..c5dd90981c98 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -195,6 +195,7 @@ int unregister_lsm_notifier(struct notifier_block *nb); /* prototypes */ extern int security_init(void); +extern int early_security_init(void); /* Security operations */ int security_binder_set_context_mgr(struct task_struct *mgr); @@ -423,6 +424,11 @@ static inline int security_init(void) return 0; } +static inline int early_security_init(void) +{ + return 0; +} + static inline int security_binder_set_context_mgr(struct task_struct *mgr) { return 0; diff --git a/init/main.c b/init/main.c index 66a196c5e4c3..598effd29a0a 100644 --- a/init/main.c +++ b/init/main.c @@ -569,6 +569,7 @@ asmlinkage __visible void __init start_kernel(void) boot_cpu_init(); page_address_init(); pr_notice("%s", linux_banner); + early_security_init(); setup_arch(&command_line); mm_init_cpumask(&init_mm); setup_command_line(command_line); diff --git a/security/security.c b/security/security.c index f493db0bf62a..ef4a0111c8b4 100644 --- a/security/security.c +++ b/security/security.c @@ -33,6 +33,7 @@ /* How many LSMs were built into the kernel? */ #define LSM_COUNT (__end_lsm_info - __start_lsm_info) +#define EARLY_LSM_COUNT (__end_early_lsm_info - __start_early_lsm_info) struct security_hook_heads security_hook_heads __lsm_ro_after_init; static ATOMIC_NOTIFIER_HEAD(lsm_notifier_chain); @@ -277,6 +278,8 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) static void __init lsm_early_cred(struct cred *cred); static void __init lsm_early_task(struct task_struct *task); +static int lsm_append(const char *new, char **result); + static void __init ordered_lsm_init(void) { struct lsm_info **lsm; @@ -323,6 +326,26 @@ static void __init ordered_lsm_init(void) kfree(ordered_lsms); } +int __init early_security_init(void) +{ + int i; + struct hlist_head *list = (struct hlist_head *) &security_hook_heads; + struct lsm_info *lsm; + + for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head); + i++) + INIT_HLIST_HEAD(&list[i]); + + for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { + if (!lsm->enabled) + lsm->enabled = &lsm_enabled_true; + prepare_lsm(lsm); + initialize_lsm(lsm); + } + + return 0; +} + /** * security_init - initializes the security framework * @@ -330,14 +353,18 @@ static void __init ordered_lsm_init(void) */ int __init security_init(void) { - int i; - struct hlist_head *list = (struct hlist_head *) &security_hook_heads; + struct lsm_info *lsm; pr_info("Security Framework initializing\n"); - for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head); - i++) - INIT_HLIST_HEAD(&list[i]); + /* + * Append the names of the early LSM modules now that kmalloc() is + * available + */ + for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { + if (lsm->enabled) + lsm_append(lsm->name, &lsm_names); + } /* Load LSMs in specified order. */ ordered_lsm_init(); @@ -384,7 +411,7 @@ static bool match_last_lsm(const char *list, const char *lsm) return !strcmp(last, lsm); } -static int lsm_append(char *new, char **result) +static int lsm_append(const char *new, char **result) { char *cp; @@ -422,8 +449,15 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count, hooks[i].lsm = lsm; hlist_add_tail_rcu(&hooks[i].list, hooks[i].head); } - if (lsm_append(lsm, &lsm_names) < 0) - panic("%s - Cannot get early memory.\n", __func__); + + /* + * Don't try to append during early_security_init(), we'll come back + * and fix this up afterwards. + */ + if (slab_is_available()) { + if (lsm_append(lsm, &lsm_names) < 0) + panic("%s - Cannot get early memory.\n", __func__); + } } int call_lsm_notifier(enum lsm_event event, void *data) From 9e47d31d6a57b5babaca36d42b0d11b6db6019b7 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:38 -0700 Subject: [PATCH 0168/2880] security: Add a "locked down" LSM hook Add a mechanism to allow LSMs to make a policy decision around whether kernel functionality that would allow tampering with or examining the runtime state of the kernel should be permitted. Signed-off-by: Matthew Garrett Acked-by: Kees Cook Acked-by: Casey Schaufler Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 7 +++++++ include/linux/security.h | 32 ++++++++++++++++++++++++++++++++ security/security.c | 6 ++++++ 3 files changed, 45 insertions(+) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index b02e8bb6654d..2f4ba9062fb8 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1446,6 +1446,11 @@ * @bpf_prog_free_security: * Clean up the security information stored inside bpf prog. * + * @locked_down + * Determine whether a kernel feature that potentially enables arbitrary + * code execution in kernel space should be permitted. + * + * @what: kernel feature being accessed */ union security_list_options { int (*binder_set_context_mgr)(struct task_struct *mgr); @@ -1807,6 +1812,7 @@ union security_list_options { int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux); void (*bpf_prog_free_security)(struct bpf_prog_aux *aux); #endif /* CONFIG_BPF_SYSCALL */ + int (*locked_down)(enum lockdown_reason what); }; struct security_hook_heads { @@ -2046,6 +2052,7 @@ struct security_hook_heads { struct hlist_head bpf_prog_alloc_security; struct hlist_head bpf_prog_free_security; #endif /* CONFIG_BPF_SYSCALL */ + struct hlist_head locked_down; } __randomize_layout; /* diff --git a/include/linux/security.h b/include/linux/security.h index c5dd90981c98..04cf48fab15d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -77,6 +77,33 @@ enum lsm_event { LSM_POLICY_CHANGE, }; +/* + * These are reasons that can be passed to the security_locked_down() + * LSM hook. Lockdown reasons that protect kernel integrity (ie, the + * ability for userland to modify kernel code) are placed before + * LOCKDOWN_INTEGRITY_MAX. Lockdown reasons that protect kernel + * confidentiality (ie, the ability for userland to extract + * information from the running kernel that would otherwise be + * restricted) are placed before LOCKDOWN_CONFIDENTIALITY_MAX. + * + * LSM authors should note that the semantics of any given lockdown + * reason are not guaranteed to be stable - the same reason may block + * one set of features in one kernel release, and a slightly different + * set of features in a later kernel release. LSMs that seek to expose + * lockdown policy at any level of granularity other than "none", + * "integrity" or "confidentiality" are responsible for either + * ensuring that they expose a consistent level of functionality to + * userland, or ensuring that userland is aware that this is + * potentially a moving target. It is easy to misuse this information + * in a way that could break userspace. Please be careful not to do + * so. + */ +enum lockdown_reason { + LOCKDOWN_NONE, + LOCKDOWN_INTEGRITY_MAX, + LOCKDOWN_CONFIDENTIALITY_MAX, +}; + /* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts); @@ -393,6 +420,7 @@ void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); +int security_locked_down(enum lockdown_reason what); #else /* CONFIG_SECURITY */ static inline int call_lsm_notifier(enum lsm_event event, void *data) @@ -1210,6 +1238,10 @@ static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32 { return -EOPNOTSUPP; } +static inline int security_locked_down(enum lockdown_reason what) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #ifdef CONFIG_SECURITY_NETWORK diff --git a/security/security.c b/security/security.c index ef4a0111c8b4..7fc373486d7a 100644 --- a/security/security.c +++ b/security/security.c @@ -2389,3 +2389,9 @@ void security_bpf_prog_free(struct bpf_prog_aux *aux) call_void_hook(bpf_prog_free_security, aux); } #endif /* CONFIG_BPF_SYSCALL */ + +int security_locked_down(enum lockdown_reason what) +{ + return call_int_hook(locked_down, 0, what); +} +EXPORT_SYMBOL(security_locked_down); From 000d388ed3bbed745f366ce71b2bb7c2ee70f449 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:39 -0700 Subject: [PATCH 0169/2880] security: Add a static lockdown policy LSM While existing LSMs can be extended to handle lockdown policy, distributions generally want to be able to apply a straightforward static policy. This patch adds a simple LSM that can be configured to reject either integrity or all lockdown queries, and can be configured at runtime (through securityfs), boot time (via a kernel parameter) or build time (via a kconfig option). Based on initial code by David Howells. Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: David Howells Signed-off-by: James Morris --- .../admin-guide/kernel-parameters.txt | 9 + include/linux/security.h | 3 + security/Kconfig | 11 +- security/Makefile | 2 + security/lockdown/Kconfig | 46 +++++ security/lockdown/Makefile | 1 + security/lockdown/lockdown.c | 169 ++++++++++++++++++ 7 files changed, 236 insertions(+), 5 deletions(-) create mode 100644 security/lockdown/Kconfig create mode 100644 security/lockdown/Makefile create mode 100644 security/lockdown/lockdown.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..0f28350f1ee6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2244,6 +2244,15 @@ lockd.nlm_udpport=M [NFS] Assign UDP port. Format: + lockdown= [SECURITY] + { integrity | confidentiality } + Enable the kernel lockdown feature. If set to + integrity, kernel features that allow userland to + modify the running kernel are disabled. If set to + confidentiality, kernel features that allow userland + to extract confidential information from the kernel + are also disabled. + locktorture.nreaders_stress= [KNL] Set the number of locking read-acquisition kthreads. Defaults to being automatically set based on the diff --git a/include/linux/security.h b/include/linux/security.h index 04cf48fab15d..74787335d9ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -97,6 +97,9 @@ enum lsm_event { * potentially a moving target. It is easy to misuse this information * in a way that could break userspace. Please be careful not to do * so. + * + * If you add to this, remember to extend lockdown_reasons in + * security/lockdown/lockdown.c. */ enum lockdown_reason { LOCKDOWN_NONE, diff --git a/security/Kconfig b/security/Kconfig index 466cc1f8ffed..7c62d446e209 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -237,6 +237,7 @@ source "security/apparmor/Kconfig" source "security/loadpin/Kconfig" source "security/yama/Kconfig" source "security/safesetid/Kconfig" +source "security/lockdown/Kconfig" source "security/integrity/Kconfig" @@ -276,11 +277,11 @@ endchoice config LSM string "Ordered list of enabled LSMs" - default "yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor" if DEFAULT_SECURITY_SMACK - default "yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo" if DEFAULT_SECURITY_APPARMOR - default "yama,loadpin,safesetid,integrity,tomoyo" if DEFAULT_SECURITY_TOMOYO - default "yama,loadpin,safesetid,integrity" if DEFAULT_SECURITY_DAC - default "yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" + default "lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor" if DEFAULT_SECURITY_SMACK + default "lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo" if DEFAULT_SECURITY_APPARMOR + default "lockdown,yama,loadpin,safesetid,integrity,tomoyo" if DEFAULT_SECURITY_TOMOYO + default "lockdown,yama,loadpin,safesetid,integrity" if DEFAULT_SECURITY_DAC + default "lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" help A comma-separated list of LSMs, in initialization order. Any LSMs left off this list will be ignored. This can be diff --git a/security/Makefile b/security/Makefile index c598b904938f..be1dd9d2cb2f 100644 --- a/security/Makefile +++ b/security/Makefile @@ -11,6 +11,7 @@ subdir-$(CONFIG_SECURITY_APPARMOR) += apparmor subdir-$(CONFIG_SECURITY_YAMA) += yama subdir-$(CONFIG_SECURITY_LOADPIN) += loadpin subdir-$(CONFIG_SECURITY_SAFESETID) += safesetid +subdir-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown # always enable default capabilities obj-y += commoncap.o @@ -27,6 +28,7 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor/ obj-$(CONFIG_SECURITY_YAMA) += yama/ obj-$(CONFIG_SECURITY_LOADPIN) += loadpin/ obj-$(CONFIG_SECURITY_SAFESETID) += safesetid/ +obj-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown/ obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o # Object integrity file lists diff --git a/security/lockdown/Kconfig b/security/lockdown/Kconfig new file mode 100644 index 000000000000..7a1d213227a4 --- /dev/null +++ b/security/lockdown/Kconfig @@ -0,0 +1,46 @@ +config SECURITY_LOCKDOWN_LSM + bool "Basic module for enforcing kernel lockdown" + depends on SECURITY + help + Build support for an LSM that enforces a coarse kernel lockdown + behaviour. + +config SECURITY_LOCKDOWN_LSM_EARLY + bool "Enable lockdown LSM early in init" + depends on SECURITY_LOCKDOWN_LSM + help + Enable the lockdown LSM early in boot. This is necessary in order + to ensure that lockdown enforcement can be carried out on kernel + boot parameters that are otherwise parsed before the security + subsystem is fully initialised. If enabled, lockdown will + unconditionally be called before any other LSMs. + +choice + prompt "Kernel default lockdown mode" + default LOCK_DOWN_KERNEL_FORCE_NONE + depends on SECURITY_LOCKDOWN_LSM + help + The kernel can be configured to default to differing levels of + lockdown. + +config LOCK_DOWN_KERNEL_FORCE_NONE + bool "None" + help + No lockdown functionality is enabled by default. Lockdown may be + enabled via the kernel commandline or /sys/kernel/security/lockdown. + +config LOCK_DOWN_KERNEL_FORCE_INTEGRITY + bool "Integrity" + help + The kernel runs in integrity mode by default. Features that allow + the kernel to be modified at runtime are disabled. + +config LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY + bool "Confidentiality" + help + The kernel runs in confidentiality mode by default. Features that + allow the kernel to be modified at runtime or that permit userland + code to read confidential material held inside the kernel are + disabled. + +endchoice diff --git a/security/lockdown/Makefile b/security/lockdown/Makefile new file mode 100644 index 000000000000..e3634b9017e7 --- /dev/null +++ b/security/lockdown/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown.o diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c new file mode 100644 index 000000000000..7172ad75496b --- /dev/null +++ b/security/lockdown/lockdown.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Lock down the kernel + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include + +static enum lockdown_reason kernel_locked_down; + +static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { + [LOCKDOWN_NONE] = "none", + [LOCKDOWN_INTEGRITY_MAX] = "integrity", + [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", +}; + +static enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, + LOCKDOWN_INTEGRITY_MAX, + LOCKDOWN_CONFIDENTIALITY_MAX}; + +/* + * Put the kernel into lock-down mode. + */ +static int lock_kernel_down(const char *where, enum lockdown_reason level) +{ + if (kernel_locked_down >= level) + return -EPERM; + + kernel_locked_down = level; + pr_notice("Kernel is locked down from %s; see man kernel_lockdown.7\n", + where); + return 0; +} + +static int __init lockdown_param(char *level) +{ + if (!level) + return -EINVAL; + + if (strcmp(level, "integrity") == 0) + lock_kernel_down("command line", LOCKDOWN_INTEGRITY_MAX); + else if (strcmp(level, "confidentiality") == 0) + lock_kernel_down("command line", LOCKDOWN_CONFIDENTIALITY_MAX); + else + return -EINVAL; + + return 0; +} + +early_param("lockdown", lockdown_param); + +/** + * lockdown_is_locked_down - Find out if the kernel is locked down + * @what: Tag to use in notice generated if lockdown is in effect + */ +static int lockdown_is_locked_down(enum lockdown_reason what) +{ + if (kernel_locked_down >= what) { + if (lockdown_reasons[what]) + pr_notice("Lockdown: %s is restricted; see man kernel_lockdown.7\n", + lockdown_reasons[what]); + return -EPERM; + } + + return 0; +} + +static struct security_hook_list lockdown_hooks[] __lsm_ro_after_init = { + LSM_HOOK_INIT(locked_down, lockdown_is_locked_down), +}; + +static int __init lockdown_lsm_init(void) +{ +#if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY) + lock_kernel_down("Kernel configuration", LOCKDOWN_INTEGRITY_MAX); +#elif defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY) + lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX); +#endif + security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks), + "lockdown"); + return 0; +} + +static ssize_t lockdown_read(struct file *filp, char __user *buf, size_t count, + loff_t *ppos) +{ + char temp[80]; + int i, offset = 0; + + for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { + enum lockdown_reason level = lockdown_levels[i]; + + if (lockdown_reasons[level]) { + const char *label = lockdown_reasons[level]; + + if (kernel_locked_down == level) + offset += sprintf(temp+offset, "[%s] ", label); + else + offset += sprintf(temp+offset, "%s ", label); + } + } + + /* Convert the last space to a newline if needed. */ + if (offset > 0) + temp[offset-1] = '\n'; + + return simple_read_from_buffer(buf, count, ppos, temp, strlen(temp)); +} + +static ssize_t lockdown_write(struct file *file, const char __user *buf, + size_t n, loff_t *ppos) +{ + char *state; + int i, len, err = -EINVAL; + + state = memdup_user_nul(buf, n); + if (IS_ERR(state)) + return PTR_ERR(state); + + len = strlen(state); + if (len && state[len-1] == '\n') { + state[len-1] = '\0'; + len--; + } + + for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { + enum lockdown_reason level = lockdown_levels[i]; + const char *label = lockdown_reasons[level]; + + if (label && !strcmp(state, label)) + err = lock_kernel_down("securityfs", level); + } + + kfree(state); + return err ? err : n; +} + +static const struct file_operations lockdown_ops = { + .read = lockdown_read, + .write = lockdown_write, +}; + +static int __init lockdown_secfs_init(void) +{ + struct dentry *dentry; + + dentry = securityfs_create_file("lockdown", 0600, NULL, NULL, + &lockdown_ops); + return PTR_ERR_OR_ZERO(dentry); +} + +core_initcall(lockdown_secfs_init); + +#ifdef CONFIG_SECURITY_LOCKDOWN_LSM_EARLY +DEFINE_EARLY_LSM(lockdown) = { +#else +DEFINE_LSM(lockdown) = { +#endif + .name = "lockdown", + .init = lockdown_lsm_init, +}; From 49fcf732bdae0550721ef73af7c45109ce26b2a9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:40 -0700 Subject: [PATCH 0170/2880] lockdown: Enforce module signatures if the kernel is locked down If the kernel is locked down, require that all modules have valid signatures that we can verify. I have adjusted the errors generated: (1) If there's no signature (ENODATA) or we can't check it (ENOPKG, ENOKEY), then: (a) If signatures are enforced then EKEYREJECTED is returned. (b) If there's no signature or we can't check it, but the kernel is locked down then EPERM is returned (this is then consistent with other lockdown cases). (2) If the signature is unparseable (EBADMSG, EINVAL), the signature fails the check (EKEYREJECTED) or a system error occurs (eg. ENOMEM), we return the error we got. Note that the X.509 code doesn't check for key expiry as the RTC might not be valid or might not have been transferred to the kernel's clock yet. [Modified by Matthew Garrett to remove the IMA integration. This will be replaced with integration with the IMA architecture policy patchset.] Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: Jessica Yu Signed-off-by: James Morris --- include/linux/security.h | 1 + init/Kconfig | 5 +++++ kernel/module.c | 39 ++++++++++++++++++++++++++++-------- security/lockdown/Kconfig | 1 + security/lockdown/lockdown.c | 1 + 5 files changed, 39 insertions(+), 8 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 74787335d9ce..9e8abb60a99f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -103,6 +103,7 @@ enum lsm_event { */ enum lockdown_reason { LOCKDOWN_NONE, + LOCKDOWN_MODULE_SIGNATURE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/init/Kconfig b/init/Kconfig index 0e2344389501..e6069368f278 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1939,6 +1939,11 @@ config MODULE_SIG kernel build dependency so that the signing tool can use its crypto library. + You should enable this option if you wish to use either + CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via + another LSM - otherwise unsigned modules will be loadable regardless + of the lockdown policy. + !!!WARNING!!! If you enable this option, you MUST make sure that the module DOES NOT get stripped after being signed. This includes the debuginfo strip done by some packagers (such as rpmbuild) and diff --git a/kernel/module.c b/kernel/module.c index 80c7c09584cf..2206c08a5e10 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2753,8 +2753,9 @@ static inline void kmemleak_load_module(const struct module *mod, #ifdef CONFIG_MODULE_SIG static int module_sig_check(struct load_info *info, int flags) { - int err = -ENOKEY; + int err = -ENODATA; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const char *reason; const void *mod = info->hdr; /* @@ -2769,16 +2770,38 @@ static int module_sig_check(struct load_info *info, int flags) err = mod_verify_sig(mod, info); } - if (!err) { + switch (err) { + case 0: info->sig_ok = true; return 0; + + /* We don't permit modules to be loaded into trusted kernels + * without a valid signature on them, but if we're not + * enforcing, certain errors are non-fatal. + */ + case -ENODATA: + reason = "Loading of unsigned module"; + goto decide; + case -ENOPKG: + reason = "Loading of module with unsupported crypto"; + goto decide; + case -ENOKEY: + reason = "Loading of module with unavailable key"; + decide: + if (is_module_sig_enforced()) { + pr_notice("%s is rejected\n", reason); + return -EKEYREJECTED; + } + + return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); + + /* All other errors are fatal, including nomem, unparseable + * signatures and signature check failures - even if signatures + * aren't required. + */ + default: + return err; } - - /* Not having a signature is only an error if we're strict. */ - if (err == -ENOKEY && !is_module_sig_enforced()) - err = 0; - - return err; } #else /* !CONFIG_MODULE_SIG */ static int module_sig_check(struct load_info *info, int flags) diff --git a/security/lockdown/Kconfig b/security/lockdown/Kconfig index 7a1d213227a4..e84ddf484010 100644 --- a/security/lockdown/Kconfig +++ b/security/lockdown/Kconfig @@ -1,6 +1,7 @@ config SECURITY_LOCKDOWN_LSM bool "Basic module for enforcing kernel lockdown" depends on SECURITY + select MODULE_SIG if MODULES help Build support for an LSM that enforces a coarse kernel lockdown behaviour. diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 7172ad75496b..d8e42125a5dd 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -18,6 +18,7 @@ static enum lockdown_reason kernel_locked_down; static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_NONE] = "none", + [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 9b9d8dda1ed72e9bd560ab0ca93d322a9440510e Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:41 -0700 Subject: [PATCH 0171/2880] lockdown: Restrict /dev/{mem,kmem,port} when the kernel is locked down Allowing users to read and write to core kernel memory makes it possible for the kernel to be subverted, avoiding module loading restrictions, and also to steal cryptographic information. Disallow /dev/mem and /dev/kmem from being opened this when the kernel has been locked down to prevent this. Also disallow /dev/port from being opened to prevent raw ioport access and thus DMA from being used to accomplish the same thing. Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: x86@kernel.org Signed-off-by: James Morris --- drivers/char/mem.c | 7 +++++-- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index b08dc50f9f26..d0148aee1aab 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -29,8 +29,8 @@ #include #include #include - #include +#include #ifdef CONFIG_IA64 # include @@ -786,7 +786,10 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) static int open_port(struct inode *inode, struct file *filp) { - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + return security_locked_down(LOCKDOWN_DEV_MEM); } #define zero_lseek null_lseek diff --git a/include/linux/security.h b/include/linux/security.h index 9e8abb60a99f..e5dd446ef35b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -104,6 +104,7 @@ enum lsm_event { enum lockdown_reason { LOCKDOWN_NONE, LOCKDOWN_MODULE_SIGNATURE, + LOCKDOWN_DEV_MEM, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index d8e42125a5dd..240ecaa10a1d 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -19,6 +19,7 @@ static enum lockdown_reason kernel_locked_down; static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", + [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 7d31f4602f8d366072471ca138e4ea7b8edf9be0 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:42 -0700 Subject: [PATCH 0172/2880] kexec_load: Disable at runtime if the kernel is locked down The kexec_load() syscall permits the loading and execution of arbitrary code in ring 0, which is something that lock-down is meant to prevent. It makes sense to disable kexec_load() in this situation. This does not affect kexec_file_load() syscall which can check for a signature on the image to be booted. Signed-off-by: David Howells Signed-off-by: Matthew Garrett Acked-by: Dave Young Reviewed-by: Kees Cook cc: kexec@lists.infradead.org Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/kexec.c | 8 ++++++++ security/lockdown/lockdown.c | 1 + 3 files changed, 10 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index e5dd446ef35b..b607a8ac97fe 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -105,6 +105,7 @@ enum lockdown_reason { LOCKDOWN_NONE, LOCKDOWN_MODULE_SIGNATURE, LOCKDOWN_DEV_MEM, + LOCKDOWN_KEXEC, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/kexec.c b/kernel/kexec.c index 1b018f1a6e0d..bc933c0db9bf 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -205,6 +205,14 @@ static inline int kexec_load_check(unsigned long nr_segments, if (result < 0) return result; + /* + * kexec can be used to circumvent module loading restrictions, so + * prevent loading in that case + */ + result = security_locked_down(LOCKDOWN_KEXEC); + if (result) + return result; + /* * Verify we have a legal set of flags * This leaves us room for future extensions. diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 240ecaa10a1d..aaf30ad351f9 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -20,6 +20,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", + [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From fef5dad9876034253d59acbf8c0c314f4d94cf87 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Mon, 19 Aug 2019 17:17:43 -0700 Subject: [PATCH 0173/2880] lockdown: Copy secure_boot flag in boot params across kexec reboot Kexec reboot in case secure boot being enabled does not keep the secure boot mode in new kernel, so later one can load unsigned kernel via legacy kexec_load. In this state, the system is missing the protections provided by secure boot. Adding a patch to fix this by retain the secure_boot flag in original kernel. secure_boot flag in boot_params is set in EFI stub, but kexec bypasses the stub. Fixing this issue by copying secure_boot flag across kexec reboot. Signed-off-by: Dave Young Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: kexec@lists.infradead.org Signed-off-by: James Morris --- arch/x86/kernel/kexec-bzimage64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f03237e3f192..5d64a22f99ce 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -180,6 +180,7 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, if (efi_enabled(EFI_OLD_MEMMAP)) return 0; + params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; ei->efi_systab_hi = current_ei->efi_systab_hi; From 99d5cadfde2b1acb7650021df5abaa5ec447dd10 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Mon, 19 Aug 2019 17:17:44 -0700 Subject: [PATCH 0174/2880] kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE This is a preparatory patch for kexec_file_load() lockdown. A locked down kernel needs to prevent unsigned kernel images from being loaded with kexec_file_load(). Currently, the only way to force the signature verification is compiling with KEXEC_VERIFY_SIG. This prevents loading usigned images even when the kernel is not locked down at runtime. This patch splits KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE. Analogous to the MODULE_SIG and MODULE_SIG_FORCE for modules, KEXEC_SIG turns on the signature verification but allows unsigned images to be loaded. KEXEC_SIG_FORCE disallows images without a valid signature. Signed-off-by: Jiri Bohac Signed-off-by: David Howells Signed-off-by: Matthew Garrett cc: kexec@lists.infradead.org Signed-off-by: James Morris --- arch/arm64/Kconfig | 6 +-- arch/s390/Kconfig | 2 +- arch/s390/configs/debug_defconfig | 2 +- arch/s390/configs/defconfig | 2 +- arch/s390/configs/performance_defconfig | 2 +- arch/s390/kernel/kexec_elf.c | 4 +- arch/s390/kernel/kexec_image.c | 4 +- arch/s390/kernel/machine_kexec_file.c | 4 +- arch/x86/Kconfig | 22 ++++++--- arch/x86/kernel/ima_arch.c | 4 +- crypto/asymmetric_keys/verify_pefile.c | 4 +- include/linux/kexec.h | 4 +- kernel/kexec_file.c | 60 +++++++++++++++++++++---- security/integrity/ima/Kconfig | 2 +- security/integrity/ima/ima_main.c | 2 +- 15 files changed, 89 insertions(+), 35 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 697ea0510729..f940500a941b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -961,7 +961,7 @@ config KEXEC_FILE for kernel and initramfs as opposed to list of segments as accepted by previous system call. -config KEXEC_VERIFY_SIG +config KEXEC_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE help @@ -976,13 +976,13 @@ config KEXEC_VERIFY_SIG config KEXEC_IMAGE_VERIFY_SIG bool "Enable Image signature verification support" default y - depends on KEXEC_VERIFY_SIG + depends on KEXEC_SIG depends on EFI && SIGNED_PE_FILE_VERIFICATION help Enable Image signature verification support. comment "Support for PE file signature verification disabled" - depends on KEXEC_VERIFY_SIG + depends on KEXEC_SIG depends on !EFI || !SIGNED_PE_FILE_VERIFICATION config CRASH_DUMP diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 109243fdb6ec..c4a423f30d49 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -555,7 +555,7 @@ config ARCH_HAS_KEXEC_PURGATORY def_bool y depends on KEXEC_FILE -config KEXEC_VERIFY_SIG +config KEXEC_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE && SYSTEM_DATA_VERIFICATION help diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index b0920b35f87b..525e0a6addb9 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -64,7 +64,7 @@ CONFIG_NUMA=y CONFIG_PREEMPT=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y +CONFIG_KEXEC_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index c59b922cb6c5..4c37279acdb4 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -39,7 +39,7 @@ CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y +CONFIG_KEXEC_SIG=y CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 09aa5cb14873..158ad0f0d433 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -65,7 +65,7 @@ CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y +CONFIG_KEXEC_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 6d0635ceddd0..9b4f37a4edf1 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -130,7 +130,7 @@ static int s390_elf_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_elf_ops = { .probe = s390_elf_probe, .load = s390_elf_load, -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC__SIG .verify_sig = s390_verify_sig, -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ }; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index 58318bf89fd9..af23eff5774d 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -59,7 +59,7 @@ static int s390_image_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_image_ops = { .probe = s390_image_probe, .load = s390_image_load, -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG .verify_sig = s390_verify_sig, -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ }; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index fbdd3ea73667..c0f33ba49a9a 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -22,7 +22,7 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { NULL, }; -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG /* * Module signature information block. * @@ -90,7 +90,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) VERIFYING_MODULE_SIGNATURE, NULL, NULL); } -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ static int kexec_file_update_purgatory(struct kimage *image, struct s390_load_data *data) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bbbd4d1ba31..cd41998aa6e6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2006,20 +2006,30 @@ config KEXEC_FILE config ARCH_HAS_KEXEC_PURGATORY def_bool KEXEC_FILE -config KEXEC_VERIFY_SIG +config KEXEC_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE + ---help--- + + This option makes the kexec_file_load() syscall check for a valid + signature of the kernel image. The image can still be loaded without + a valid signature unless you also enable KEXEC_SIG_FORCE, though if + there's a signature that we can check, then it must be valid. + + In addition to this option, you need to enable signature + verification for the corresponding kernel image type being + loaded in order for this to work. + +config KEXEC_SIG_FORCE + bool "Require a valid signature in kexec_file_load() syscall" + depends on KEXEC_SIG ---help--- This option makes kernel signature verification mandatory for the kexec_file_load() syscall. - In addition to that option, you need to enable signature - verification for the corresponding kernel image type being - loaded in order for this to work. - config KEXEC_BZIMAGE_VERIFY_SIG bool "Enable bzImage signature verification support" - depends on KEXEC_VERIFY_SIG + depends on KEXEC_SIG depends on SIGNED_PE_FILE_VERIFICATION select SYSTEM_TRUSTED_KEYRING ---help--- diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 64b973f0e985..b98890894731 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -66,9 +66,9 @@ bool arch_ima_get_secureboot(void) /* secureboot arch rules */ static const char * const sb_arch_rules[] = { -#if !IS_ENABLED(CONFIG_KEXEC_VERIFY_SIG) +#if !IS_ENABLED(CONFIG_KEXEC_SIG) "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ "measure func=KEXEC_KERNEL_CHECK", #if !IS_ENABLED(CONFIG_MODULE_SIG) "appraise func=MODULE_CHECK appraise_type=imasig", diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c index 3b303fe2f061..cc9dbcecaaca 100644 --- a/crypto/asymmetric_keys/verify_pefile.c +++ b/crypto/asymmetric_keys/verify_pefile.c @@ -96,7 +96,7 @@ static int pefile_parse_binary(const void *pebuf, unsigned int pelen, if (!ddir->certs.virtual_address || !ddir->certs.size) { pr_debug("Unsigned PE binary\n"); - return -EKEYREJECTED; + return -ENODATA; } chkaddr(ctx->header_size, ddir->certs.virtual_address, @@ -403,6 +403,8 @@ error_no_desc: * (*) 0 if at least one signature chain intersects with the keys in the trust * keyring, or: * + * (*) -ENODATA if there is no signature present. + * * (*) -ENOPKG if a suitable crypto module couldn't be found for a check on a * chain. * diff --git a/include/linux/kexec.h b/include/linux/kexec.h index b9b1bc5f9669..58b27c7bdc2b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -125,7 +125,7 @@ typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, unsigned long cmdline_len); typedef int (kexec_cleanup_t)(void *loader_data); -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG typedef int (kexec_verify_sig_t)(const char *kernel_buf, unsigned long kernel_len); #endif @@ -134,7 +134,7 @@ struct kexec_file_ops { kexec_probe_t *probe; kexec_load_t *load; kexec_cleanup_t *cleanup; -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG kexec_verify_sig_t *verify_sig; #endif }; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ef7b951a8087..972931201995 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -88,7 +88,7 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG static int kexec_image_verify_sig_default(struct kimage *image, void *buf, unsigned long buf_len) { @@ -177,6 +177,51 @@ void kimage_file_post_load_cleanup(struct kimage *image) image->image_loader_data = NULL; } +#ifdef CONFIG_KEXEC_SIG +static int +kimage_validate_signature(struct kimage *image) +{ + const char *reason; + int ret; + + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + switch (ret) { + case 0: + break; + + /* Certain verification errors are non-fatal if we're not + * checking errors, provided we aren't mandating that there + * must be a valid signature. + */ + case -ENODATA: + reason = "kexec of unsigned image"; + goto decide; + case -ENOPKG: + reason = "kexec of image with unsupported crypto"; + goto decide; + case -ENOKEY: + reason = "kexec of image with unavailable key"; + decide: + if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { + pr_notice("%s rejected\n", reason); + return ret; + } + + return 0; + + /* All other errors are fatal, including nomem, unparseable + * signatures and signature check failures - even if signatures + * aren't required. + */ + default: + pr_notice("kernel signature verification failed (%d).\n", ret); + } + + return ret; +} +#endif + /* * In file mode list of segments is prepared by kernel. Copy relevant * data from user space, do error checking, prepare segment list @@ -186,7 +231,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, const char __user *cmdline_ptr, unsigned long cmdline_len, unsigned flags) { - int ret = 0; + int ret; void *ldata; loff_t size; @@ -205,14 +250,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret) goto out; -#ifdef CONFIG_KEXEC_VERIFY_SIG - ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, - image->kernel_buf_len); - if (ret) { - pr_debug("kernel signature verification failed.\n"); +#ifdef CONFIG_KEXEC_SIG + ret = kimage_validate_signature(image); + + if (ret) goto out; - } - pr_debug("kernel signature verification successful.\n"); #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 2692c7358c2c..32cd25fa44a5 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -160,7 +160,7 @@ config IMA_APPRAISE config IMA_ARCH_POLICY bool "Enable loading an IMA architecture specific policy" - depends on KEXEC_VERIFY_SIG || IMA_APPRAISE && INTEGRITY_ASYMMETRIC_KEYS + depends on KEXEC_SIG || IMA_APPRAISE && INTEGRITY_ASYMMETRIC_KEYS default n help This option enables loading an IMA architecture specific policy diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index f556e6c18f9b..1cffda4412b7 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -541,7 +541,7 @@ int ima_load_data(enum kernel_load_data_id id) switch (id) { case LOADING_KEXEC_IMAGE: - if (IS_ENABLED(CONFIG_KEXEC_VERIFY_SIG) + if (IS_ENABLED(CONFIG_KEXEC_SIG) && arch_ima_get_secureboot()) { pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n"); return -EACCES; From 155bdd30af17e90941589b5db4dab9a29b28c112 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Mon, 19 Aug 2019 17:17:45 -0700 Subject: [PATCH 0175/2880] kexec_file: Restrict at runtime if the kernel is locked down When KEXEC_SIG is not enabled, kernel should not load images through kexec_file systemcall if the kernel is locked down. [Modified by David Howells to fit with modifications to the previous patch and to return -EPERM if the kernel is locked down for consistency with other lockdowns. Modified by Matthew Garrett to remove the IMA integration, which will be replaced by integrating with the IMA architecture policy patches.] Signed-off-by: Jiri Bohac Signed-off-by: David Howells Signed-off-by: Matthew Garrett cc: kexec@lists.infradead.org Signed-off-by: James Morris --- kernel/kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 972931201995..43109ef4d6bf 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -208,7 +208,7 @@ kimage_validate_signature(struct kimage *image) return ret; } - return 0; + return security_locked_down(LOCKDOWN_KEXEC); /* All other errors are fatal, including nomem, unparseable * signatures and signature check failures - even if signatures From 38bd94b8a1bd46e6d3d9718c7ff582e4c8ccb440 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Mon, 19 Aug 2019 17:17:46 -0700 Subject: [PATCH 0176/2880] hibernate: Disable when the kernel is locked down There is currently no way to verify the resume image when returning from hibernate. This might compromise the signed modules trust model, so until we can work with signed hibernate images we disable it when the kernel is locked down. Signed-off-by: Josh Boyer Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: rjw@rjwysocki.net Cc: pavel@ucw.cz cc: linux-pm@vger.kernel.org Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/power/hibernate.c | 3 ++- security/lockdown/lockdown.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/security.h b/include/linux/security.h index b607a8ac97fe..80ac7fb27aa9 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -106,6 +106,7 @@ enum lockdown_reason { LOCKDOWN_MODULE_SIGNATURE, LOCKDOWN_DEV_MEM, LOCKDOWN_KEXEC, + LOCKDOWN_HIBERNATION, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index cd7434e6000d..3c0a5a8170b0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "power.h" @@ -68,7 +69,7 @@ static const struct platform_hibernation_ops *hibernation_ops; bool hibernation_available(void) { - return (nohibernate == 0); + return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); } /** diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index aaf30ad351f9..3462f7edcaac 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -21,6 +21,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_KEXEC] = "kexec of unsigned images", + [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From eb627e17727ebeede70697ae1798688b0d328b54 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:47 -0700 Subject: [PATCH 0177/2880] PCI: Lock down BAR access when the kernel is locked down Any hardware that can potentially generate DMA has to be locked down in order to avoid it being possible for an attacker to modify kernel code, allowing them to circumvent disabled module loading or module signing. Default to paranoid - in future we can potentially relax this for sufficiently IOMMU-isolated devices. Signed-off-by: David Howells Signed-off-by: Matthew Garrett Acked-by: Bjorn Helgaas Reviewed-by: Kees Cook cc: linux-pci@vger.kernel.org Signed-off-by: James Morris --- drivers/pci/pci-sysfs.c | 16 ++++++++++++++++ drivers/pci/proc.c | 14 ++++++++++++-- drivers/pci/syscall.c | 4 +++- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 5 files changed, 33 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 6d27475e39b2..ec103a7e13fc 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -903,6 +903,11 @@ static ssize_t pci_write_config(struct file *filp, struct kobject *kobj, unsigned int size = count; loff_t init_off = off; u8 *data = (u8 *) buf; + int ret; + + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; if (off > dev->cfg_size) return 0; @@ -1164,6 +1169,11 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, int bar = (unsigned long)attr->private; enum pci_mmap_state mmap_type; struct resource *res = &pdev->resource[bar]; + int ret; + + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; if (res->flags & IORESOURCE_MEM && iomem_is_exclusive(res->start)) return -EINVAL; @@ -1240,6 +1250,12 @@ static ssize_t pci_write_resource_io(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { + int ret; + + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; + return pci_resource_io(filp, kobj, attr, buf, off, count, true); } diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 445b51db75b0..e29b0d5ced62 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "pci.h" @@ -115,7 +116,11 @@ static ssize_t proc_bus_pci_write(struct file *file, const char __user *buf, struct pci_dev *dev = PDE_DATA(ino); int pos = *ppos; int size = dev->cfg_size; - int cnt; + int cnt, ret; + + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; if (pos >= size) return 0; @@ -196,6 +201,10 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, #endif /* HAVE_PCI_MMAP */ int ret = 0; + ret = security_locked_down(LOCKDOWN_PCI_ACCESS); + if (ret) + return ret; + switch (cmd) { case PCIIOC_CONTROLLER: ret = pci_domain_nr(dev->bus); @@ -238,7 +247,8 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) struct pci_filp_private *fpriv = file->private_data; int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM; - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_PCI_ACCESS)) return -EPERM; if (fpriv->mmap_state == pci_mmap_io) { diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index d96626c614f5..31e39558d49d 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include "pci.h" @@ -90,7 +91,8 @@ SYSCALL_DEFINE5(pciconfig_write, unsigned long, bus, unsigned long, dfn, u32 dword; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) || + security_locked_down(LOCKDOWN_PCI_ACCESS)) return -EPERM; dev = pci_get_domain_bus_and_slot(0, bus, dfn); diff --git a/include/linux/security.h b/include/linux/security.h index 80ac7fb27aa9..2b763f0ee352 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -107,6 +107,7 @@ enum lockdown_reason { LOCKDOWN_DEV_MEM, LOCKDOWN_KEXEC, LOCKDOWN_HIBERNATION, + LOCKDOWN_PCI_ACCESS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 3462f7edcaac..410e90eda848 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -22,6 +22,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", + [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 96c4f67293e4cd8b3394adce5a8041a2784e68a3 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:48 -0700 Subject: [PATCH 0178/2880] x86: Lock down IO port access when the kernel is locked down IO port access would permit users to gain access to PCI configuration registers, which in turn (on a lot of hardware) give access to MMIO register space. This would potentially permit root to trigger arbitrary DMA, so lock it down by default. This also implicitly locks down the KDADDIO, KDDELIO, KDENABIO and KDDISABIO console ioctls. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Reviewed-by: Kees Cook cc: x86@kernel.org Signed-off-by: James Morris --- arch/x86/kernel/ioport.c | 7 +++++-- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 0fe1c8782208..61a89d3c0382 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -31,7 +32,8 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) + if (turn_on && (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT))) return -EPERM; /* @@ -126,7 +128,8 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) return -EINVAL; /* Trying to gain more privileges? */ if (level > old) { - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT)) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | diff --git a/include/linux/security.h b/include/linux/security.h index 2b763f0ee352..cd93fa5d3c6d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -108,6 +108,7 @@ enum lockdown_reason { LOCKDOWN_KEXEC, LOCKDOWN_HIBERNATION, LOCKDOWN_PCI_ACCESS, + LOCKDOWN_IOPORT, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 410e90eda848..8b7d65dbb086 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -23,6 +23,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", + [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 95f5e95f41dff31b2a4566c5a8975c08a49ae4e3 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:49 -0700 Subject: [PATCH 0179/2880] x86/msr: Restrict MSR access when the kernel is locked down Writing to MSRs should not be allowed if the kernel is locked down, since it could lead to execution of arbitrary code in kernel mode. Based on a patch by Kees Cook. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Acked-by: Kees Cook Reviewed-by: Thomas Gleixner cc: x86@kernel.org Signed-off-by: James Morris --- arch/x86/kernel/msr.c | 8 ++++++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 10 insertions(+) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3db2252b958d..1547be359d7f 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -79,6 +80,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf, int err = 0; ssize_t bytes = 0; + err = security_locked_down(LOCKDOWN_MSR); + if (err) + return err; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -130,6 +135,9 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = -EFAULT; break; } + err = security_locked_down(LOCKDOWN_MSR); + if (err) + break; err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; diff --git a/include/linux/security.h b/include/linux/security.h index cd93fa5d3c6d..010637a79eac 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -109,6 +109,7 @@ enum lockdown_reason { LOCKDOWN_HIBERNATION, LOCKDOWN_PCI_ACCESS, LOCKDOWN_IOPORT, + LOCKDOWN_MSR, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 8b7d65dbb086..b1c1c72440d5 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -24,6 +24,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", + [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From f474e1486b78ac15322f8a1cda48a32a1deff9d3 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:50 -0700 Subject: [PATCH 0180/2880] ACPI: Limit access to custom_method when the kernel is locked down custom_method effectively allows arbitrary access to system memory, making it possible for an attacker to circumvent restrictions on module loading. Disable it if the kernel is locked down. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Reviewed-by: Kees Cook cc: linux-acpi@vger.kernel.org Signed-off-by: James Morris --- drivers/acpi/custom_method.c | 6 ++++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 8 insertions(+) diff --git a/drivers/acpi/custom_method.c b/drivers/acpi/custom_method.c index b2ef4c2ec955..7031307becd7 100644 --- a/drivers/acpi/custom_method.c +++ b/drivers/acpi/custom_method.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "internal.h" @@ -29,6 +30,11 @@ static ssize_t cm_write(struct file *file, const char __user * user_buf, struct acpi_table_header table; acpi_status status; + int ret; + + ret = security_locked_down(LOCKDOWN_ACPI_TABLES); + if (ret) + return ret; if (!(*ppos)) { /* parse the table header to get the table length */ diff --git a/include/linux/security.h b/include/linux/security.h index 010637a79eac..390e39395112 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -110,6 +110,7 @@ enum lockdown_reason { LOCKDOWN_PCI_ACCESS, LOCKDOWN_IOPORT, LOCKDOWN_MSR, + LOCKDOWN_ACPI_TABLES, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index b1c1c72440d5..6d44db0ddffa 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -25,6 +25,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", + [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 41fa1ee9c6d687afb05760dd349f361855f1d7f5 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Mon, 19 Aug 2019 17:17:51 -0700 Subject: [PATCH 0181/2880] acpi: Ignore acpi_rsdp kernel param when the kernel has been locked down This option allows userspace to pass the RSDP address to the kernel, which makes it possible for a user to modify the workings of hardware. Reject the option when the kernel is locked down. This requires some reworking of the existing RSDP command line logic, since the early boot code also makes use of a command-line passed RSDP when locating the SRAT table before the lockdown code has been initialised. This is achieved by separating the command line RSDP path in the early boot code from the generic RSDP path, and then copying the command line RSDP into boot params in the kernel proper if lockdown is not enabled. If lockdown is enabled and an RSDP is provided on the command line, this will only be used when parsing SRAT (which shouldn't permit kernel code execution) and will be ignored in the rest of the kernel. (Modified by Matthew Garrett in order to handle the early boot RSDP environment) Signed-off-by: Josh Boyer Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: Dave Young cc: linux-acpi@vger.kernel.org Signed-off-by: James Morris --- arch/x86/boot/compressed/acpi.c | 19 +++++++++++++------ arch/x86/include/asm/acpi.h | 9 +++++++++ arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/acpi/boot.c | 5 +++++ arch/x86/kernel/x86_init.c | 1 + drivers/acpi/osl.c | 14 +++++++++++++- include/linux/acpi.h | 6 ++++++ 7 files changed, 49 insertions(+), 7 deletions(-) diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index ad84239e595e..e726e9b44bb1 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -26,7 +26,7 @@ struct mem_vector immovable_mem[MAX_NUMNODES*2]; */ #define MAX_ADDR_LEN 19 -static acpi_physical_address get_acpi_rsdp(void) +static acpi_physical_address get_cmdline_acpi_rsdp(void) { acpi_physical_address addr = 0; @@ -215,10 +215,7 @@ acpi_physical_address get_rsdp_addr(void) { acpi_physical_address pa; - pa = get_acpi_rsdp(); - - if (!pa) - pa = boot_params->acpi_rsdp_addr; + pa = boot_params->acpi_rsdp_addr; if (!pa) pa = efi_get_rsdp_addr(); @@ -240,7 +237,17 @@ static unsigned long get_acpi_srat_table(void) char arg[10]; u8 *entry; - rsdp = (struct acpi_table_rsdp *)(long)boot_params->acpi_rsdp_addr; + /* + * Check whether we were given an RSDP on the command line. We don't + * stash this in boot params because the kernel itself may have + * different ideas about whether to trust a command-line parameter. + */ + rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp(); + + if (!rsdp) + rsdp = (struct acpi_table_rsdp *)(long) + boot_params->acpi_rsdp_addr; + if (!rsdp) return 0; diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index aac686e1e005..bc9693c9107e 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -117,6 +117,12 @@ static inline bool acpi_has_cpu_in_madt(void) return !!acpi_lapic; } +#define ACPI_HAVE_ARCH_SET_ROOT_POINTER +static inline void acpi_arch_set_root_pointer(u64 addr) +{ + x86_init.acpi.set_root_pointer(addr); +} + #define ACPI_HAVE_ARCH_GET_ROOT_POINTER static inline u64 acpi_arch_get_root_pointer(void) { @@ -125,6 +131,7 @@ static inline u64 acpi_arch_get_root_pointer(void) void acpi_generic_reduced_hw_init(void); +void x86_default_set_root_pointer(u64 addr); u64 x86_default_get_root_pointer(void); #else /* !CONFIG_ACPI */ @@ -138,6 +145,8 @@ static inline void disable_acpi(void) { } static inline void acpi_generic_reduced_hw_init(void) { } +static inline void x86_default_set_root_pointer(u64 addr) { } + static inline u64 x86_default_get_root_pointer(void) { return 0; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b85a7c54c6a1..d584128435cb 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -134,10 +134,12 @@ struct x86_hyper_init { /** * struct x86_init_acpi - x86 ACPI init functions + * @set_root_poitner: set RSDP address * @get_root_pointer: get RSDP address * @reduced_hw_early_init: hardware reduced platform early init */ struct x86_init_acpi { + void (*set_root_pointer)(u64 addr); u64 (*get_root_pointer)(void); void (*reduced_hw_early_init)(void); }; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 17b33ef604f3..04205ce127a1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1760,6 +1760,11 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) e820__update_table_print(); } +void x86_default_set_root_pointer(u64 addr) +{ + boot_params.acpi_rsdp_addr = addr; +} + u64 x86_default_get_root_pointer(void) { return boot_params.acpi_rsdp_addr; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 50a2b492fdd6..d0b8f5585a73 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -95,6 +95,7 @@ struct x86_init_ops x86_init __initdata = { }, .acpi = { + .set_root_pointer = x86_default_set_root_pointer, .get_root_pointer = x86_default_get_root_pointer, .reduced_hw_early_init = acpi_generic_reduced_hw_init, }, diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index cc7507091dec..b7c3aeb175dd 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -180,8 +181,19 @@ acpi_physical_address __init acpi_os_get_root_pointer(void) acpi_physical_address pa; #ifdef CONFIG_KEXEC - if (acpi_rsdp) + /* + * We may have been provided with an RSDP on the command line, + * but if a malicious user has done so they may be pointing us + * at modified ACPI tables that could alter kernel behaviour - + * so, we check the lockdown status before making use of + * it. If we trust it then also stash it in an architecture + * specific location (if appropriate) so it can be carried + * over further kexec()s. + */ + if (acpi_rsdp && !security_locked_down(LOCKDOWN_ACPI_TABLES)) { + acpi_arch_set_root_pointer(acpi_rsdp); return acpi_rsdp; + } #endif pa = acpi_arch_get_root_pointer(); if (pa) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d315d86844e4..268a4d91f54c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -632,6 +632,12 @@ bool acpi_gtdt_c3stop(int type); int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); #endif +#ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER +static inline void acpi_arch_set_root_pointer(u64 addr) +{ +} +#endif + #ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER static inline u64 acpi_arch_get_root_pointer(void) { From 6ea0e815fc5e18597724169caa6e4d46dd8e693d Mon Sep 17 00:00:00 2001 From: Linn Crosetto Date: Mon, 19 Aug 2019 17:17:52 -0700 Subject: [PATCH 0182/2880] acpi: Disable ACPI table override if the kernel is locked down >From the kernel documentation (initrd_table_override.txt): If the ACPI_INITRD_TABLE_OVERRIDE compile option is true, it is possible to override nearly any ACPI table provided by the BIOS with an instrumented, modified one. When lockdown is enabled, the kernel should disallow any unauthenticated changes to kernel space. ACPI tables contain code invoked by the kernel, so do not allow ACPI tables to be overridden if the kernel is locked down. Signed-off-by: Linn Crosetto Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: linux-acpi@vger.kernel.org Signed-off-by: James Morris --- drivers/acpi/tables.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index de974322a197..b7c29a11c0c1 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_ACPI_CUSTOM_DSDT @@ -577,6 +578,11 @@ void __init acpi_table_upgrade(void) if (table_nr == 0) return; + if (security_locked_down(LOCKDOWN_ACPI_TABLES)) { + pr_notice("kernel is locked down, ignoring table override\n"); + return; + } + acpi_tables_addr = memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS, all_tables_size, PAGE_SIZE); From 3f19cad3fa0d0fff18ee126f03a80420ae7bcbc9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:53 -0700 Subject: [PATCH 0183/2880] lockdown: Prohibit PCMCIA CIS storage when the kernel is locked down Prohibit replacement of the PCMCIA Card Information Structure when the kernel is locked down. Suggested-by: Dominik Brodowski Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Signed-off-by: James Morris --- drivers/pcmcia/cistpl.c | 5 +++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/drivers/pcmcia/cistpl.c b/drivers/pcmcia/cistpl.c index abd029945cc8..629359fe3513 100644 --- a/drivers/pcmcia/cistpl.c +++ b/drivers/pcmcia/cistpl.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -1575,6 +1576,10 @@ static ssize_t pccard_store_cis(struct file *filp, struct kobject *kobj, struct pcmcia_socket *s; int error; + error = security_locked_down(LOCKDOWN_PCMCIA_CIS); + if (error) + return error; + s = to_socket(container_of(kobj, struct device, kobj)); if (off) diff --git a/include/linux/security.h b/include/linux/security.h index 390e39395112..683f0607e6f2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -111,6 +111,7 @@ enum lockdown_reason { LOCKDOWN_IOPORT, LOCKDOWN_MSR, LOCKDOWN_ACPI_TABLES, + LOCKDOWN_PCMCIA_CIS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 6d44db0ddffa..db3477585972 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -26,6 +26,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", + [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 794edf30ee6cd088d5f4079b1d4a4cfe5371203e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:54 -0700 Subject: [PATCH 0184/2880] lockdown: Lock down TIOCSSERIAL Lock down TIOCSSERIAL as that can be used to change the ioport and irq settings on a serial port. This only appears to be an issue for the serial drivers that use the core serial code. All other drivers seem to either ignore attempts to change port/irq or give an error. Reported-by: Greg Kroah-Hartman Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: Jiri Slaby Cc: linux-serial@vger.kernel.org Signed-off-by: James Morris --- drivers/tty/serial/serial_core.c | 5 +++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 83f4dd0bfd74..bbad407557b9 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -862,6 +863,10 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port, goto check_and_exit; } + retval = security_locked_down(LOCKDOWN_TIOCSSERIAL); + if (retval && (change_irq || change_port)) + goto exit; + /* * Ask the low level driver to verify the settings. */ diff --git a/include/linux/security.h b/include/linux/security.h index 683f0607e6f2..b4a85badb03a 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -112,6 +112,7 @@ enum lockdown_reason { LOCKDOWN_MSR, LOCKDOWN_ACPI_TABLES, LOCKDOWN_PCMCIA_CIS, + LOCKDOWN_TIOCSSERIAL, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index db3477585972..771c77f9c04a 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -27,6 +27,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", + [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 20657f66ef52e5005369e4ef539d4cbf01eab10d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:55 -0700 Subject: [PATCH 0185/2880] lockdown: Lock down module params that specify hardware parameters (eg. ioport) Provided an annotation for module parameters that specify hardware parameters (such as io ports, iomem addresses, irqs, dma channels, fixed dma buffers and other types). Suggested-by: Alan Cox Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: Jessica Yu Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/params.c | 21 ++++++++++++++++----- security/lockdown/lockdown.c | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index b4a85badb03a..1a3404f9c060 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -113,6 +113,7 @@ enum lockdown_reason { LOCKDOWN_ACPI_TABLES, LOCKDOWN_PCMCIA_CIS, LOCKDOWN_TIOCSSERIAL, + LOCKDOWN_MODULE_PARAMETERS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/params.c b/kernel/params.c index cf448785d058..8e56f8b12d8f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ @@ -96,13 +97,19 @@ bool parameq(const char *a, const char *b) return parameqn(a, b, strlen(a)+1); } -static void param_check_unsafe(const struct kernel_param *kp) +static bool param_check_unsafe(const struct kernel_param *kp) { + if (kp->flags & KERNEL_PARAM_FL_HWPARAM && + security_locked_down(LOCKDOWN_MODULE_PARAMETERS)) + return false; + if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { pr_notice("Setting dangerous option %s - tainting kernel\n", kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } + + return true; } static int parse_one(char *param, @@ -132,8 +139,10 @@ static int parse_one(char *param, pr_debug("handling %s with %p\n", param, params[i].ops->set); kernel_param_lock(params[i].mod); - param_check_unsafe(¶ms[i]); - err = params[i].ops->set(val, ¶ms[i]); + if (param_check_unsafe(¶ms[i])) + err = params[i].ops->set(val, ¶ms[i]); + else + err = -EPERM; kernel_param_unlock(params[i].mod); return err; } @@ -553,8 +562,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return -EPERM; kernel_param_lock(mk->mod); - param_check_unsafe(attribute->param); - err = attribute->param->ops->set(buf, attribute->param); + if (param_check_unsafe(attribute->param)) + err = attribute->param->ops->set(buf, attribute->param); + else + err = -EPERM; kernel_param_unlock(mk->mod); if (!err) return len; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 771c77f9c04a..0fa434294667 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -28,6 +28,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", + [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 906357f77a077508d160e729f917c5f0a4304f25 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:56 -0700 Subject: [PATCH 0186/2880] x86/mmiotrace: Lock down the testmmiotrace module The testmmiotrace module shouldn't be permitted when the kernel is locked down as it can be used to arbitrarily read and write MMIO space. This is a runtime check rather than buildtime in order to allow configurations where the same kernel may be run in both locked down or permissive modes depending on local policy. Suggested-by: Thomas Gleixner Signed-off-by: David Howells Acked-by: Steven Rostedt (VMware) Reviewed-by: Kees Cook cc: Thomas Gleixner cc: Steven Rostedt cc: Ingo Molnar cc: "H. Peter Anvin" cc: x86@kernel.org Signed-off-by: James Morris --- arch/x86/mm/testmmiotrace.c | 5 +++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 0881e1ff1e58..a8bd952e136d 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -8,6 +8,7 @@ #include #include #include +#include static unsigned long mmio_address; module_param_hw(mmio_address, ulong, iomem, 0); @@ -115,6 +116,10 @@ static void do_test_bulk_ioremapping(void) static int __init init(void) { unsigned long size = (read_far) ? (8 << 20) : (16 << 10); + int ret = security_locked_down(LOCKDOWN_MMIOTRACE); + + if (ret) + return ret; if (mmio_address == 0) { pr_err("you have to use the module argument mmio_address.\n"); diff --git a/include/linux/security.h b/include/linux/security.h index 1a3404f9c060..d8db7ea4c4bf 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -114,6 +114,7 @@ enum lockdown_reason { LOCKDOWN_PCMCIA_CIS, LOCKDOWN_TIOCSSERIAL, LOCKDOWN_MODULE_PARAMETERS, + LOCKDOWN_MMIOTRACE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 0fa434294667..2eadbe0667e7 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -29,6 +29,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", + [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 02e935bf5b34edcc4cb0dc532dd0e1a1bfb33b51 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:57 -0700 Subject: [PATCH 0187/2880] lockdown: Lock down /proc/kcore Disallow access to /proc/kcore when the kernel is locked down to prevent access to cryptographic data. This is limited to lockdown confidentiality mode and is still permitted in integrity mode. Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Signed-off-by: James Morris --- fs/proc/kcore.c | 5 +++++ include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index f5834488b67d..ee2c576cc94e 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "internal.h" @@ -545,6 +546,10 @@ out: static int open_kcore(struct inode *inode, struct file *filp) { + int ret = security_locked_down(LOCKDOWN_KCORE); + + if (ret) + return ret; if (!capable(CAP_SYS_RAWIO)) return -EPERM; diff --git a/include/linux/security.h b/include/linux/security.h index d8db7ea4c4bf..669e8de5299d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -116,6 +116,7 @@ enum lockdown_reason { LOCKDOWN_MODULE_PARAMETERS, LOCKDOWN_MMIOTRACE, LOCKDOWN_INTEGRITY_MAX, + LOCKDOWN_KCORE, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 2eadbe0667e7..403b30357f75 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -31,6 +31,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_INTEGRITY_MAX] = "integrity", + [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From a94549dd87f5ea4ca50fee493df08a2dc6256b53 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:58 -0700 Subject: [PATCH 0188/2880] lockdown: Lock down tracing and perf kprobes when in confidentiality mode Disallow the creation of perf and ftrace kprobes when the kernel is locked down in confidentiality mode by preventing their registration. This prevents kprobes from being used to access kernel memory to steal crypto data, but continues to allow the use of kprobes from signed modules. Reported-by: Alexei Starovoitov Signed-off-by: David Howells Signed-off-by: Matthew Garrett Acked-by: Masami Hiramatsu Reviewed-by: Kees Cook Cc: Naveen N. Rao Cc: Anil S Keshavamurthy Cc: davem@davemloft.net Cc: Masami Hiramatsu Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/trace/trace_kprobe.c | 5 +++++ security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index 669e8de5299d..0b2529dbf0f4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -117,6 +117,7 @@ enum lockdown_reason { LOCKDOWN_MMIOTRACE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, + LOCKDOWN_KPROBES, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7d736248a070..fcb28b0702b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" @@ -415,6 +416,10 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + if (trace_probe_is_registered(&tk->tp)) return -EINVAL; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 403b30357f75..27b2cf51e443 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -32,6 +32,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", + [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 9d1f8be5cf42b497a3bddf1d523f2bb142e9318c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:59 -0700 Subject: [PATCH 0189/2880] bpf: Restrict bpf when kernel lockdown is in confidentiality mode bpf_read() and bpf_read_str() could potentially be abused to (eg) allow private keys in kernel memory to be leaked. Disable them if the kernel has been locked down in confidentiality mode. Suggested-by: Alexei Starovoitov Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: netdev@vger.kernel.org cc: Chun-Yi Lee cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/trace/bpf_trace.c | 10 ++++++++++ security/lockdown/lockdown.c | 1 + 3 files changed, 12 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index 0b2529dbf0f4..e604f4c67f03 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -118,6 +118,7 @@ enum lockdown_reason { LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, + LOCKDOWN_BPF_READ, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1c9a4745e596..33a954c367f3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -139,8 +139,13 @@ BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; @@ -566,6 +571,10 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + /* * The strncpy_from_unsafe() call will likely not fill the entire * buffer, but that's okay in this circumstance as we're probing @@ -577,6 +586,7 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, */ ret = strncpy_from_unsafe(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 27b2cf51e443..2397772c56bd 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -33,6 +33,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", + [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From b0c8fdc7fdb77586c3d1937050925b960743306e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:18:00 -0700 Subject: [PATCH 0190/2880] lockdown: Lock down perf when in confidentiality mode Disallow the use of certain perf facilities that might allow userspace to access kernel data. Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/events/core.c | 7 +++++++ security/lockdown/lockdown.c | 1 + 3 files changed, 9 insertions(+) diff --git a/include/linux/security.h b/include/linux/security.h index e604f4c67f03..b94f1e697537 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -119,6 +119,7 @@ enum lockdown_reason { LOCKDOWN_KCORE, LOCKDOWN_KPROBES, LOCKDOWN_BPF_READ, + LOCKDOWN_PERF, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index f85929ce13be..8732f980a4fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10798,6 +10798,13 @@ SYSCALL_DEFINE5(perf_event_open, perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; + err = security_locked_down(LOCKDOWN_PERF); + if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) + /* REGS_INTR can leak data, lockdown must prevent this */ + return err; + + err = 0; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 2397772c56bd..3d7b1039457b 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -34,6 +34,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", + [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 29d3c1c8dfe752c01b7115ecd5a3142b232a38e1 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:18:01 -0700 Subject: [PATCH 0191/2880] kexec: Allow kexec_file() with appropriate IMA policy when locked down Systems in lockdown mode should block the kexec of untrusted kernels. For x86 and ARM we can ensure that a kernel is trustworthy by validating a PE signature, but this isn't possible on other architectures. On those platforms we can use IMA digital signatures instead. Add a function to determine whether IMA has or will verify signatures for a given event type, and if so permit kexec_file() even if the kernel is otherwise locked down. This is restricted to cases where CONFIG_INTEGRITY_TRUSTED_KEYRING is set in order to prevent an attacker from loading additional keys at runtime. Signed-off-by: Matthew Garrett Acked-by: Mimi Zohar Cc: Dmitry Kasatkin Cc: linux-integrity@vger.kernel.org Signed-off-by: James Morris --- include/linux/ima.h | 9 ++++++ kernel/kexec_file.c | 10 +++++- security/integrity/ima/ima.h | 2 ++ security/integrity/ima/ima_main.c | 2 +- security/integrity/ima/ima_policy.c | 50 +++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 2 deletions(-) diff --git a/include/linux/ima.h b/include/linux/ima.h index 00036d2f57c3..8e2f324fb901 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -129,4 +129,13 @@ static inline int ima_inode_removexattr(struct dentry *dentry, return 0; } #endif /* CONFIG_IMA_APPRAISE */ + +#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING) +extern bool ima_appraise_signature(enum kernel_read_file_id func); +#else +static inline bool ima_appraise_signature(enum kernel_read_file_id func) +{ + return false; +} +#endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */ #endif /* _LINUX_IMA_H */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 43109ef4d6bf..7f4a618fc8c1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -208,7 +208,15 @@ kimage_validate_signature(struct kimage *image) return ret; } - return security_locked_down(LOCKDOWN_KEXEC); + /* If IMA is guaranteed to appraise a signature on the kexec + * image, permit it even if the kernel is otherwise locked + * down. + */ + if (!ima_appraise_signature(READING_KEXEC_IMAGE) && + security_locked_down(LOCKDOWN_KEXEC)) + return -EPERM; + + return 0; /* All other errors are fatal, including nomem, unparseable * signatures and signature check failures - even if signatures diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index ca10917b5f89..874bd77d3b91 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -111,6 +111,8 @@ struct ima_kexec_hdr { u64 count; }; +extern const int read_idmap[]; + #ifdef CONFIG_HAVE_IMA_KEXEC void ima_load_kexec_buffer(void); #else diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 1cffda4412b7..1747bc7bcb60 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -469,7 +469,7 @@ int ima_read_file(struct file *file, enum kernel_read_file_id read_id) return 0; } -static const int read_idmap[READING_MAX_ID] = { +const int read_idmap[READING_MAX_ID] = { [READING_FIRMWARE] = FIRMWARE_CHECK, [READING_FIRMWARE_PREALLOC_BUFFER] = FIRMWARE_CHECK, [READING_MODULE] = MODULE_CHECK, diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 7b53f2ca58e2..b8773f05f9da 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1339,3 +1339,53 @@ int ima_policy_show(struct seq_file *m, void *v) return 0; } #endif /* CONFIG_IMA_READ_POLICY */ + +#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING) +/* + * ima_appraise_signature: whether IMA will appraise a given function using + * an IMA digital signature. This is restricted to cases where the kernel + * has a set of built-in trusted keys in order to avoid an attacker simply + * loading additional keys. + */ +bool ima_appraise_signature(enum kernel_read_file_id id) +{ + struct ima_rule_entry *entry; + bool found = false; + enum ima_hooks func; + + if (id >= READING_MAX_ID) + return false; + + func = read_idmap[id] ?: FILE_CHECK; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, ima_rules, list) { + if (entry->action != APPRAISE) + continue; + + /* + * A generic entry will match, but otherwise require that it + * match the func we're looking for + */ + if (entry->func && entry->func != func) + continue; + + /* + * We require this to be a digital signature, not a raw IMA + * hash. + */ + if (entry->flags & IMA_DIGSIG_REQUIRED) + found = true; + + /* + * We've found a rule that matches, so break now even if it + * didn't require a digital signature - a later rule that does + * won't override it, so would be a false positive. + */ + break; + } + + rcu_read_unlock(); + return found; +} +#endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */ From 5496197f9b084f086cb410dd566648b0896fcc74 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:18:02 -0700 Subject: [PATCH 0192/2880] debugfs: Restrict debugfs when the kernel is locked down Disallow opening of debugfs files that might be used to muck around when the kernel is locked down as various drivers give raw access to hardware through debugfs. Given the effort of auditing all 2000 or so files and manually fixing each one as necessary, I've chosen to apply a heuristic instead. The following changes are made: (1) chmod and chown are disallowed on debugfs objects (though the root dir can be modified by mount and remount, but I'm not worried about that). (2) When the kernel is locked down, only files with the following criteria are permitted to be opened: - The file must have mode 00444 - The file must not have ioctl methods - The file must not have mmap (3) When the kernel is locked down, files may only be opened for reading. Normal device interaction should be done through configfs, sysfs or a miscdev, not debugfs. Note that this makes it unnecessary to specifically lock down show_dsts(), show_devs() and show_call() in the asus-wmi driver. I would actually prefer to lock down all files by default and have the the files unlocked by the creator. This is tricky to manage correctly, though, as there are 19 creation functions and ~1600 call sites (some of them in loops scanning tables). Signed-off-by: David Howells cc: Andy Shevchenko cc: acpi4asus-user@lists.sourceforge.net cc: platform-driver-x86@vger.kernel.org cc: Matthew Garrett cc: Thomas Gleixner Cc: Greg KH Cc: Rafael J. Wysocki Signed-off-by: Matthew Garrett Signed-off-by: James Morris --- fs/debugfs/file.c | 30 ++++++++++++++++++++++++++++++ fs/debugfs/inode.c | 32 ++++++++++++++++++++++++++++++-- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index ddd708b09fa1..5d3e449b5988 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" @@ -136,6 +137,25 @@ void debugfs_file_put(struct dentry *dentry) } EXPORT_SYMBOL_GPL(debugfs_file_put); +/* + * Only permit access to world-readable files when the kernel is locked down. + * We also need to exclude any file that has ways to write or alter it as root + * can bypass the permissions check. + */ +static bool debugfs_is_locked_down(struct inode *inode, + struct file *filp, + const struct file_operations *real_fops) +{ + if ((inode->i_mode & 07777) == 0444 && + !(filp->f_mode & FMODE_WRITE) && + !real_fops->unlocked_ioctl && + !real_fops->compat_ioctl && + !real_fops->mmap) + return false; + + return security_locked_down(LOCKDOWN_DEBUGFS); +} + static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); @@ -147,6 +167,11 @@ static int open_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + + r = debugfs_is_locked_down(inode, filp, real_fops); + if (r) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -272,6 +297,11 @@ static int full_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + + r = debugfs_is_locked_down(inode, filp, real_fops); + if (r) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index acef14ad53db..c8613bcad252 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "internal.h" @@ -32,6 +33,32 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; +/* + * Don't allow access attributes to be changed whilst the kernel is locked down + * so that we can use the file mode as part of a heuristic to determine whether + * to lock down individual files. + */ +static int debugfs_setattr(struct dentry *dentry, struct iattr *ia) +{ + int ret = security_locked_down(LOCKDOWN_DEBUGFS); + + if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) + return ret; + return simple_setattr(dentry, ia); +} + +static const struct inode_operations debugfs_file_inode_operations = { + .setattr = debugfs_setattr, +}; +static const struct inode_operations debugfs_dir_inode_operations = { + .lookup = simple_lookup, + .setattr = debugfs_setattr, +}; +static const struct inode_operations debugfs_symlink_inode_operations = { + .get_link = simple_get_link, + .setattr = debugfs_setattr, +}; + static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -355,6 +382,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, inode->i_mode = mode; inode->i_private = data; + inode->i_op = &debugfs_file_inode_operations; inode->i_fop = proxy_fops; dentry->d_fsdata = (void *)((unsigned long)real_fops | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); @@ -515,7 +543,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) return failed_creating(dentry); inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - inode->i_op = &simple_dir_inode_operations; + inode->i_op = &debugfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -610,7 +638,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; - inode->i_op = &simple_symlink_inode_operations; + inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); diff --git a/include/linux/security.h b/include/linux/security.h index b94f1e697537..152824b6f456 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -115,6 +115,7 @@ enum lockdown_reason { LOCKDOWN_TIOCSSERIAL, LOCKDOWN_MODULE_PARAMETERS, LOCKDOWN_MMIOTRACE, + LOCKDOWN_DEBUGFS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 3d7b1039457b..edd1fff0147d 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -30,6 +30,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", + [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", From ccbd54ff54e8b1880456b81c4aea352ebe208843 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:18:03 -0700 Subject: [PATCH 0193/2880] tracefs: Restrict tracefs when the kernel is locked down Tracefs may release more information about the kernel than desirable, so restrict it when the kernel is locked down in confidentiality mode by preventing open(). (Fixed by Ben Hutchings to avoid a null dereference in default_file_open()) Signed-off-by: Matthew Garrett Reviewed-by: Steven Rostedt (VMware) Cc: Ben Hutchings Signed-off-by: James Morris --- fs/tracefs/inode.c | 42 +++++++++++++++++++++++++++++++++++- include/linux/security.h | 1 + security/lockdown/lockdown.c | 1 + 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index a5bab190a297..761af8ce4015 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #define TRACEFS_DEFAULT_MODE 0700 @@ -27,6 +28,25 @@ static struct vfsmount *tracefs_mount; static int tracefs_mount_count; static bool tracefs_registered; +static int default_open_file(struct inode *inode, struct file *filp) +{ + struct dentry *dentry = filp->f_path.dentry; + struct file_operations *real_fops; + int ret; + + if (!dentry) + return -EINVAL; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + real_fops = dentry->d_fsdata; + if (!real_fops->open) + return 0; + return real_fops->open(inode, filp); +} + static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -221,6 +241,12 @@ static int tracefs_apply_options(struct super_block *sb) return 0; } +static void tracefs_destroy_inode(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + kfree(inode->i_fop); +} + static int tracefs_remount(struct super_block *sb, int *flags, char *data) { int err; @@ -257,6 +283,7 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations tracefs_super_operations = { .statfs = simple_statfs, .remount_fs = tracefs_remount, + .destroy_inode = tracefs_destroy_inode, .show_options = tracefs_show_options, }; @@ -387,6 +414,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { + struct file_operations *proxy_fops; struct dentry *dentry; struct inode *inode; @@ -402,8 +430,20 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (unlikely(!inode)) return failed_creating(dentry); + proxy_fops = kzalloc(sizeof(struct file_operations), GFP_KERNEL); + if (unlikely(!proxy_fops)) { + iput(inode); + return failed_creating(dentry); + } + + if (!fops) + fops = &tracefs_file_operations; + + dentry->d_fsdata = (void *)fops; + memcpy(proxy_fops, fops, sizeof(*proxy_fops)); + proxy_fops->open = default_open_file; inode->i_mode = mode; - inode->i_fop = fops ? fops : &tracefs_file_operations; + inode->i_fop = proxy_fops; inode->i_private = data; d_instantiate(dentry, inode); fsnotify_create(dentry->d_parent->d_inode, dentry); diff --git a/include/linux/security.h b/include/linux/security.h index 152824b6f456..429f9f03372b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -121,6 +121,7 @@ enum lockdown_reason { LOCKDOWN_KPROBES, LOCKDOWN_BPF_READ, LOCKDOWN_PERF, + LOCKDOWN_TRACEFS, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index edd1fff0147d..84df03b1f5a7 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -36,6 +36,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", + [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; From 1957a85b0032a81e6482ca4aab883643b8dae06e Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:18:04 -0700 Subject: [PATCH 0194/2880] efi: Restrict efivar_ssdt_load when the kernel is locked down efivar_ssdt_load allows the kernel to import arbitrary ACPI code from an EFI variable, which gives arbitrary code execution in ring 0. Prevent that when the kernel is locked down. Signed-off-by: Matthew Garrett Acked-by: Ard Biesheuvel Reviewed-by: Kees Cook Cc: Ard Biesheuvel Cc: linux-efi@vger.kernel.org Signed-off-by: James Morris --- drivers/firmware/efi/efi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 4b7cf7bc0ded..5f98374f77f4 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -241,6 +242,11 @@ static void generic_ops_unregister(void) static char efivar_ssdt[EFIVAR_SSDT_NAME_MAX] __initdata; static int __init efivar_ssdt_setup(char *str) { + int ret = security_locked_down(LOCKDOWN_ACPI_TABLES); + + if (ret) + return ret; + if (strlen(str) < sizeof(efivar_ssdt)) memcpy(efivar_ssdt, str, strlen(str)); else From b602614a81078bf29c82b2671bb96a63488f68d6 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:18:05 -0700 Subject: [PATCH 0195/2880] lockdown: Print current->comm in restriction messages Print the content of current->comm in messages generated by lockdown to indicate a restriction that was hit. This makes it a bit easier to find out what caused the message. The message now patterned something like: Lockdown: : is restricted; see man kernel_lockdown.7 Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Signed-off-by: James Morris --- fs/proc/kcore.c | 5 +++-- security/lockdown/lockdown.c | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index ee2c576cc94e..e2ed8e08cc7a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -548,11 +548,12 @@ static int open_kcore(struct inode *inode, struct file *filp) { int ret = security_locked_down(LOCKDOWN_KCORE); - if (ret) - return ret; if (!capable(CAP_SYS_RAWIO)) return -EPERM; + if (ret) + return ret; + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!filp->private_data) return -ENOMEM; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 84df03b1f5a7..0068cec77c05 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -81,10 +81,14 @@ early_param("lockdown", lockdown_param); */ static int lockdown_is_locked_down(enum lockdown_reason what) { + if (WARN(what >= LOCKDOWN_CONFIDENTIALITY_MAX, + "Invalid lockdown reason")) + return -EPERM; + if (kernel_locked_down >= what) { if (lockdown_reasons[what]) - pr_notice("Lockdown: %s is restricted; see man kernel_lockdown.7\n", - lockdown_reasons[what]); + pr_notice("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n", + current->comm, lockdown_reasons[what]); return -EPERM; } From be819aa6f11145de32dab8690ec6055348488c18 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 20 Aug 2019 15:31:29 +0800 Subject: [PATCH 0196/2880] csky: Fixup arch_get_unmapped_area() implementation Current arch_get_unmapped_area() of abiv1 doesn't use standard kernel api. After referring to the implementation of arch/arm, we implement it with vm_unmapped_area() from linux/mm.h. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/inc/abi/page.h | 5 ++- arch/csky/abiv1/mmap.c | 77 ++++++++++++++++++---------------- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/arch/csky/abiv1/inc/abi/page.h b/arch/csky/abiv1/inc/abi/page.h index 6336e92a103a..c864519117c7 100644 --- a/arch/csky/abiv1/inc/abi/page.h +++ b/arch/csky/abiv1/inc/abi/page.h @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. -extern unsigned long shm_align_mask; +#include + extern void flush_dcache_page(struct page *page); static inline unsigned long pages_do_alias(unsigned long addr1, unsigned long addr2) { - return (addr1 ^ addr2) & shm_align_mask; + return (addr1 ^ addr2) & (SHMLBA-1); } static inline void clear_user_page(void *addr, unsigned long vaddr, diff --git a/arch/csky/abiv1/mmap.c b/arch/csky/abiv1/mmap.c index b462fd50b23a..6792aca49999 100644 --- a/arch/csky/abiv1/mmap.c +++ b/arch/csky/abiv1/mmap.c @@ -9,58 +9,63 @@ #include #include -unsigned long shm_align_mask = (0x4000 >> 1) - 1; /* Sane caches */ +#define COLOUR_ALIGN(addr,pgoff) \ + ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ + (((pgoff)<mm; + struct vm_area_struct *vma; + int do_align = 0; + struct vm_unmapped_area_info info; + /* + * We only need to do colour alignment if either the I or D + * caches alias. + */ + do_align = filp || (flags & MAP_SHARED); + + /* + * We enforce the MAP_FIXED case. + */ if (flags & MAP_FIXED) { - /* - * We do not accept a shared mapping if it would violate - * cache aliasing constraints. - */ - if ((flags & MAP_SHARED) && - ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) + if (flags & MAP_SHARED && + (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)) return -EINVAL; return addr; } if (len > TASK_SIZE) return -ENOMEM; - do_color_align = 0; - if (filp || (flags & MAP_SHARED)) - do_color_align = 1; + if (addr) { - if (do_color_align) + if (do_align) addr = COLOUR_ALIGN(addr, pgoff); else addr = PAGE_ALIGN(addr); - vmm = find_vma(current->mm, addr); - if (TASK_SIZE - len >= addr && - (!vmm || addr + len <= vmm->vm_start)) - return addr; - } - addr = TASK_UNMAPPED_BASE; - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); - else - addr = PAGE_ALIGN(addr); - for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { - /* At this point: (!vmm || addr < vmm->vm_end). */ - if (TASK_SIZE - len < addr) - return -ENOMEM; - if (!vmm || addr + len <= vmm->vm_start) + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vm_start_gap(vma))) return addr; - addr = vmm->vm_end; - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; + info.align_offset = pgoff << PAGE_SHIFT; + return vm_unmapped_area(&info); } From dc140045c0cace809af872e3799e8fbe1b7d7f86 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 20 Aug 2019 12:47:24 +0800 Subject: [PATCH 0197/2880] csky: Fixup defer cache flush for 610 We use defer cache flush mechanism to improve the performance of 610, but the implementation is wrong. We fix it up now and update the mechanism: - Zero page needn't be flushed. - If page is file mapping & non-touched in user space, defer flush. - If page is anon mapping or dirty file mapping, flush immediately. - In update_mmu_cache finish the defer flush by flush_dcache_page(). For 610 we need take care the dcache aliasing issue: - VIPT cache with 8K-bytes size per way in 4K page granularity. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/cacheflush.c | 50 +++++++++++++++------------- arch/csky/abiv1/inc/abi/cacheflush.h | 4 +-- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 10af8b6fe322..fee99fc6612f 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -11,42 +11,46 @@ #include #include +#define PG_dcache_clean PG_arch_1 + void flush_dcache_page(struct page *page) { - struct address_space *mapping = page_mapping(page); - unsigned long addr; + struct address_space *mapping; - if (mapping && !mapping_mapped(mapping)) { - set_bit(PG_arch_1, &(page)->flags); + if (page == ZERO_PAGE(0)) return; + + mapping = page_mapping_file(page); + + if (mapping && !page_mapcount(page)) + clear_bit(PG_dcache_clean, &page->flags); + else { + dcache_wbinv_all(); + if (mapping) + icache_inv_all(); + set_bit(PG_dcache_clean, &page->flags); } - - /* - * We could delay the flush for the !page_mapping case too. But that - * case is for exec env/arg pages and those are %99 certainly going to - * get faulted into the tlb (and thus flushed) anyways. - */ - addr = (unsigned long) page_address(page); - dcache_wb_range(addr, addr + PAGE_SIZE); } +EXPORT_SYMBOL(flush_dcache_page); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *pte) +void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) { - unsigned long addr; + unsigned long pfn = pte_pfn(*ptep); struct page *page; - unsigned long pfn; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) + if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - addr = (unsigned long) page_address(page); + if (page == ZERO_PAGE(0)) + return; - if (vma->vm_flags & VM_EXEC || - pages_do_alias(addr, address & PAGE_MASK)) - cache_wbinv_all(); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + dcache_wbinv_all(); - clear_bit(PG_arch_1, &(page)->flags); + if (page_mapping_file(page)) { + if (vma->vm_flags & VM_EXEC) + icache_inv_all(); + } } diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 5f663aef9b1b..fce5604cef40 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -26,8 +26,8 @@ extern void flush_dcache_page(struct page *); #define flush_icache_page(vma, page) cache_wbinv_all() #define flush_icache_range(start, end) cache_wbinv_range(start, end) -#define flush_icache_user_range(vma, pg, adr, len) \ - cache_wbinv_range(adr, adr + len) +#define flush_icache_user_range(vma,page,addr,len) \ + flush_dcache_page(page) #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ do { \ From c7e6f0e99227b3dcdc5e62f789119e000887ff79 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 20 Aug 2019 20:15:44 +0800 Subject: [PATCH 0198/2880] csky: Support kernel non-aligned access We prohibit non-aligned access in kernel mode, but some special NIC driver needs to support kernel-state unaligned access. For example, when the bus does not support unaligned access, IP header parsing will cause non-aligned access and driver does not recopy the skb buffer to dma for performance reasons. Added kernel_enable & user_enable to control unaligned access and added kernel_count & user_count for statistical unaligned access. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/alignment.c | 62 +++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c index 27ef5b2c43ab..cb2a0d94a144 100644 --- a/arch/csky/abiv1/alignment.c +++ b/arch/csky/abiv1/alignment.c @@ -5,8 +5,10 @@ #include #include -static int align_enable = 1; -static int align_count; +static int align_kern_enable = 1; +static int align_usr_enable = 1; +static int align_kern_count = 0; +static int align_usr_count = 0; static inline uint32_t get_ptreg(struct pt_regs *regs, uint32_t rx) { @@ -32,9 +34,6 @@ static int ldb_asm(uint32_t addr, uint32_t *valp) uint32_t val; int err; - if (!access_ok((void *)addr, 1)) - return 1; - asm volatile ( "movi %0, 0\n" "1:\n" @@ -67,9 +66,6 @@ static int stb_asm(uint32_t addr, uint32_t val) { int err; - if (!access_ok((void *)addr, 1)) - return 1; - asm volatile ( "movi %0, 0\n" "1:\n" @@ -203,8 +199,6 @@ static int stw_c(struct pt_regs *regs, uint32_t rz, uint32_t addr) if (stb_asm(addr, byte3)) return 1; - align_count++; - return 0; } @@ -226,7 +220,14 @@ void csky_alignment(struct pt_regs *regs) uint32_t addr = 0; if (!user_mode(regs)) + goto kernel_area; + + if (!align_usr_enable) { + pr_err("%s user disabled.\n", __func__); goto bad_area; + } + + align_usr_count++; ret = get_user(tmp, (uint16_t *)instruction_pointer(regs)); if (ret) { @@ -234,6 +235,19 @@ void csky_alignment(struct pt_regs *regs) goto bad_area; } + goto good_area; + +kernel_area: + if (!align_kern_enable) { + pr_err("%s kernel disabled.\n", __func__); + goto bad_area; + } + + align_kern_count++; + + tmp = *(uint16_t *)instruction_pointer(regs); + +good_area: opcode = (uint32_t)tmp; rx = opcode & 0xf; @@ -286,18 +300,32 @@ bad_area: force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr); } -static struct ctl_table alignment_tbl[4] = { +static struct ctl_table alignment_tbl[5] = { { - .procname = "enable", - .data = &align_enable, - .maxlen = sizeof(align_enable), + .procname = "kernel_enable", + .data = &align_kern_enable, + .maxlen = sizeof(align_kern_enable), .mode = 0666, .proc_handler = &proc_dointvec }, { - .procname = "count", - .data = &align_count, - .maxlen = sizeof(align_count), + .procname = "user_enable", + .data = &align_usr_enable, + .maxlen = sizeof(align_usr_enable), + .mode = 0666, + .proc_handler = &proc_dointvec + }, + { + .procname = "kernel_count", + .data = &align_kern_count, + .maxlen = sizeof(align_kern_count), + .mode = 0666, + .proc_handler = &proc_dointvec + }, + { + .procname = "user_count", + .data = &align_usr_count, + .maxlen = sizeof(align_usr_count), .mode = 0666, .proc_handler = &proc_dointvec }, From e2751463eaa6f9fec8fea80abbdc62dbc487b3c5 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 26 Jul 2019 15:48:53 +0800 Subject: [PATCH 0199/2880] fs: nfs: Fix possible null-pointer dereferences in encode_attrs() In encode_attrs(), there is an if statement on line 1145 to check whether label is NULL: if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) When label is NULL, it is used on lines 1178-1181: *p++ = cpu_to_be32(label->lfs); *p++ = cpu_to_be32(label->pi); *p++ = cpu_to_be32(label->len); p = xdr_encode_opaque_fixed(p, label->label, label->len); To fix these bugs, label is checked before being used. These bugs are found by a static analysis tool STCheck written by us. Signed-off-by: Jia-Ju Bai Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 46a8d636d151..ab07db0f07cd 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1174,7 +1174,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) { *p++ = cpu_to_be32(label->lfs); *p++ = cpu_to_be32(label->pi); *p++ = cpu_to_be32(label->len); From 691b45ddbd182a4ce0bc91953c70c845cf0935f1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:36:19 -0400 Subject: [PATCH 0200/2880] SUNRPC: Remove rpc_wake_up_queued_task_on_wq() Clean up: commit c544577daddb ("SUNRPC: Clean up transport write space handling") appears to have removed the last caller of rpc_wake_up_queued_task_on_wq(). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 3 --- net/sunrpc/sched.c | 27 ++++----------------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index baa3ecdb882f..d1283bddd218 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -243,9 +243,6 @@ void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *queue, void rpc_sleep_on_priority(struct rpc_wait_queue *, struct rpc_task *, int priority); -void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, - struct rpc_task *task); void rpc_wake_up_queued_task(struct rpc_wait_queue *, struct rpc_task *); void rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 1f275aba786f..f25c4b9ba185 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -541,33 +541,14 @@ rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq, return NULL; } -static void -rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, struct rpc_task *task) -{ - rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL); -} - /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, + struct rpc_task *task) { - rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); -} - -/* - * Wake up a task on a specific queue - */ -void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, - struct rpc_task *task) -{ - if (!RPC_IS_QUEUED(task)) - return; - spin_lock(&queue->lock); - rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); - spin_unlock(&queue->lock); + rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, + task, NULL, NULL); } /* From 95bd8304b34656d30bc0c908384de007b8fbcb55 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:37:05 -0400 Subject: [PATCH 0201/2880] SUNRPC: Inline xdr_commit_encode Micro-optimization: For xdr_commit_encode call sites in net/sunrpc/xdr.c, eliminate the extra calling sequence. On my client, this change saves about a microsecond for every 30 calls to xdr_reserve_space(). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 48c93b9e525e..7ba0ede6b417 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -560,7 +560,7 @@ EXPORT_SYMBOL_GPL(xdr_init_encode); * required at the end of encoding, or any other time when the xdr_buf * data might be read. */ -void xdr_commit_encode(struct xdr_stream *xdr) +inline void xdr_commit_encode(struct xdr_stream *xdr) { int shift = xdr->scratch.iov_len; void *page; From 2fb2a4d529fe3b1db4ced06d3b6568d46a991269 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:37:52 -0400 Subject: [PATCH 0202/2880] xprtrdma: Refresh the documenting comment in frwr_ops.c Things have changed since this comment was written. In particular, the reworking of connection closing, on-demand creation of MRs, and the removal of fr_state all mean that deferring MR recovery to frwr_map is no longer needed. The description is obsolete. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 64 +++++++++------------------------- 1 file changed, 17 insertions(+), 47 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 0b6dad7580a1..a30f2ae49578 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -7,67 +7,37 @@ /* Lightweight memory registration using Fast Registration Work * Requests (FRWR). * - * FRWR features ordered asynchronous registration and deregistration - * of arbitrarily sized memory regions. This is the fastest and safest + * FRWR features ordered asynchronous registration and invalidation + * of arbitrarily-sized memory regions. This is the fastest and safest * but most complex memory registration mode. */ /* Normal operation * - * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG + * A Memory Region is prepared for RDMA Read or Write using a FAST_REG * Work Request (frwr_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frwr_unmap_sync). + * (frwr_unmap_async and frwr_unmap_sync). * - * Typically these Work Requests are not signaled, and neither are RDMA - * SEND Work Requests (with the exception of signaling occasionally to - * prevent provider work queue overflows). This greatly reduces HCA + * Typically FAST_REG Work Requests are not signaled, and neither are + * RDMA Send Work Requests (with the exception of signaling occasionally + * to prevent provider work queue overflows). This greatly reduces HCA * interrupt workload. - * - * As an optimization, frwr_unmap marks MRs INVALID before the - * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on - * rb_mrs immediately so that no work (like managing a linked list - * under a spinlock) is needed in the completion upcall. - * - * But this means that frwr_map() can occasionally encounter an MR - * that is INVALID but the LOCAL_INV WR has not completed. Work Queue - * ordering prevents a subsequent FAST_REG WR from executing against - * that MR while it is still being invalidated. */ /* Transport recovery * - * ->op_map and the transport connect worker cannot run at the same - * time, but ->op_unmap can fire while the transport connect worker - * is running. Thus MR recovery is handled in ->op_map, to guarantee - * that recovered MRs are owned by a sending RPC, and not one where - * ->op_unmap could fire at the same time transport reconnect is - * being done. + * frwr_map and frwr_unmap_* cannot run at the same time the transport + * connect worker is running. The connect worker holds the transport + * send lock, just as ->send_request does. This prevents frwr_map and + * the connect worker from running concurrently. When a connection is + * closed, the Receive completion queue is drained before the allowing + * the connect worker to get control. This prevents frwr_unmap and the + * connect worker from running concurrently. * - * When the underlying transport disconnects, MRs are left in one of - * four states: - * - * INVALID: The MR was not in use before the QP entered ERROR state. - * - * VALID: The MR was registered before the QP entered ERROR state. - * - * FLUSHED_FR: The MR was being registered when the QP entered ERROR - * state, and the pending WR was flushed. - * - * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR - * state, and the pending WR was flushed. - * - * When frwr_map encounters FLUSHED and VALID MRs, they are recovered - * with ib_dereg_mr and then are re-initialized. Because MR recovery - * allocates fresh resources, it is deferred to a workqueue, and the - * recovered MRs are placed back on the rb_mrs list when recovery is - * complete. frwr_map allocates another MR for the current RPC while - * the broken MR is reset. - * - * To ensure that frwr_map doesn't encounter an MR that is marked - * INVALID but that is about to be flushed due to a previous transport - * disconnect, the transport connect worker attempts to drain all - * pending send queue WRs before the transport is reconnected. + * When the underlying transport disconnects, MRs that are in flight + * are flushed and are likely unusable. Thus all flushed MRs are + * destroyed. New MRs are created on demand. */ #include From af08a7754a5da7ae704624cbe4ef83e46246c92d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:38:38 -0400 Subject: [PATCH 0203/2880] xprtrdma: Update obsolete comment Comment was made obsolete by commit 8cec3dba76a4 ("xprtrdma: rpcrdma_regbuf alignment"). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/xprt_rdma.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 92ce09fcea74..3b2f2041e889 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -117,9 +117,6 @@ struct rpcrdma_ep { #endif /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV - * - * The below structure appears at the front of a large region of kmalloc'd - * memory, which always starts on a good alignment boundary. */ struct rpcrdma_regbuf { From 36bdd9056b6a83d573ffdde282a5a91ce734c536 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:39:25 -0400 Subject: [PATCH 0204/2880] xprtrdma: Fix calculation of ri_max_segs again Commit 302d3deb206 ("xprtrdma: Prevent inline overflow") added this calculation back in 2016, but got it wrong. I tested only the lower bound, which is why there is a max_t there. The upper bound should be rounded up too. Now, when using DIV_ROUND_UP, that takes care of the lower bound as well. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index a30f2ae49578..3a10bfff2125 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -260,8 +260,8 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frwr_depth); + ia->ri_max_segs = + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth); /* Reply chunks require segments for head and tail buffers */ ia->ri_max_segs += 2; if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS) From f3c66a2f56683a20ced4e700a8167d6b357641da Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:40:11 -0400 Subject: [PATCH 0205/2880] xprtrdma: Boost maximum transport header size Although I haven't seen any performance results that justify it, I've received several complaints that NFS/RDMA no longer supports a maximum rsize and wsize of 1MB. These days it is somewhat smaller. To simplify the logic that determines whether a chunk list is necessary, the implementation uses a fixed maximum size of the transport header. Currently that maximum size is 256 bytes, one quarter of the default inline threshold size for RPC/RDMA v1. Since commit a78868497c2e ("xprtrdma: Reduce max_frwr_depth"), the size of chunks is also smaller to take advantage of inline page lists in device internal MR data structures. The combination of these two design choices has reduced the maximum NFS rsize and wsize that can be used for most RNIC/HCAs. Increasing the maximum transport header size and the maximum number of RDMA segments it can contain increases the negotiated maximum rsize/wsize on common RNIC/HCAs. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 9 ++++++++- net/sunrpc/xprtrdma/xprt_rdma.h | 23 ++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 805b1f35e1ca..e639ea0faf19 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -1000,12 +1001,18 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; + size_t maxhdrsize; req = kzalloc(sizeof(*req), flags); if (req == NULL) goto out1; - rb = rpcrdma_regbuf_alloc(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); + /* Compute maximum header buffer size in bytes */ + maxhdrsize = rpcrdma_fixed_maxsz + 3 + + r_xprt->rx_ia.ri_max_segs * rpcrdma_readchunk_maxsz; + maxhdrsize *= sizeof(__be32); + rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), + DMA_TO_DEVICE, flags); if (!rb) goto out2; req->rl_rdmabuf = rb; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 3b2f2041e889..eaf6b907a76e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -155,25 +155,22 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) /* To ensure a transport can always make forward progress, * the number of RDMA segments allowed in header chunk lists - * is capped at 8. This prevents less-capable devices and - * memory registrations from overrunning the Send buffer - * while building chunk lists. + * is capped at 16. This prevents less-capable devices from + * overrunning the Send buffer while building chunk lists. * * Elements of the Read list take up more room than the - * Write list or Reply chunk. 8 read segments means the Read - * list (or Write list or Reply chunk) cannot consume more - * than + * Write list or Reply chunk. 16 read segments means the + * chunk lists cannot consume more than * - * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes. + * ((16 + 2) * read segment size) + 1 XDR words, * - * And the fixed part of the header is another 24 bytes. - * - * The smallest inline threshold is 1024 bytes, ensuring that - * at least 750 bytes are available for RPC messages. + * or about 400 bytes. The fixed part of the header is + * another 24 bytes. Thus when the inline threshold is + * 1024 bytes, at least 600 bytes are available for RPC + * message bodies. */ enum { - RPCRDMA_MAX_HDR_SEGS = 8, - RPCRDMA_HDRBUF_SIZE = 256, + RPCRDMA_MAX_HDR_SEGS = 16, }; /* From bb13f35b96f4c83dc4015d71fa2b543c665fbdae Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 20 Aug 2019 01:32:43 +0000 Subject: [PATCH 0206/2880] nfsd: remove duplicated include from filecache.c Remove duplicated include. Signed-off-by: YueHaibing Signed-off-by: J. Bruce Fields --- fs/nfsd/filecache.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a2fcb251d2f6..2e1a972231e5 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include From aeaed4848234c97fb720b0e51a0c56dc8de0eeed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:40:58 -0400 Subject: [PATCH 0207/2880] xprtrdma: Boost client's max slot table size to match Linux server I've heard rumors of an NFS/RDMA server implementation that has a default credit limit of 1024. The client's default setting remains at 128. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtrdma.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 86fc38ff0355..16c239e0d6dd 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -49,9 +49,9 @@ * fully-chunked NFS message (read chunks are the largest). Note only * a single chunk type per message is supported currently. */ -#define RPCRDMA_MIN_SLOT_TABLE (2U) +#define RPCRDMA_MIN_SLOT_TABLE (4U) #define RPCRDMA_DEF_SLOT_TABLE (128U) -#define RPCRDMA_MAX_SLOT_TABLE (256U) +#define RPCRDMA_MAX_SLOT_TABLE (16384U) #define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */ #define RPCRDMA_DEF_INLINE (4096) /* default inline thresh */ From 9b543419652917f048310d0863c47c107c26fb0c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 19 Aug 2019 15:41:00 +0300 Subject: [PATCH 0208/2880] Tools: hv: move to tools buildsystem There is a nice buildsystem dedicated for userspace tools in Linux kernel tree. Switch Hyper-V daemons to be built by it. Cc: Vitaly Kuznetsov Signed-off-by: Andy Shevchenko Tested-by: Vitaly Kuznetsov Signed-off-by: Sasha Levin --- tools/hv/Build | 3 +++ tools/hv/Makefile | 51 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 10 deletions(-) create mode 100644 tools/hv/Build diff --git a/tools/hv/Build b/tools/hv/Build new file mode 100644 index 000000000000..6cf51fa4b306 --- /dev/null +++ b/tools/hv/Build @@ -0,0 +1,3 @@ +hv_kvp_daemon-y += hv_kvp_daemon.o +hv_vss_daemon-y += hv_vss_daemon.o +hv_fcopy_daemon-y += hv_fcopy_daemon.o diff --git a/tools/hv/Makefile b/tools/hv/Makefile index 5db5e62cebda..b57143d9459c 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -1,28 +1,55 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Hyper-V tools - -WARNINGS = -Wall -Wextra -CFLAGS = $(WARNINGS) -g $(shell getconf LFS_CFLAGS) - -CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include +include ../scripts/Makefile.include sbindir ?= /usr/sbin libexecdir ?= /usr/libexec sharedstatedir ?= /var/lib -ALL_PROGRAMS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + +# Do not use make's built-in rules +# (this improves performance and avoids hard-to-debug behaviour); +MAKEFLAGS += -r + +override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include + +ALL_TARGETS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) ALL_SCRIPTS := hv_get_dhcp_info.sh hv_get_dns_info.sh hv_set_ifconfig.sh all: $(ALL_PROGRAMS) -%: %.c - $(CC) $(CFLAGS) -o $@ $^ +export srctree OUTPUT CC LD CFLAGS +include $(srctree)/tools/build/Makefile.include + +HV_KVP_DAEMON_IN := $(OUTPUT)hv_kvp_daemon-in.o +$(HV_KVP_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_kvp_daemon +$(OUTPUT)hv_kvp_daemon: $(HV_KVP_DAEMON_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ + +HV_VSS_DAEMON_IN := $(OUTPUT)hv_vss_daemon-in.o +$(HV_VSS_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_vss_daemon +$(OUTPUT)hv_vss_daemon: $(HV_VSS_DAEMON_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ + +HV_FCOPY_DAEMON_IN := $(OUTPUT)hv_fcopy_daemon-in.o +$(HV_FCOPY_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_fcopy_daemon +$(OUTPUT)hv_fcopy_daemon: $(HV_FCOPY_DAEMON_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ clean: - $(RM) hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon + rm -f $(ALL_PROGRAMS) + find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -install: all +install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(sbindir); \ install -d -m 755 $(DESTDIR)$(libexecdir)/hypervkvpd; \ install -d -m 755 $(DESTDIR)$(sharedstatedir); \ @@ -33,3 +60,7 @@ install: all for script in $(ALL_SCRIPTS); do \ install $$script -m 755 $(DESTDIR)$(libexecdir)/hypervkvpd/$${script%.sh}; \ done + +FORCE: + +.PHONY: all install clean FORCE prepare From 2dfdcd88cf0ea66eec0478de82283ef20eb6f421 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:41:44 -0400 Subject: [PATCH 0209/2880] xprtrdma: Rename CQE field in Receive trace points Make the field name the same for all trace points that handle pointers to struct rpcrdma_rep. That makes it easy to grep for matching rep points in trace output. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 21 +++++++++++---------- net/sunrpc/xprtrdma/verbs.c | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index f6a4eaa85a3e..6e6055eb67e7 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -623,21 +623,21 @@ TRACE_EVENT(xprtrdma_post_send, TRACE_EVENT(xprtrdma_post_recv, TP_PROTO( - const struct ib_cqe *cqe + const struct rpcrdma_rep *rep ), - TP_ARGS(cqe), + TP_ARGS(rep), TP_STRUCT__entry( - __field(const void *, cqe) + __field(const void *, rep) ), TP_fast_assign( - __entry->cqe = cqe; + __entry->rep = rep; ), - TP_printk("cqe=%p", - __entry->cqe + TP_printk("rep=%p", + __entry->rep ) ); @@ -715,14 +715,15 @@ TRACE_EVENT(xprtrdma_wc_receive, TP_ARGS(wc), TP_STRUCT__entry( - __field(const void *, cqe) + __field(const void *, rep) __field(u32, byte_len) __field(unsigned int, status) __field(u32, vendor_err) ), TP_fast_assign( - __entry->cqe = wc->wr_cqe; + __entry->rep = container_of(wc->wr_cqe, struct rpcrdma_rep, + rr_cqe); __entry->status = wc->status; if (wc->status) { __entry->byte_len = 0; @@ -733,8 +734,8 @@ TRACE_EVENT(xprtrdma_wc_receive, } ), - TP_printk("cqe=%p %u bytes: %s (%u/0x%x)", - __entry->cqe, __entry->byte_len, + TP_printk("rep=%p %u bytes: %s (%u/0x%x)", + __entry->rep, __entry->byte_len, rdma_show_wc_status(__entry->status), __entry->status, __entry->vendor_err ) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e639ea0faf19..3c275a7a4e4c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1531,7 +1531,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto release_wrs; - trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe); + trace_xprtrdma_post_recv(rep); ++count; } From eed48a9c161588544999ee8d451392921d846ee8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:42:31 -0400 Subject: [PATCH 0210/2880] xprtrdma: Rename rpcrdma_buffer::rb_all Clean up: There are other "all" list heads. For code clarity distinguish this one as for use only for MRs by renaming it. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 26 ++++++++------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3c275a7a4e4c..e004873cc4f0 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -944,8 +944,6 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; unsigned int count; - LIST_HEAD(free); - LIST_HEAD(all); for (count = 0; count < ia->ri_max_segs; count++) { struct rpcrdma_mr *mr; @@ -963,15 +961,13 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) mr->mr_xprt = r_xprt; - list_add(&mr->mr_list, &free); - list_add(&mr->mr_all, &all); + spin_lock(&buf->rb_mrlock); + list_add(&mr->mr_list, &buf->rb_mrs); + list_add(&mr->mr_all, &buf->rb_all_mrs); + spin_unlock(&buf->rb_mrlock); } - spin_lock(&buf->rb_mrlock); - list_splice(&free, &buf->rb_mrs); - list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; - spin_unlock(&buf->rb_mrlock); trace_xprtrdma_createmrs(r_xprt, count); } @@ -1089,7 +1085,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); INIT_LIST_HEAD(&buf->rb_mrs); - INIT_LIST_HEAD(&buf->rb_all); + INIT_LIST_HEAD(&buf->rb_all_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); @@ -1156,24 +1152,18 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) count = 0; spin_lock(&buf->rb_mrlock); - while (!list_empty(&buf->rb_all)) { - mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); + while ((mr = list_first_entry_or_null(&buf->rb_all_mrs, + struct rpcrdma_mr, + mr_all)) != NULL) { list_del(&mr->mr_all); - spin_unlock(&buf->rb_mrlock); - /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&mr->mr_list)) - list_del(&mr->mr_list); - frwr_release_mr(mr); count++; spin_lock(&buf->rb_mrlock); } spin_unlock(&buf->rb_mrlock); r_xprt->rx_stats.mrs_allocated = 0; - - dprintk("RPC: %s: released %u MRs\n", __func__, count); } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index eaf6b907a76e..5aaa53b8ae12 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -360,7 +360,7 @@ rpcrdma_mr_pop(struct list_head *list) struct rpcrdma_buffer { spinlock_t rb_mrlock; /* protect rb_mrs list */ struct list_head rb_mrs; - struct list_head rb_all; + struct list_head rb_all_mrs; unsigned long rb_sc_head; unsigned long rb_sc_tail; From 395790566eec37706dedeb94779045adc3a7581e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:43:17 -0400 Subject: [PATCH 0211/2880] xprtrdma: Toggle XPRT_CONGESTED in xprtrdma's slot methods Commit 48be539dd44a ("xprtrdma: Introduce ->alloc_slot call-out for xprtrdma") added a separate alloc_slot and free_slot to the RPC/RDMA transport. Later, commit 75891f502f5f ("SUNRPC: Support for congestion control when queuing is enabled") modified the generic alloc/free_slot methods, but neglected the methods in xprtrdma. Found via code review. Fixes: 75891f502f5f ("SUNRPC: Support for congestion control ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2ec349ed4770..f4763e8a6761 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -571,6 +571,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) return; out_sleep: + set_bit(XPRT_CONGESTED, &xprt->state); rpc_sleep_on(&xprt->backlog, task, NULL); task->tk_status = -EAGAIN; } @@ -589,7 +590,8 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) memset(rqst, 0, sizeof(*rqst)); rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); - rpc_wake_up_next(&xprt->backlog); + if (unlikely(!rpc_wake_up_next(&xprt->backlog))) + clear_bit(XPRT_CONGESTED, &xprt->state); } static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, From 8bdfa145f58220796abddc93b5038efa6e7e90fc Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Tue, 13 Aug 2019 14:45:11 -0600 Subject: [PATCH 0212/2880] PCI: sysfs: Define device attributes with DEVICE_ATTR*() Device attributes should be defined using DEVICE_ATTR*(_name, _mode, _show, _store). Convert them all from __ATTR*() to DEVICE_ATTR*(), e.g., - struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) + static DEVICE_ATTR(foo, S_IWUSR | S_IRUGO, show_foo, store_foo) Link: https://lore.kernel.org/r/20190813204513.4790-2-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Reviewed-by: Donald Dutile --- drivers/pci/pci-sysfs.c | 59 +++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 965c72104150..8af7944fdccb 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -464,9 +464,7 @@ static ssize_t dev_rescan_store(struct device *dev, } return count; } -static struct device_attribute dev_rescan_attr = __ATTR(rescan, - (S_IWUSR|S_IWGRP), - NULL, dev_rescan_store); +static DEVICE_ATTR(rescan, (S_IWUSR | S_IWGRP), NULL, dev_rescan_store); static ssize_t remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -480,9 +478,8 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, pci_stop_and_remove_bus_device_locked(to_pci_dev(dev)); return count; } -static struct device_attribute dev_remove_attr = __ATTR_IGNORE_LOCKDEP(remove, - (S_IWUSR|S_IWGRP), - NULL, remove_store); +static DEVICE_ATTR_IGNORE_LOCKDEP(remove, (S_IWUSR | S_IWGRP), NULL, + remove_store); static ssize_t dev_bus_rescan_store(struct device *dev, struct device_attribute *attr, @@ -504,7 +501,7 @@ static ssize_t dev_bus_rescan_store(struct device *dev, } return count; } -static DEVICE_ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_bus_rescan_store); +static DEVICE_ATTR(bus_rescan, (S_IWUSR | S_IWGRP), NULL, dev_bus_rescan_store); #if defined(CONFIG_PM) && defined(CONFIG_ACPI) static ssize_t d3cold_allowed_store(struct device *dev, @@ -687,16 +684,14 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev, return count; } -static struct device_attribute sriov_totalvfs_attr = __ATTR_RO(sriov_totalvfs); -static struct device_attribute sriov_numvfs_attr = - __ATTR(sriov_numvfs, (S_IRUGO|S_IWUSR|S_IWGRP), - sriov_numvfs_show, sriov_numvfs_store); -static struct device_attribute sriov_offset_attr = __ATTR_RO(sriov_offset); -static struct device_attribute sriov_stride_attr = __ATTR_RO(sriov_stride); -static struct device_attribute sriov_vf_device_attr = __ATTR_RO(sriov_vf_device); -static struct device_attribute sriov_drivers_autoprobe_attr = - __ATTR(sriov_drivers_autoprobe, (S_IRUGO|S_IWUSR|S_IWGRP), - sriov_drivers_autoprobe_show, sriov_drivers_autoprobe_store); +static DEVICE_ATTR_RO(sriov_totalvfs); +static DEVICE_ATTR(sriov_numvfs, (S_IRUGO | S_IWUSR | S_IWGRP), + sriov_numvfs_show, sriov_numvfs_store); +static DEVICE_ATTR_RO(sriov_offset); +static DEVICE_ATTR_RO(sriov_stride); +static DEVICE_ATTR_RO(sriov_vf_device); +static DEVICE_ATTR(sriov_drivers_autoprobe, (S_IRUGO | S_IWUSR | S_IWGRP), + sriov_drivers_autoprobe_show, sriov_drivers_autoprobe_store); #endif /* CONFIG_PCI_IOV */ static ssize_t driver_override_store(struct device *dev, @@ -792,7 +787,7 @@ static struct attribute *pcie_dev_attrs[] = { }; static struct attribute *pcibus_attrs[] = { - &dev_attr_rescan.attr, + &dev_attr_bus_rescan.attr, &dev_attr_cpuaffinity.attr, &dev_attr_cpulistaffinity.attr, NULL, @@ -820,7 +815,7 @@ static ssize_t boot_vga_show(struct device *dev, struct device_attribute *attr, !!(pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW)); } -static struct device_attribute vga_attr = __ATTR_RO(boot_vga); +static DEVICE_ATTR_RO(boot_vga); static ssize_t pci_read_config(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, @@ -1458,7 +1453,7 @@ static ssize_t reset_store(struct device *dev, struct device_attribute *attr, return count; } -static struct device_attribute reset_attr = __ATTR(reset, 0200, NULL, reset_store); +static DEVICE_ATTR(reset, 0200, NULL, reset_store); static int pci_create_capabilities_sysfs(struct pci_dev *dev) { @@ -1468,7 +1463,7 @@ static int pci_create_capabilities_sysfs(struct pci_dev *dev) pcie_aspm_create_sysfs_dev_files(dev); if (dev->reset_fn) { - retval = device_create_file(&dev->dev, &reset_attr); + retval = device_create_file(&dev->dev, &dev_attr_reset); if (retval) goto error; } @@ -1553,7 +1548,7 @@ static void pci_remove_capabilities_sysfs(struct pci_dev *dev) pcie_vpd_remove_sysfs_dev_files(dev); pcie_aspm_remove_sysfs_dev_files(dev); if (dev->reset_fn) { - device_remove_file(&dev->dev, &reset_attr); + device_remove_file(&dev->dev, &dev_attr_reset); dev->reset_fn = 0; } } @@ -1606,7 +1601,7 @@ static int __init pci_sysfs_init(void) late_initcall(pci_sysfs_init); static struct attribute *pci_dev_dev_attrs[] = { - &vga_attr.attr, + &dev_attr_boot_vga.attr, NULL, }; @@ -1616,7 +1611,7 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct pci_dev *pdev = to_pci_dev(dev); - if (a == &vga_attr.attr) + if (a == &dev_attr_boot_vga.attr) if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA) return 0; @@ -1624,8 +1619,8 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj, } static struct attribute *pci_dev_hp_attrs[] = { - &dev_remove_attr.attr, - &dev_rescan_attr.attr, + &dev_attr_remove.attr, + &dev_attr_rescan.attr, NULL, }; @@ -1699,12 +1694,12 @@ static const struct attribute_group pci_dev_hp_attr_group = { #ifdef CONFIG_PCI_IOV static struct attribute *sriov_dev_attrs[] = { - &sriov_totalvfs_attr.attr, - &sriov_numvfs_attr.attr, - &sriov_offset_attr.attr, - &sriov_stride_attr.attr, - &sriov_vf_device_attr.attr, - &sriov_drivers_autoprobe_attr.attr, + &dev_attr_sriov_totalvfs.attr, + &dev_attr_sriov_numvfs.attr, + &dev_attr_sriov_offset.attr, + &dev_attr_sriov_stride.attr, + &dev_attr_sriov_vf_device.attr, + &dev_attr_sriov_drivers_autoprobe.attr, NULL, }; From 4e2b79436e4f22b225ee445832705bb7c4675807 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Thu, 15 Aug 2019 09:33:52 -0600 Subject: [PATCH 0213/2880] PCI: sysfs: Change DEVICE_ATTR() to DEVICE_ATTR_WO() DEVICE_ATTR() should only be used when files have unusual permissions. Change DEVICE_ATTR() with '0220' write-only permissions to DEVICE_ATTR_WO(), e.g., - static DEVICE_ATTR(_name, (S_IWUSR | S_IWGRP), NULL, _store); + static DEVICE_ATTR_WO(_name); Since _store is no longer passed, make the _name passed by DEVICE_ATTR_WO() and the related _name##_store() name match with each other, e.g., DEVICE_ATTR_WO(bus_rescan) must be able to call bus_rescan_store() Link: https://lore.kernel.org/r/20190815153352.86143-4-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Reviewed-by: Donald Dutile --- drivers/pci/pci-sysfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 8af7944fdccb..4fe988d54807 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -464,7 +464,7 @@ static ssize_t dev_rescan_store(struct device *dev, } return count; } -static DEVICE_ATTR(rescan, (S_IWUSR | S_IWGRP), NULL, dev_rescan_store); +static DEVICE_ATTR_WO(dev_rescan); static ssize_t remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -481,9 +481,9 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_IGNORE_LOCKDEP(remove, (S_IWUSR | S_IWGRP), NULL, remove_store); -static ssize_t dev_bus_rescan_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t bus_rescan_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { unsigned long val; struct pci_bus *bus = to_pci_bus(dev); @@ -501,7 +501,7 @@ static ssize_t dev_bus_rescan_store(struct device *dev, } return count; } -static DEVICE_ATTR(bus_rescan, (S_IWUSR | S_IWGRP), NULL, dev_bus_rescan_store); +static DEVICE_ATTR_WO(bus_rescan); #if defined(CONFIG_PM) && defined(CONFIG_ACPI) static ssize_t d3cold_allowed_store(struct device *dev, @@ -1620,7 +1620,7 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj, static struct attribute *pci_dev_hp_attrs[] = { &dev_attr_remove.attr, - &dev_attr_rescan.attr, + &dev_attr_dev_rescan.attr, NULL, }; From e2154044dd4168bc25c70170dfa6179b57f63914 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Tue, 13 Aug 2019 14:45:12 -0600 Subject: [PATCH 0214/2880] PCI: sysfs: Change permissions from symbolic to octal We prefer octal permissions over symbolic permissions such as "(S_IWUSR | S_IWGRP)". Change all symbolic permissions to octal permissions, e.g., - (S_IWUSR | S_IWGRP) + 0220 Link: https://lore.kernel.org/r/20190813204513.4790-3-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Reviewed-by: Donald Dutile --- drivers/pci/pci-sysfs.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 4fe988d54807..5bb301efec98 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -478,7 +478,7 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, pci_stop_and_remove_bus_device_locked(to_pci_dev(dev)); return count; } -static DEVICE_ATTR_IGNORE_LOCKDEP(remove, (S_IWUSR | S_IWGRP), NULL, +static DEVICE_ATTR_IGNORE_LOCKDEP(remove, 0220, NULL, remove_store); static ssize_t bus_rescan_store(struct device *dev, @@ -685,13 +685,12 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev, } static DEVICE_ATTR_RO(sriov_totalvfs); -static DEVICE_ATTR(sriov_numvfs, (S_IRUGO | S_IWUSR | S_IWGRP), - sriov_numvfs_show, sriov_numvfs_store); +static DEVICE_ATTR(sriov_numvfs, 0664, sriov_numvfs_show, sriov_numvfs_store); static DEVICE_ATTR_RO(sriov_offset); static DEVICE_ATTR_RO(sriov_stride); static DEVICE_ATTR_RO(sriov_vf_device); -static DEVICE_ATTR(sriov_drivers_autoprobe, (S_IRUGO | S_IWUSR | S_IWGRP), - sriov_drivers_autoprobe_show, sriov_drivers_autoprobe_store); +static DEVICE_ATTR(sriov_drivers_autoprobe, 0664, sriov_drivers_autoprobe_show, + sriov_drivers_autoprobe_store); #endif /* CONFIG_PCI_IOV */ static ssize_t driver_override_store(struct device *dev, @@ -1080,7 +1079,7 @@ void pci_create_legacy_files(struct pci_bus *b) sysfs_bin_attr_init(b->legacy_io); b->legacy_io->attr.name = "legacy_io"; b->legacy_io->size = 0xffff; - b->legacy_io->attr.mode = S_IRUSR | S_IWUSR; + b->legacy_io->attr.mode = 0600; b->legacy_io->read = pci_read_legacy_io; b->legacy_io->write = pci_write_legacy_io; b->legacy_io->mmap = pci_mmap_legacy_io; @@ -1094,7 +1093,7 @@ void pci_create_legacy_files(struct pci_bus *b) sysfs_bin_attr_init(b->legacy_mem); b->legacy_mem->attr.name = "legacy_mem"; b->legacy_mem->size = 1024*1024; - b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR; + b->legacy_mem->attr.mode = 0600; b->legacy_mem->mmap = pci_mmap_legacy_mem; pci_adjust_legacy_attr(b, pci_mmap_mem); error = device_create_bin_file(&b->dev, b->legacy_mem); @@ -1301,7 +1300,7 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) } } res_attr->attr.name = res_attr_name; - res_attr->attr.mode = S_IRUSR | S_IWUSR; + res_attr->attr.mode = 0600; res_attr->size = pci_resource_len(pdev, num); res_attr->private = (void *)(unsigned long)num; retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); @@ -1414,7 +1413,7 @@ static ssize_t pci_read_rom(struct file *filp, struct kobject *kobj, static const struct bin_attribute pci_config_attr = { .attr = { .name = "config", - .mode = S_IRUGO | S_IWUSR, + .mode = 0644, }, .size = PCI_CFG_SPACE_SIZE, .read = pci_read_config, @@ -1424,7 +1423,7 @@ static const struct bin_attribute pci_config_attr = { static const struct bin_attribute pcie_config_attr = { .attr = { .name = "config", - .mode = S_IRUGO | S_IWUSR, + .mode = 0644, }, .size = PCI_CFG_SPACE_EXP_SIZE, .read = pci_read_config, @@ -1506,7 +1505,7 @@ int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) sysfs_bin_attr_init(attr); attr->size = rom_size; attr->attr.name = "rom"; - attr->attr.mode = S_IRUSR | S_IWUSR; + attr->attr.mode = 0600; attr->read = pci_read_rom; attr->write = pci_write_rom; retval = sysfs_create_bin_file(&pdev->dev.kobj, attr); From aaee0c1ffd6399d291b030b49d622b81dd5071c5 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Tue, 13 Aug 2019 14:45:13 -0600 Subject: [PATCH 0215/2880] PCI/IOV: Move sysfs SR-IOV functions to iov.c The sysfs SR-IOV functions are only needed when the kernel is built with SR-IOV support. Rather than put them in pci-sysfs.c under #ifdef CONFIG_PCI_IOV, move them to iov.c, which is only compiled when CONFIG_PCI_IOV=y. Link: https://lore.kernel.org/r/20190813204513.4790-4-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Reviewed-by: Donald Dutile Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/iov.c | 168 ++++++++++++++++++++++++++++++++++++++ drivers/pci/pci-sysfs.c | 173 ---------------------------------------- drivers/pci/pci.h | 2 +- 3 files changed, 169 insertions(+), 174 deletions(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 525fd3f272b3..053054362247 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -240,6 +240,174 @@ void pci_iov_remove_virtfn(struct pci_dev *dev, int id) pci_dev_put(dev); } +static ssize_t sriov_totalvfs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); +} + +static ssize_t sriov_numvfs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->num_VFs); +} + +/* + * num_vfs > 0; number of VFs to enable + * num_vfs = 0; disable all VFs + * + * Note: SRIOV spec does not allow partial VF + * disable, so it's all or none. + */ +static ssize_t sriov_numvfs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int ret; + u16 num_vfs; + + ret = kstrtou16(buf, 0, &num_vfs); + if (ret < 0) + return ret; + + if (num_vfs > pci_sriov_get_totalvfs(pdev)) + return -ERANGE; + + device_lock(&pdev->dev); + + if (num_vfs == pdev->sriov->num_VFs) + goto exit; + + /* is PF driver loaded w/callback */ + if (!pdev->driver || !pdev->driver->sriov_configure) { + pci_info(pdev, "Driver does not support SRIOV configuration via sysfs\n"); + ret = -ENOENT; + goto exit; + } + + if (num_vfs == 0) { + /* disable VFs */ + ret = pdev->driver->sriov_configure(pdev, 0); + goto exit; + } + + /* enable VFs */ + if (pdev->sriov->num_VFs) { + pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n", + pdev->sriov->num_VFs, num_vfs); + ret = -EBUSY; + goto exit; + } + + ret = pdev->driver->sriov_configure(pdev, num_vfs); + if (ret < 0) + goto exit; + + if (ret != num_vfs) + pci_warn(pdev, "%d VFs requested; only %d enabled\n", + num_vfs, ret); + +exit: + device_unlock(&pdev->dev); + + if (ret < 0) + return ret; + + return count; +} + +static ssize_t sriov_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->offset); +} + +static ssize_t sriov_stride_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->stride); +} + +static ssize_t sriov_vf_device_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%x\n", pdev->sriov->vf_device); +} + +static ssize_t sriov_drivers_autoprobe_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe); +} + +static ssize_t sriov_drivers_autoprobe_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + bool drivers_autoprobe; + + if (kstrtobool(buf, &drivers_autoprobe) < 0) + return -EINVAL; + + pdev->sriov->drivers_autoprobe = drivers_autoprobe; + + return count; +} + +static DEVICE_ATTR_RO(sriov_totalvfs); +static DEVICE_ATTR(sriov_numvfs, 0664, sriov_numvfs_show, sriov_numvfs_store); +static DEVICE_ATTR_RO(sriov_offset); +static DEVICE_ATTR_RO(sriov_stride); +static DEVICE_ATTR_RO(sriov_vf_device); +static DEVICE_ATTR(sriov_drivers_autoprobe, 0664, sriov_drivers_autoprobe_show, + sriov_drivers_autoprobe_store); + +static struct attribute *sriov_dev_attrs[] = { + &dev_attr_sriov_totalvfs.attr, + &dev_attr_sriov_numvfs.attr, + &dev_attr_sriov_offset.attr, + &dev_attr_sriov_stride.attr, + &dev_attr_sriov_vf_device.attr, + &dev_attr_sriov_drivers_autoprobe.attr, + NULL, +}; + +static umode_t sriov_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + + if (!dev_is_pf(dev)) + return 0; + + return a->mode; +} + +const struct attribute_group sriov_dev_attr_group = { + .attrs = sriov_dev_attrs, + .is_visible = sriov_attrs_are_visible, +}; + int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { return 0; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 5bb301efec98..868e35109284 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -548,151 +548,6 @@ static ssize_t devspec_show(struct device *dev, static DEVICE_ATTR_RO(devspec); #endif -#ifdef CONFIG_PCI_IOV -static ssize_t sriov_totalvfs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); -} - - -static ssize_t sriov_numvfs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->num_VFs); -} - -/* - * num_vfs > 0; number of VFs to enable - * num_vfs = 0; disable all VFs - * - * Note: SRIOV spec doesn't allow partial VF - * disable, so it's all or none. - */ -static ssize_t sriov_numvfs_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct pci_dev *pdev = to_pci_dev(dev); - int ret; - u16 num_vfs; - - ret = kstrtou16(buf, 0, &num_vfs); - if (ret < 0) - return ret; - - if (num_vfs > pci_sriov_get_totalvfs(pdev)) - return -ERANGE; - - device_lock(&pdev->dev); - - if (num_vfs == pdev->sriov->num_VFs) - goto exit; - - /* is PF driver loaded w/callback */ - if (!pdev->driver || !pdev->driver->sriov_configure) { - pci_info(pdev, "Driver doesn't support SRIOV configuration via sysfs\n"); - ret = -ENOENT; - goto exit; - } - - if (num_vfs == 0) { - /* disable VFs */ - ret = pdev->driver->sriov_configure(pdev, 0); - goto exit; - } - - /* enable VFs */ - if (pdev->sriov->num_VFs) { - pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n", - pdev->sriov->num_VFs, num_vfs); - ret = -EBUSY; - goto exit; - } - - ret = pdev->driver->sriov_configure(pdev, num_vfs); - if (ret < 0) - goto exit; - - if (ret != num_vfs) - pci_warn(pdev, "%d VFs requested; only %d enabled\n", - num_vfs, ret); - -exit: - device_unlock(&pdev->dev); - - if (ret < 0) - return ret; - - return count; -} - -static ssize_t sriov_offset_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->offset); -} - -static ssize_t sriov_stride_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->stride); -} - -static ssize_t sriov_vf_device_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%x\n", pdev->sriov->vf_device); -} - -static ssize_t sriov_drivers_autoprobe_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe); -} - -static ssize_t sriov_drivers_autoprobe_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct pci_dev *pdev = to_pci_dev(dev); - bool drivers_autoprobe; - - if (kstrtobool(buf, &drivers_autoprobe) < 0) - return -EINVAL; - - pdev->sriov->drivers_autoprobe = drivers_autoprobe; - - return count; -} - -static DEVICE_ATTR_RO(sriov_totalvfs); -static DEVICE_ATTR(sriov_numvfs, 0664, sriov_numvfs_show, sriov_numvfs_store); -static DEVICE_ATTR_RO(sriov_offset); -static DEVICE_ATTR_RO(sriov_stride); -static DEVICE_ATTR_RO(sriov_vf_device); -static DEVICE_ATTR(sriov_drivers_autoprobe, 0664, sriov_drivers_autoprobe_show, - sriov_drivers_autoprobe_store); -#endif /* CONFIG_PCI_IOV */ - static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -1691,34 +1546,6 @@ static const struct attribute_group pci_dev_hp_attr_group = { .is_visible = pci_dev_hp_attrs_are_visible, }; -#ifdef CONFIG_PCI_IOV -static struct attribute *sriov_dev_attrs[] = { - &dev_attr_sriov_totalvfs.attr, - &dev_attr_sriov_numvfs.attr, - &dev_attr_sriov_offset.attr, - &dev_attr_sriov_stride.attr, - &dev_attr_sriov_vf_device.attr, - &dev_attr_sriov_drivers_autoprobe.attr, - NULL, -}; - -static umode_t sriov_attrs_are_visible(struct kobject *kobj, - struct attribute *a, int n) -{ - struct device *dev = kobj_to_dev(kobj); - - if (!dev_is_pf(dev)) - return 0; - - return a->mode; -} - -static const struct attribute_group sriov_dev_attr_group = { - .attrs = sriov_dev_attrs, - .is_visible = sriov_attrs_are_visible, -}; -#endif /* CONFIG_PCI_IOV */ - static const struct attribute_group pci_dev_attr_group = { .attrs = pci_dev_dev_attrs, .is_visible = pci_dev_attrs_are_visible, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1be03a97cb92..061a935ac18e 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -433,7 +433,7 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); - +extern const struct attribute_group sriov_dev_attr_group; #else static inline int pci_iov_init(struct pci_dev *dev) { From 265a38d4611360ae3d5bb612d586a3126507a954 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:44:04 -0400 Subject: [PATCH 0216/2880] xprtrdma: Simplify rpcrdma_mr_pop Clean up: rpcrdma_mr_pop call sites check if the list is empty first. Let's replace the list_empty with less costly logic. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 12 ++++-------- net/sunrpc/xprtrdma/rpc_rdma.c | 7 +------ net/sunrpc/xprtrdma/verbs.c | 6 ++---- net/sunrpc/xprtrdma/xprt_rdma.h | 7 ++++--- 4 files changed, 11 insertions(+), 21 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 3a10bfff2125..d7e763fafa04 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -126,12 +126,10 @@ frwr_mr_recycle_worker(struct work_struct *work) */ void frwr_reset(struct rpcrdma_req *req) { - while (!list_empty(&req->rl_registered)) { - struct rpcrdma_mr *mr; + struct rpcrdma_mr *mr; - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) rpcrdma_mr_unmap_and_put(mr); - } } /** @@ -532,8 +530,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ frwr = NULL; prev = &first; - while (!list_empty(&req->rl_registered)) { - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; @@ -632,8 +629,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ frwr = NULL; prev = &first; - while (!list_empty(&req->rl_registered)) { - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 4345e6912392..0ac096a6348a 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -841,12 +841,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * chunks. Very likely the connection has been replaced, * so these registrations are invalid and unusable. */ - while (unlikely(!list_empty(&req->rl_registered))) { - struct rpcrdma_mr *mr; - - mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_recycle(mr); - } + frwr_reset(req); /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e004873cc4f0..ee6fcf10425b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1213,13 +1213,11 @@ struct rpcrdma_mr * rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_mr *mr = NULL; + struct rpcrdma_mr *mr; spin_lock(&buf->rb_mrlock); - if (!list_empty(&buf->rb_mrs)) - mr = rpcrdma_mr_pop(&buf->rb_mrs); + mr = rpcrdma_mr_pop(&buf->rb_mrs); spin_unlock(&buf->rb_mrlock); - if (!mr) goto out_nomrs; return mr; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5aaa53b8ae12..9663b8ddd733 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -338,7 +338,7 @@ rpcr_to_rdmar(const struct rpc_rqst *rqst) static inline void rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) { - list_add_tail(&mr->mr_list, list); + list_add(&mr->mr_list, list); } static inline struct rpcrdma_mr * @@ -346,8 +346,9 @@ rpcrdma_mr_pop(struct list_head *list) { struct rpcrdma_mr *mr; - mr = list_first_entry(list, struct rpcrdma_mr, mr_list); - list_del_init(&mr->mr_list); + mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list); + if (mr) + list_del_init(&mr->mr_list); return mr; } From 1ca3f4c054a4e3765bdeb62c849d940b5bc8002d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:44:50 -0400 Subject: [PATCH 0217/2880] xprtrdma: Combine rpcrdma_mr_put and rpcrdma_mr_unmap_and_put Clean up. There is only one remaining rpcrdma_mr_put call site, and it can be directly replaced with unmap_and_put because mr->mr_dir is set to DMA_NONE just before the call. Now all the call sites do a DMA unmap, and we can just rename mr_unmap_and_put to mr_put, which nicely matches mr_get. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 6 +++--- net/sunrpc/xprtrdma/verbs.c | 32 ++++++++------------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 3 files changed, 11 insertions(+), 28 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index d7e763fafa04..97e1804139b8 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -129,7 +129,7 @@ void frwr_reset(struct rpcrdma_req *req) struct rpcrdma_mr *mr; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); } /** @@ -453,7 +453,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); trace_xprtrdma_mr_remoteinv(mr); - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); break; /* only one invalidated MR per RPC */ } } @@ -463,7 +463,7 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) if (wc->status != IB_WC_SUCCESS) rpcrdma_mr_recycle(mr); else - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ee6fcf10425b..5e0b774ed522 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1233,34 +1233,15 @@ out_nomrs: return NULL; } -static void -__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr) -{ - spin_lock(&buf->rb_mrlock); - rpcrdma_mr_push(mr, &buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); -} - /** - * rpcrdma_mr_put - Release an rpcrdma_mr object - * @mr: object to release + * rpcrdma_mr_put - DMA unmap an MR and release it + * @mr: MR to release * */ -void -rpcrdma_mr_put(struct rpcrdma_mr *mr) -{ - __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr); -} - -/** - * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it - * @mr: object to release - * - */ -void -rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) +void rpcrdma_mr_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); @@ -1268,7 +1249,10 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } - __rpcrdma_mr_put(&r_xprt->rx_buf, mr); + + spin_lock(&buf->rb_mrlock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9663b8ddd733..3e0839c2cda2 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -485,7 +485,6 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); static inline void rpcrdma_mr_recycle(struct rpcrdma_mr *mr) From 3b39f52a02d4b3322744a0a32d59142e01afa435 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:45:37 -0400 Subject: [PATCH 0218/2880] xprtrdma: Move rpcrdma_mr_get out of frwr_map Refactor: Retrieve an MR and handle error recovery entirely in rpc_rdma.c, as this is not a device-specific function. Note that since commit 89f90fe1ad8b ("SUNRPC: Allow calls to xprt_transmit() to drain the entire transmit queue"), the xprt_transmit function handles the cond_resched. The transport no longer has to do this itself. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 29 ++++++++++++++++++++++++++++- net/sunrpc/xprtrdma/frwr_ops.c | 23 +++++------------------ net/sunrpc/xprtrdma/rpc_rdma.c | 30 ++++++++++++++++++++++++------ net/sunrpc/xprtrdma/verbs.c | 21 ++++----------------- net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++-- 5 files changed, 63 insertions(+), 44 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 6e6055eb67e7..83c4dfd7feea 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -464,7 +464,34 @@ TRACE_EVENT(xprtrdma_createmrs, ) ); -DEFINE_RXPRT_EVENT(xprtrdma_nomrs); +TRACE_EVENT(xprtrdma_nomrs, + TP_PROTO( + const struct rpcrdma_req *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(const void *, req) + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + ), + + TP_fast_assign( + const struct rpc_rqst *rqst = &req->rl_slot; + + __entry->req = req; + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + ), + + TP_printk("task:%u@%u xid=0x%08x req=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->req + ) +); DEFINE_RDCH_EVENT(read); DEFINE_WRCH_EVENT(write); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 97e1804139b8..362056f4f48d 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -291,31 +291,25 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt) * @nsegs: number of segments remaining * @writing: true when RDMA Write will be used * @xid: XID of RPC using the registered memory - * @out: initialized MR + * @mr: MR to fill in * * Prepare a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. * * Returns the next segment or a negative errno pointer. - * On success, the prepared MR is planted in @out. + * On success, @mr is filled in. */ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr **out) + struct rpcrdma_mr *mr) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; - struct rpcrdma_mr *mr; - struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; + struct ib_mr *ibmr; int i, n; u8 key; - mr = rpcrdma_mr_get(r_xprt); - if (!mr) - goto out_getmr_err; - if (nsegs > ia->ri_max_frwr_depth) nsegs = ia->ri_max_frwr_depth; for (i = 0; i < nsegs;) { @@ -330,7 +324,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, ++seg; ++i; - if (holes_ok) + if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS) continue; if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) @@ -365,22 +359,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_offset = ibmr->iova; trace_xprtrdma_mr_map(mr); - *out = mr; return seg; -out_getmr_err: - xprt_wait_for_buffer_space(&r_xprt->rx_xprt); - return ERR_PTR(-EAGAIN); - out_dmamap_err: mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); - rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_mapmr_err: trace_xprtrdma_frwr_maperr(mr, n); - rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0ac096a6348a..34772cb19286 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -342,6 +342,27 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return 0; } +static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, + struct rpcrdma_mr **mr) +{ + *mr = rpcrdma_mr_get(r_xprt); + if (!*mr) + goto out_getmr_err; + + rpcrdma_mr_push(*mr, &req->rl_registered); + return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); + +out_getmr_err: + trace_xprtrdma_nomrs(req); + xprt_wait_for_buffer_space(&r_xprt->rx_xprt); + if (r_xprt->rx_ep.rep_connected != -ENODEV) + schedule_work(&r_xprt->rx_buf.rb_refresh_worker); + return ERR_PTR(-EAGAIN); +} + /* Register and XDR encode the Read list. Supports encoding a list of read * segments that belong to a single read chunk. * @@ -379,10 +400,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return nsegs; do { - seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; @@ -440,10 +460,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; @@ -501,10 +520,9 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 5e0b774ed522..c9fa0f27b10a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -408,7 +408,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) struct rpcrdma_req *req; struct rpcrdma_rep *rep; - cancel_delayed_work_sync(&buf->rb_refresh_worker); + cancel_work_sync(&buf->rb_refresh_worker); /* This is similar to rpcrdma_ep_destroy, but: * - Don't cancel the connect worker. @@ -975,7 +975,7 @@ static void rpcrdma_mr_refresh_worker(struct work_struct *work) { struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_refresh_worker.work); + rb_refresh_worker); struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); @@ -1086,8 +1086,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) spin_lock_init(&buf->rb_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all_mrs); - INIT_DELAYED_WORK(&buf->rb_refresh_worker, - rpcrdma_mr_refresh_worker); + INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); rpcrdma_mrs_create(r_xprt); @@ -1177,7 +1176,7 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_refresh_worker); + cancel_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); @@ -1218,19 +1217,7 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) spin_lock(&buf->rb_mrlock); mr = rpcrdma_mr_pop(&buf->rb_mrs); spin_unlock(&buf->rb_mrlock); - if (!mr) - goto out_nomrs; return mr; - -out_nomrs: - trace_xprtrdma_nomrs(r_xprt); - if (r_xprt->rx_ep.rep_connected != -ENODEV) - schedule_delayed_work(&buf->rb_refresh_worker, 0); - - /* Allow the reply handler and refresh worker to run */ - cond_resched(); - - return NULL; } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 3e0839c2cda2..9573587ca602 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -379,7 +379,7 @@ struct rpcrdma_buffer { u32 rb_bc_srv_max_requests; u32 rb_bc_max_requests; - struct delayed_work rb_refresh_worker; + struct work_struct rb_refresh_worker; }; /* @@ -548,7 +548,7 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr **mr); + struct rpcrdma_mr *mr); int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); From 805a1f620ba38c5f6de8b9697f35dcb38d8112b5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:46:24 -0400 Subject: [PATCH 0219/2880] xprtrdma: Ensure creating an MR does not trigger FS writeback Probably would be good to also pass GFP flags to ib_alloc_mr. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 7 ++++--- net/sunrpc/xprtrdma/verbs.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 362056f4f48d..1f2e3dda7401 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -147,11 +147,14 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) struct ib_mr *frmr; int rc; + /* NB: ib_alloc_mr and device drivers typically allocate + * memory with GFP_KERNEL. + */ frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; - sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL); + sg = kcalloc(depth, sizeof(*sg), GFP_NOFS); if (!sg) goto out_list_err; @@ -171,8 +174,6 @@ out_mr_err: return rc; out_list_err: - dprintk("RPC: %s: sg allocation failure\n", - __func__); ib_dereg_mr(frmr); return -ENOMEM; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c9fa0f27b10a..cb6df58488bb 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -949,7 +949,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_mr *mr; int rc; - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = kzalloc(sizeof(*mr), GFP_NOFS); if (!mr) break; From be700103efd1050808db1cf00e52c3a2837bf802 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Thu, 15 Aug 2019 17:01:37 +0000 Subject: [PATCH 0220/2880] PCI: hv: Detect and fix Hyper-V PCI domain number collision Currently in Azure cloud, for passthrough devices, the host sets the device instance ID's bytes 8 - 15 to a value derived from the host HWID, which is the same on all devices in a VM. So, the device instance ID's bytes 8 and 9 provided by the host are no longer unique. This affects all Azure hosts since July 2018, and can cause device passthrough to VMs to fail because the bytes 8 and 9 are used as PCI domain number. Collision of domain numbers will cause the second device with the same domain number fail to load. In the cases of collision, we will detect and find another number that is not in use. Suggested-by: Michael Kelley Signed-off-by: Haiyang Zhang Signed-off-by: Lorenzo Pieralisi Acked-by: Sasha Levin --- drivers/pci/controller/pci-hyperv.c | 92 +++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 13 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 2b53976cd9f9..4caa3388692a 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus) complete(&hbus->remove_event); } +#define HVPCI_DOM_MAP_SIZE (64 * 1024) +static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); + +/* + * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 + * as invalid for passthrough PCI devices of this driver. + */ +#define HVPCI_DOM_INVALID 0 + +/** + * hv_get_dom_num() - Get a valid PCI domain number + * Check if the PCI domain number is in use, and return another number if + * it is in use. + * + * @dom: Requested domain number + * + * return: domain number on success, HVPCI_DOM_INVALID on failure + */ +static u16 hv_get_dom_num(u16 dom) +{ + unsigned int i; + + if (test_and_set_bit(dom, hvpci_dom_map) == 0) + return dom; + + for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { + if (test_and_set_bit(i, hvpci_dom_map) == 0) + return i; + } + + return HVPCI_DOM_INVALID; +} + +/** + * hv_put_dom_num() - Mark the PCI domain number as free + * @dom: Domain number to be freed + */ +static void hv_put_dom_num(u16 dom) +{ + clear_bit(dom, hvpci_dom_map); +} + /** * hv_pci_probe() - New VMBus channel probe, for a root PCI bus * @hdev: VMBus's tracking struct for this root PCI bus @@ -2521,6 +2563,7 @@ static int hv_pci_probe(struct hv_device *hdev, const struct hv_vmbus_device_id *dev_id) { struct hv_pcibus_device *hbus; + u16 dom_req, dom; int ret; /* @@ -2535,19 +2578,34 @@ static int hv_pci_probe(struct hv_device *hdev, hbus->state = hv_pcibus_init; /* - * The PCI bus "domain" is what is called "segment" in ACPI and - * other specs. Pull it from the instance ID, to get something - * unique. Bytes 8 and 9 are what is used in Windows guests, so - * do the same thing for consistency. Note that, since this code - * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee - * that (1) the only domain in use for something that looks like - * a physical PCI bus (which is actually emulated by the - * hypervisor) is domain 0 and (2) there will be no overlap - * between domains derived from these instance IDs in the same - * VM. + * The PCI bus "domain" is what is called "segment" in ACPI and other + * specs. Pull it from the instance ID, to get something usually + * unique. In rare cases of collision, we will find out another number + * not in use. + * + * Note that, since this code only runs in a Hyper-V VM, Hyper-V + * together with this guest driver can guarantee that (1) The only + * domain used by Gen1 VMs for something that looks like a physical + * PCI bus (which is actually emulated by the hypervisor) is domain 0. + * (2) There will be no overlap between domains (after fixing possible + * collisions) in the same VM. */ - hbus->sysdata.domain = hdev->dev_instance.b[9] | - hdev->dev_instance.b[8] << 8; + dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9]; + dom = hv_get_dom_num(dom_req); + + if (dom == HVPCI_DOM_INVALID) { + dev_err(&hdev->device, + "Unable to use dom# 0x%hx or other numbers", dom_req); + ret = -EINVAL; + goto free_bus; + } + + if (dom != dom_req) + dev_info(&hdev->device, + "PCI dom# 0x%hx has collision, using 0x%hx", + dom_req, dom); + + hbus->sysdata.domain = dom; hbus->hdev = hdev; refcount_set(&hbus->remove_lock, 1); @@ -2562,7 +2620,7 @@ static int hv_pci_probe(struct hv_device *hdev, hbus->sysdata.domain); if (!hbus->wq) { ret = -ENOMEM; - goto free_bus; + goto free_dom; } ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, @@ -2639,6 +2697,8 @@ close: vmbus_close(hdev->channel); destroy_wq: destroy_workqueue(hbus->wq); +free_dom: + hv_put_dom_num(hbus->sysdata.domain); free_bus: free_page((unsigned long)hbus); return ret; @@ -2720,6 +2780,9 @@ static int hv_pci_remove(struct hv_device *hdev) put_hvpcibus(hbus); wait_for_completion(&hbus->remove_event); destroy_workqueue(hbus->wq); + + hv_put_dom_num(hbus->sysdata.domain); + free_page((unsigned long)hbus); return 0; } @@ -2747,6 +2810,9 @@ static void __exit exit_hv_pci_drv(void) static int __init init_hv_pci_drv(void) { + /* Set the invalid domain number's bit, so it will not be used */ + set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); + return vmbus_driver_register(&hv_pci_drv); } From 5ae6393e6d41c7cc38c1ce679bc91c475412c624 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Mon, 19 Aug 2019 13:09:46 +0530 Subject: [PATCH 0221/2880] PCI: kirin: Make structure kirin_dw_pcie_ops constant Static variable kirin_dw_pcie_ops, of type dw_pcie_ops, is used only once, when it is assigned to the constant field ops of variable pci (having type dw_pcie) so kirin_dw_pcie_ops is never modified. Make it constant to protect it from unintended modification. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- drivers/pci/controller/dwc/pcie-kirin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 8df1914226be..c19617a912bd 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -436,7 +436,7 @@ static int kirin_pcie_host_init(struct pcie_port *pp) return 0; } -static struct dw_pcie_ops kirin_dw_pcie_ops = { +static const struct dw_pcie_ops kirin_dw_pcie_ops = { .read_dbi = kirin_pcie_read_dbi, .write_dbi = kirin_pcie_write_dbi, .link_up = kirin_pcie_link_up, From 6dc6ec9e04c468d994bff6eb660f3146f94cbfd9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:47:10 -0400 Subject: [PATCH 0222/2880] xprtrdma: Cache free MRs in each rpcrdma_req Instead of a globally-contended MR free list, cache MRs in each rpcrdma_req as they are released. This means acquiring and releasing an MR will be lock-free in the common case, even outside the transport send lock. The original idea of per-rpcrdma_req MR free lists was suggested by Shirley Ma several years ago. I just now figured out how to make that idea work with on-demand MR allocation. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 38 +++++++++++++++++++++++++++++++-- net/sunrpc/xprtrdma/frwr_ops.c | 9 +++++--- net/sunrpc/xprtrdma/rpc_rdma.c | 11 +++++++--- net/sunrpc/xprtrdma/verbs.c | 18 +++++++++++++--- net/sunrpc/xprtrdma/xprt_rdma.h | 7 +++--- 5 files changed, 69 insertions(+), 14 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 83c4dfd7feea..a13830616107 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -451,16 +451,50 @@ TRACE_EVENT(xprtrdma_createmrs, TP_STRUCT__entry( __field(const void *, r_xprt) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) __field(unsigned int, count) ), TP_fast_assign( __entry->r_xprt = r_xprt; __entry->count = count; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("r_xprt=%p: created %u MRs", - __entry->r_xprt, __entry->count + TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs", + __get_str(addr), __get_str(port), __entry->r_xprt, + __entry->count + ) +); + +TRACE_EVENT(xprtrdma_mr_get, + TP_PROTO( + const struct rpcrdma_req *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(const void *, req) + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + ), + + TP_fast_assign( + const struct rpc_rqst *rqst = &req->rl_slot; + + __entry->req = req; + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + ), + + TP_printk("task:%u@%u xid=0x%08x req=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->req ) ); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1f2e3dda7401..0e740bae2d80 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -488,8 +488,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_wake(wc, frwr); - complete(&frwr->fr_linv_done); __frwr_release_mr(wc, mr); + complete(&frwr->fr_linv_done); } /** @@ -587,11 +587,15 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_rep *rep = mr->mr_req->rl_reply; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_done(wc, frwr); - rpcrdma_complete_rqst(frwr->fr_req->rl_reply); __frwr_release_mr(wc, mr); + + /* Ensure @rep is generated before __frwr_release_mr */ + smp_rmb(); + rpcrdma_complete_rqst(rep); } /** @@ -624,7 +628,6 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; - frwr->fr_req = req; last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 34772cb19286..ffeb4dfebd46 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -348,9 +348,14 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, struct rpcrdma_mr **mr) { - *mr = rpcrdma_mr_get(r_xprt); - if (!*mr) - goto out_getmr_err; + *mr = rpcrdma_mr_pop(&req->rl_free_mrs); + if (!*mr) { + *mr = rpcrdma_mr_get(r_xprt); + if (!*mr) + goto out_getmr_err; + trace_xprtrdma_mr_get(req); + (*mr)->mr_req = req; + } rpcrdma_mr_push(*mr, &req->rl_registered); return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index cb6df58488bb..69753ec73c36 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -77,6 +77,7 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); +static void rpcrdma_mr_free(struct rpcrdma_mr *mr); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -1022,6 +1023,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, if (!req->rl_recvbuf) goto out4; + INIT_LIST_HEAD(&req->rl_free_mrs); INIT_LIST_HEAD(&req->rl_registered); spin_lock(&buffer->rb_lock); list_add(&req->rl_all, &buffer->rb_allreqs); @@ -1130,11 +1132,13 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) * This function assumes that the caller prevents concurrent device * unload and transport tear-down. */ -void -rpcrdma_req_destroy(struct rpcrdma_req *req) +void rpcrdma_req_destroy(struct rpcrdma_req *req) { list_del(&req->rl_all); + while (!list_empty(&req->rl_free_mrs)) + rpcrdma_mr_free(rpcrdma_mr_pop(&req->rl_free_mrs)); + rpcrdma_regbuf_free(req->rl_recvbuf); rpcrdma_regbuf_free(req->rl_sendbuf); rpcrdma_regbuf_free(req->rl_rdmabuf); @@ -1228,7 +1232,6 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) void rpcrdma_mr_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); @@ -1237,6 +1240,15 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr) mr->mr_dir = DMA_NONE; } + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); +} + +static void rpcrdma_mr_free(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + + mr->mr_req = NULL; spin_lock(&buf->rb_mrlock); rpcrdma_mr_push(mr, &buf->rb_mrs); spin_unlock(&buf->rb_mrlock); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9573587ca602..c375b0e434ac 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -234,20 +234,20 @@ struct rpcrdma_sendctx { * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). */ -struct rpcrdma_req; struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; struct completion fr_linv_done; - struct rpcrdma_req *fr_req; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; }; }; +struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; + struct rpcrdma_req *mr_req; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; @@ -325,7 +325,8 @@ struct rpcrdma_req { struct list_head rl_all; struct kref rl_kref; - struct list_head rl_registered; /* registered segments */ + struct list_head rl_free_mrs; + struct list_head rl_registered; struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; From 4d6b8890ddb16120c5dd25edc8d215cfd8222b63 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:47:57 -0400 Subject: [PATCH 0223/2880] xprtrdma: Remove rpcrdma_buffer::rb_mrlock Clean up: Now that the free list is used sparingly, get rid of the separate spin lock protecting it. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 4 ++-- net/sunrpc/xprtrdma/verbs.c | 21 ++++++++++----------- net/sunrpc/xprtrdma/xprt_rdma.h | 9 ++++----- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 0e740bae2d80..368cdf3edfc9 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -106,10 +106,10 @@ frwr_mr_recycle_worker(struct work_struct *work) mr->mr_dir = DMA_NONE; } - spin_lock(&r_xprt->rx_buf.rb_mrlock); + spin_lock(&r_xprt->rx_buf.rb_lock); list_del(&mr->mr_all); r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_mrlock); + spin_unlock(&r_xprt->rx_buf.rb_lock); frwr_release_mr(mr); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 69753ec73c36..52444c4d1be2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -962,10 +962,10 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) mr->mr_xprt = r_xprt; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); list_add(&mr->mr_list, &buf->rb_mrs); list_add(&mr->mr_all, &buf->rb_all_mrs); - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); } r_xprt->rx_stats.mrs_allocated += count; @@ -1084,7 +1084,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all_mrs); @@ -1154,18 +1153,18 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) unsigned int count; count = 0; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); while ((mr = list_first_entry_or_null(&buf->rb_all_mrs, struct rpcrdma_mr, mr_all)) != NULL) { list_del(&mr->mr_all); - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); frwr_release_mr(mr); count++; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); } - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); r_xprt->rx_stats.mrs_allocated = 0; } @@ -1218,9 +1217,9 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_mr *mr; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); mr = rpcrdma_mr_pop(&buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); return mr; } @@ -1249,9 +1248,9 @@ static void rpcrdma_mr_free(struct rpcrdma_mr *mr) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; mr->mr_req = NULL; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); rpcrdma_mr_push(mr, &buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c375b0e434ac..200d075bbe31 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -360,19 +360,18 @@ rpcrdma_mr_pop(struct list_head *list) * One of these is associated with a transport instance */ struct rpcrdma_buffer { - spinlock_t rb_mrlock; /* protect rb_mrs list */ + spinlock_t rb_lock; + struct list_head rb_send_bufs; + struct list_head rb_recv_bufs; struct list_head rb_mrs; - struct list_head rb_all_mrs; unsigned long rb_sc_head; unsigned long rb_sc_tail; unsigned long rb_sc_last; struct rpcrdma_sendctx **rb_sc_ctxs; - spinlock_t rb_lock; /* protect buf lists */ - struct list_head rb_send_bufs; - struct list_head rb_recv_bufs; struct list_head rb_allreqs; + struct list_head rb_all_mrs; u32 rb_max_requests; u32 rb_credits; /* most recent credit grant */ From b0b227f071a06d0ce5ef803538899299933d2931 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:48:43 -0400 Subject: [PATCH 0224/2880] xprtrdma: Use an llist to manage free rpcrdma_reps rpcrdma_rep objects are removed from their free list by only a single thread: the Receive completion handler. Thus that free list can be converted to an llist, where a single-threaded consumer and a multi-threaded producer (rpcrdma_buffer_put) can both access the llist without the need for any serialization. This eliminates spin lock contention between the Receive completion handler and rpcrdma_buffer_get, and makes the rep consumer wait- free. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 106 +++++++++++++++----------------- net/sunrpc/xprtrdma/xprt_rdma.h | 6 +- 2 files changed, 53 insertions(+), 59 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 52444c4d1be2..db90083ed35b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -75,6 +75,7 @@ * internal functions */ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); +static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_mr_free(struct rpcrdma_mr *mr); @@ -407,7 +408,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_req *req; - struct rpcrdma_rep *rep; cancel_work_sync(&buf->rb_refresh_worker); @@ -431,8 +431,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) /* The ULP is responsible for ensuring all DMA * mappings and MRs are gone. */ - list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) - rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); + rpcrdma_reps_destroy(buf); list_for_each_entry(req, &buf->rb_allreqs, rl_all) { rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf); rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); @@ -1071,6 +1070,40 @@ out: return NULL; } +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +{ + rpcrdma_regbuf_free(rep->rr_rdmabuf); + kfree(rep); +} + +static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) +{ + struct llist_node *node; + + /* Calls to llist_del_first are required to be serialized */ + node = llist_del_first(&buf->rb_free_reps); + if (!node) + return NULL; + return llist_entry(node, struct rpcrdma_rep, rr_node); +} + +static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, + struct rpcrdma_rep *rep) +{ + if (!rep->rr_temp) + llist_add(&rep->rr_node, &buf->rb_free_reps); + else + rpcrdma_rep_destroy(rep); +} + +static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_rep *rep; + + while ((rep = rpcrdma_rep_get_locked(buf)) != NULL) + rpcrdma_rep_destroy(rep); +} + /** * rpcrdma_buffer_create - Create initial set of req/rep objects * @r_xprt: transport instance to (re)initialize @@ -1106,7 +1139,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } buf->rb_credits = 1; - INIT_LIST_HEAD(&buf->rb_recv_bufs); + init_llist_head(&buf->rb_free_reps); rc = rpcrdma_sendctxs_create(r_xprt); if (rc) @@ -1118,12 +1151,6 @@ out: return rc; } -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) -{ - rpcrdma_regbuf_free(rep->rr_rdmabuf); - kfree(rep); -} - /** * rpcrdma_req_destroy - Destroy an rpcrdma_req object * @req: unused object to be destroyed @@ -1182,15 +1209,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) cancel_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); - - while (!list_empty(&buf->rb_recv_bufs)) { - struct rpcrdma_rep *rep; - - rep = list_first_entry(&buf->rb_recv_bufs, - struct rpcrdma_rep, rr_list); - list_del(&rep->rr_list); - rpcrdma_rep_destroy(rep); - } + rpcrdma_reps_destroy(buf); while (!list_empty(&buf->rb_send_bufs)) { struct rpcrdma_req *req; @@ -1281,39 +1300,24 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) */ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { - struct rpcrdma_rep *rep = req->rl_reply; - + if (req->rl_reply) + rpcrdma_rep_put(buffers, req->rl_reply); req->rl_reply = NULL; spin_lock(&buffers->rb_lock); list_add(&req->rl_list, &buffers->rb_send_bufs); - if (rep) { - if (!rep->rr_temp) { - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - rep = NULL; - } - } spin_unlock(&buffers->rb_lock); - if (rep) - rpcrdma_rep_destroy(rep); } -/* - * Put reply buffers back into pool when not attached to - * request. This happens in error conditions. +/** + * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list + * @rep: rep to release + * + * Used after error conditions. */ -void -rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) { - struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; - - if (!rep->rr_temp) { - spin_lock(&buffers->rb_lock); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock(&buffers->rb_lock); - } else { - rpcrdma_rep_destroy(rep); - } + rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep); } /* Returns a pointer to a rpcrdma_regbuf object, or NULL. @@ -1469,22 +1473,10 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) /* fast path: all needed reps can be found on the free list */ wr = NULL; - spin_lock(&buf->rb_lock); while (needed) { - rep = list_first_entry_or_null(&buf->rb_recv_bufs, - struct rpcrdma_rep, rr_list); + rep = rpcrdma_rep_get_locked(buf); if (!rep) - break; - - list_del(&rep->rr_list); - rep->rr_recv_wr.next = wr; - wr = &rep->rr_recv_wr; - --needed; - } - spin_unlock(&buf->rb_lock); - - while (needed) { - rep = rpcrdma_rep_create(r_xprt, temp); + rep = rpcrdma_rep_create(r_xprt, temp); if (!rep) break; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 200d075bbe31..bd1befa83d24 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -47,6 +47,7 @@ #include /* atomic_t, etc */ #include /* struct kref */ #include /* struct work_struct */ +#include #include /* RDMA connection api */ #include /* RDMA verbs api */ @@ -200,7 +201,7 @@ struct rpcrdma_rep { struct rpc_rqst *rr_rqst; struct xdr_buf rr_hdrbuf; struct xdr_stream rr_stream; - struct list_head rr_list; + struct llist_node rr_node; struct ib_recv_wr rr_recv_wr; }; @@ -362,7 +363,6 @@ rpcrdma_mr_pop(struct list_head *list) struct rpcrdma_buffer { spinlock_t rb_lock; struct list_head rb_send_bufs; - struct list_head rb_recv_bufs; struct list_head rb_mrs; unsigned long rb_sc_head; @@ -373,6 +373,8 @@ struct rpcrdma_buffer { struct list_head rb_allreqs; struct list_head rb_all_mrs; + struct llist_head rb_free_reps; + u32 rb_max_requests; u32 rb_credits; /* most recent credit grant */ From 2a7f77c7be1b5adb69712e440e97e0c6fa7ebecb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:49:30 -0400 Subject: [PATCH 0225/2880] xprtrdma: Clean up xprt_rdma_set_connect_timeout() Clean up: The function name should match the documenting comment. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index f4763e8a6761..993b96ff6760 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -494,9 +494,9 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) * @reconnect_timeout: reconnect timeout after server disconnects * */ -static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt, - unsigned long connect_timeout, - unsigned long reconnect_timeout) +static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt, + unsigned long connect_timeout, + unsigned long reconnect_timeout) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -805,7 +805,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = { .send_request = xprt_rdma_send_request, .close = xprt_rdma_close, .destroy = xprt_rdma_destroy, - .set_connect_timeout = xprt_rdma_tcp_set_connect_timeout, + .set_connect_timeout = xprt_rdma_set_connect_timeout, .print_stats = xprt_rdma_print_stats, .enable_swap = xprt_rdma_enable_swap, .disable_swap = xprt_rdma_disable_swap, From df901c85cc28b538c62f6bc20b16a8bd05fcb756 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Sat, 13 Jul 2019 22:11:29 +0800 Subject: [PATCH 0226/2880] PCI: mobiveil: Fix the CPU base address setup in inbound window Current code erroneously sets-up the CPU base address through the parameter 'pci_addr', which is passed to initialize the CPU (AXI) base address of the inbound window where the controller maps the PCI address space into CPU physical address space; furthermore, it also truncates it by programming only the lower 32-bit value into the inbound CPU address register. Fix both issues by introducing a new parameter 'u64 cpu_addr' to initialize both lower 32-bit and upper 32-bit of the CPU physical base address mapping PCI inbound transactions into CPU (AXI) ones. Fixes: 9af6bcb11e12 ("PCI: mobiveil: Add Mobiveil PCIe Host Bridge IP driver") Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Minghuan Lian Reviewed-by: Subrahmanya Lingappa --- drivers/pci/controller/pcie-mobiveil.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/pcie-mobiveil.c b/drivers/pci/controller/pcie-mobiveil.c index 672e633601c7..a45a6447b01d 100644 --- a/drivers/pci/controller/pcie-mobiveil.c +++ b/drivers/pci/controller/pcie-mobiveil.c @@ -88,6 +88,7 @@ #define AMAP_CTRL_TYPE_MASK 3 #define PAB_EXT_PEX_AMAP_SIZEN(win) PAB_EXT_REG_ADDR(0xbef0, win) +#define PAB_EXT_PEX_AMAP_AXI_WIN(win) PAB_EXT_REG_ADDR(0xb4a0, win) #define PAB_PEX_AMAP_AXI_WIN(win) PAB_REG_ADDR(0x4ba4, win) #define PAB_PEX_AMAP_PEX_WIN_L(win) PAB_REG_ADDR(0x4ba8, win) #define PAB_PEX_AMAP_PEX_WIN_H(win) PAB_REG_ADDR(0x4bac, win) @@ -462,7 +463,7 @@ static int mobiveil_pcie_parse_dt(struct mobiveil_pcie *pcie) } static void program_ib_windows(struct mobiveil_pcie *pcie, int win_num, - u64 pci_addr, u32 type, u64 size) + u64 cpu_addr, u64 pci_addr, u32 type, u64 size) { u32 value; u64 size64 = ~(size - 1); @@ -482,7 +483,10 @@ static void program_ib_windows(struct mobiveil_pcie *pcie, int win_num, csr_writel(pcie, upper_32_bits(size64), PAB_EXT_PEX_AMAP_SIZEN(win_num)); - csr_writel(pcie, pci_addr, PAB_PEX_AMAP_AXI_WIN(win_num)); + csr_writel(pcie, lower_32_bits(cpu_addr), + PAB_PEX_AMAP_AXI_WIN(win_num)); + csr_writel(pcie, upper_32_bits(cpu_addr), + PAB_EXT_PEX_AMAP_AXI_WIN(win_num)); csr_writel(pcie, lower_32_bits(pci_addr), PAB_PEX_AMAP_PEX_WIN_L(win_num)); @@ -624,7 +628,7 @@ static int mobiveil_host_init(struct mobiveil_pcie *pcie) CFG_WINDOW_TYPE, resource_size(pcie->ob_io_res)); /* memory inbound translation window */ - program_ib_windows(pcie, WIN_NUM_0, 0, MEM_WINDOW_TYPE, IB_WIN_SIZE); + program_ib_windows(pcie, WIN_NUM_0, 0, 0, MEM_WINDOW_TYPE, IB_WIN_SIZE); /* Get the I/O and memory ranges from DT */ resource_list_for_each_entry(win, &pcie->resources) { From 17d47f93bc69cc629aa6a20c0ac7b20c9965c116 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:50:16 -0400 Subject: [PATCH 0227/2880] xprtrdma: Fix bc_max_slots return value For the moment the returned value just happens to be correct because the current backchannel server implementation does not vary the number of credits it offers. The spec does permit this value to change during the lifetime of a connection, however. The actual maximum is fixed for all RPC/RDMA transports, because each transport instance has to pre-allocate the resources for processing BC requests. That's the value that should be returned. Fixes: 7402a4fedc2b ("SUNRPC: Fix up backchannel slot table ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 59e624b1d7a0..50e075fcdd8f 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -54,9 +54,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt) { - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - - return r_xprt->rx_buf.rb_bc_srv_max_requests; + return RPCRDMA_BACKWARD_WRS >> 1; } static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) From 1738de336ebc9f8d4bb1b3126c5417a5923a660a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:51:03 -0400 Subject: [PATCH 0228/2880] xprtrdma: Inline XDR chunk encoder functions Micro-optimization: Save the cost of three function calls during transport header encoding. These were "noinline" before to generate more meaningful call stacks during debugging, but this code is now pretty stable. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ffeb4dfebd46..67e1684aee6d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -382,9 +382,10 @@ out_getmr_err: * * Only a single @pos value is currently supported. */ -static noinline int -rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) +static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype rtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -436,9 +437,10 @@ done: * * Only a single Write chunk is currently supported. */ -static noinline int -rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -498,9 +500,10 @@ done: * Returns zero on success, or a negative errno if a failure occurred. * @xdr is advanced to the next position in the stream. */ -static noinline int -rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; From 435eba4ae0692e2f3d62988f8648efd65c935b6a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:51:49 -0400 Subject: [PATCH 0229/2880] xprtrdma: Optimize rpcrdma_post_recvs() Micro-optimization: In rpcrdma_post_recvs, since commit e340c2d6ef2a ("xprtrdma: Reduce the doorbell rate (Receive)"), the common case is to return without doing anything. Found with perf. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index db90083ed35b..ac2abf4578b9 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1465,7 +1465,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (ep->rep_receive_count > needed) + if (likely(ep->rep_receive_count > needed)) goto out; needed -= ep->rep_receive_count; if (!temp) From 1e672e3644940d83bd94e7cb46bac6bb3627de02 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Tue, 20 Aug 2019 22:21:21 -0500 Subject: [PATCH 0230/2880] NFSv4: Fix a memory leak bug In nfs4_try_migration(), if nfs4_begin_drain_session() fails, the previously allocated 'page' and 'locations' are not deallocated, leading to memory leaks. To fix this issue, go to the 'out' label to free 'page' and 'locations' before returning the error. Signed-off-by: Wenwen Wang Signed-off-by: Anna Schumaker --- fs/nfs/nfs4state.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cad4e064b328..e916aba7a799 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2095,8 +2095,10 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } status = nfs4_begin_drain_session(clp); - if (status != 0) - return status; + if (status != 0) { + result = status; + goto out; + } status = nfs4_replace_transport(server, locations); if (status != 0) { From 4ad35c1f56386c8e7019c921bba1af109fde9693 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 21 Aug 2019 19:15:52 +0800 Subject: [PATCH 0231/2880] csky: Fixup 610 vipt cache flush mechanism 610 has vipt aliasing issue, so we need to finish the cache flush apis mentioned in cachetlb.rst to avoid data corruption. Here is the list of modified apis in the patch: - flush_kernel_dcache_page (new add) - flush_dcache_mmap_lock (new add) - flush_dcache_mmap_unlock (new add) - flush_kernel_vmap_range (new add) - invalidate_kernel_vmap_range (new add) - flush_anon_page (new add) - flush_cache_range (new add) - flush_cache_vmap (flush all) - flush_cache_vunmap (flush all) - flush_cache_mm (only dcache flush) - flush_icache_page (just nop) - copy_from_user_page (remove no need flush) - copy_to_user_page (remove no need flush) Change to V2: - Fixup compile error with xa_lock*(&mapping->i_pages) Signed-off-by: Guo Ren Cc: Arnd Bergmann Cc: Christoph Hellwig --- arch/csky/abiv1/cacheflush.c | 20 ++++++++++++++ arch/csky/abiv1/inc/abi/cacheflush.h | 41 ++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index fee99fc6612f..9f1fe80cc847 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -54,3 +54,23 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, icache_inv_all(); } } + +void flush_kernel_dcache_page(struct page *page) +{ + struct address_space *mapping; + + mapping = page_mapping_file(page); + + if (!mapping || mapping_mapped(mapping)) + dcache_wbinv_all(); +} +EXPORT_SYMBOL(flush_kernel_dcache_page); + +void flush_cache_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + dcache_wbinv_all(); + + if (vma->vm_flags & VM_EXEC) + icache_inv_all(); +} diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index fce5604cef40..79ef9e8c1afd 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -4,26 +4,49 @@ #ifndef __ABI_CSKY_CACHEFLUSH_H #define __ABI_CSKY_CACHEFLUSH_H -#include +#include #include #include #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); -#define flush_cache_mm(mm) cache_wbinv_all() +#define flush_cache_mm(mm) dcache_wbinv_all() #define flush_cache_page(vma, page, pfn) cache_wbinv_all() #define flush_cache_dup_mm(mm) cache_wbinv_all() +#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +extern void flush_kernel_dcache_page(struct page *); + +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) + +static inline void flush_kernel_vmap_range(void *addr, int size) +{ + dcache_wbinv_all(); +} +static inline void invalidate_kernel_vmap_range(void *addr, int size) +{ + dcache_wbinv_all(); +} + +#define ARCH_HAS_FLUSH_ANON_PAGE +static inline void flush_anon_page(struct vm_area_struct *vma, + struct page *page, unsigned long vmaddr) +{ + if (PageAnon(page)) + cache_wbinv_all(); +} + /* * if (current_mm != vma->mm) cache_wbinv_range(start, end) will be broken. * Use cache_wbinv_all() here and need to be improved in future. */ -#define flush_cache_range(vma, start, end) cache_wbinv_all() -#define flush_cache_vmap(start, end) cache_wbinv_range(start, end) -#define flush_cache_vunmap(start, end) cache_wbinv_range(start, end) +extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +#define flush_cache_vmap(start, end) cache_wbinv_all() +#define flush_cache_vunmap(start, end) cache_wbinv_all() -#define flush_icache_page(vma, page) cache_wbinv_all() +#define flush_icache_page(vma, page) do {} while (0); #define flush_icache_range(start, end) cache_wbinv_range(start, end) #define flush_icache_user_range(vma,page,addr,len) \ @@ -31,19 +54,13 @@ extern void flush_dcache_page(struct page *); #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ do { \ - cache_wbinv_all(); \ memcpy(dst, src, len); \ - cache_wbinv_all(); \ } while (0) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ - cache_wbinv_all(); \ memcpy(dst, src, len); \ cache_wbinv_all(); \ } while (0) -#define flush_dcache_mmap_lock(mapping) do {} while (0) -#define flush_dcache_mmap_unlock(mapping) do {} while (0) - #endif /* __ABI_CSKY_CACHEFLUSH_H */ From 48c058543cbbb6b34114bbf8f0967e86dcd06b14 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 15:27:00 -0400 Subject: [PATCH 0232/2880] NFS: Add an nfs4_call_sync_custom() function There are a few cases where we need to manually configure the rpc_task_setup structure to get the behavior we want. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1406858bae6c..e5b6499c0b8b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1073,14 +1073,26 @@ static const struct rpc_call_ops nfs40_call_sync_ops = { .rpc_call_done = nfs40_call_sync_done, }; +static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup) +{ + int ret; + struct rpc_task *task; + + task = rpc_run_task(task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res) { - int ret; - struct rpc_task *task; struct nfs_client *clp = server->nfs_client; struct nfs4_call_sync_data data = { .seq_server = server, @@ -1094,14 +1106,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .callback_data = &data }; - task = rpc_run_task(&task_setup); - if (IS_ERR(task)) - ret = PTR_ERR(task); - else { - ret = task->tk_status; - rpc_put_task(task); - } - return ret; + return nfs4_call_sync_custom(&task_setup); } int nfs4_call_sync(struct rpc_clnt *clnt, From dae40965d51e563603222aa8d4fad7ab3cec0e2d Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 15:28:28 -0400 Subject: [PATCH 0233/2880] NFS: Have nfs4_proc_setclientid() call nfs4_call_sync_custom() Rather than running the task manually Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e5b6499c0b8b..234312240f33 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6023,7 +6023,6 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_resp = res, .rpc_cred = cred, }; - struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, @@ -6056,17 +6055,12 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = task->tk_status; + + status = nfs4_call_sync_custom(&task_setup_data); if (setclientid.sc_cred) { clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); put_rpccred(setclientid.sc_cred); } - rpc_put_task(task); out: trace_nfs4_setclientid(clp, status); dprintk("NFS reply setclientid: %d\n", status); From 50493364e784b6e4077e41ab10eba4f798d13d9a Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 15:30:16 -0400 Subject: [PATCH 0234/2880] NFS: Have _nfs4_proc_secinfo() call nfs4_call_sync_custom() We do this to set the RPC_TASK_NO_ROUND_ROBIN flag in the task_setup structure Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 234312240f33..de2b3fd806ef 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7644,6 +7644,8 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) { int status; + struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs_client *clp = NFS_SERVER(dir)->nfs_client; struct nfs4_secinfo_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -7656,26 +7658,37 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct .rpc_argp = &args, .rpc_resp = &res, }; - struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs4_call_sync_data data = { + .seq_server = NFS_SERVER(dir), + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = clp->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; if (use_integrity) { - clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; - cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + clnt = clp->cl_rpcclient; + task_setup.rpc_client = clnt; + + cred = nfs4_get_clid_cred(clp); msg.rpc_cred = cred; } dprintk("NFS call secinfo %s\n", name->name); - nfs4_state_protect(NFS_SERVER(dir)->nfs_client, - NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); - status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); dprintk("NFS reply secinfo: %d\n", status); put_cred(cred); - return status; } From 4c952e3d1b0dc1835fda7f191b6f92e28b2fa435 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 15:46:48 -0400 Subject: [PATCH 0235/2880] NFS: Have nfs41_proc_reclaim_complete() call nfs4_call_sync_custom() An async call followed by an rpc_wait_for_completion() is basically the same as a synchronous call, so we can use nfs4_call_sync_custom() to keep our custom callback ops and the RPC_TASK_NO_ROUND_ROBIN flag. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index de2b3fd806ef..1b7863ec12d3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8857,7 +8857,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, const struct cred *cred) { struct nfs4_reclaim_complete_data *calldata; - struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], .rpc_cred = cred, @@ -8866,7 +8865,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_reclaim_complete_call_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN, + .flags = RPC_TASK_NO_ROUND_ROBIN, }; int status = -ENOMEM; @@ -8881,15 +8880,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = rpc_wait_for_completion_task(task); - if (status == 0) - status = task->tk_status; - rpc_put_task(task); + status = nfs4_call_sync_custom(&task_setup_data); out: dprintk("<-- %s status=%d\n", __func__, status); return status; From cc15e24a3af249455158bdb2ad02b9685eb4d6e9 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 16:22:31 -0400 Subject: [PATCH 0236/2880] NFS: Have nfs41_proc_secinfo_no_name() call nfs4_call_sync_custom() We need to use the custom rpc_task_setup here to set the RPC_TASK_NO_ROUND_ROBIN flag on the RPC call. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1b7863ec12d3..df12af8f6b36 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9365,18 +9365,32 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; struct rpc_clnt *clnt = server->client; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = server->nfs_client->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; int status; if (use_integrity) { clnt = server->nfs_client->cl_rpcclient; + task_setup.rpc_client = clnt; + cred = nfs4_get_clid_cred(server->nfs_client); msg.rpc_cred = cred; } dprintk("--> %s\n", __func__); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); dprintk("<-- %s status=%d\n", __func__, status); put_cred(cred); From f836b27ecad98770bc53600d1ebd6533c921a4e7 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 19 Aug 2019 10:18:28 -0400 Subject: [PATCH 0237/2880] NFS: Have nfs4_proc_get_lease_time() call nfs4_call_sync_custom() This removes some code duplication, since both functions were doing the same thing. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index df12af8f6b36..00c7a92e3d6b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8356,7 +8356,6 @@ static const struct rpc_call_ops nfs4_get_lease_time_ops = { int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) { - struct rpc_task *task; struct nfs4_get_lease_time_args args; struct nfs4_get_lease_time_res res = { .lr_fsinfo = fsinfo, @@ -8378,17 +8377,9 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) .callback_data = &data, .flags = RPC_TASK_TIMEOUT, }; - int status; nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1); - task = rpc_run_task(&task_setup); - - if (IS_ERR(task)) - return PTR_ERR(task); - - status = task->tk_status; - rpc_put_task(task); - return status; + return nfs4_call_sync_custom(&task_setup); } #ifdef CONFIG_NFS_V4_1 From ee4ea764ea03253016034603cab352c0798ce173 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 20 Aug 2019 07:28:43 +0000 Subject: [PATCH 0238/2880] dt-bindings: PCI: designware: Remove the num-lanes from Required properties The num-lanes is not a mandatory property, e.g. on FSL Layerscape SoCs, the PCIe link training is completed automatically based on the selected SerDes protocol, it does not need the num-lanes to set-up the link width. Currently it is both a Required and Optional property, let's remove it from the Required properties. Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- Documentation/devicetree/bindings/pci/designware-pcie.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt b/Documentation/devicetree/bindings/pci/designware-pcie.txt index 5561a1c060d0..bd880df39a79 100644 --- a/Documentation/devicetree/bindings/pci/designware-pcie.txt +++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt @@ -11,7 +11,6 @@ Required properties: the ATU address space. (The old way of getting the configuration address space from "ranges" is deprecated and should be avoided.) -- num-lanes: number of lanes to use RC mode: - #address-cells: set to <3> - #size-cells: set to <2> From 66de33f09fd97201847de7e1e2ec8a117242e1d6 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 20 Aug 2019 07:28:49 +0000 Subject: [PATCH 0239/2880] PCI: dwc: Return directly when num-lanes is not found The num-lanes is optional since it is not needed on some platforms that bring up the link in firmware. The link programming is based on the num-lanes properties (which is optional); if it is not present code must return instead of fiddling with the lanes value to print an error message. Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- drivers/pci/controller/dwc/pcie-designware.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 7d25102c304c..0a89bfd1636e 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -423,8 +423,10 @@ void dw_pcie_setup(struct dw_pcie *pci) ret = of_property_read_u32(np, "num-lanes", &lanes); - if (ret) - lanes = 0; + if (ret) { + dev_dbg(pci->dev, "property num-lanes isn't found\n"); + return; + } /* Set the number of lanes */ val = dw_pcie_readl_dbi(pci, PCIE_PORT_LINK_CONTROL); From 568adba9eb201875bc561745e0f19b831b7e2bbc Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 20 Aug 2019 07:28:55 +0000 Subject: [PATCH 0240/2880] ARM: dts: ls1021a: Remove num-lanes property from PCIe nodes Remove the num-lanes property to avoid the driver setting the link width. On FSL Layerscape SoCs, the number of lanes assigned to PCIe controller is not fixed, it is determined by the selected SerDes protocol in the RCW (Reset Configuration Word). The PCIe link training is completed automatically through the selected SerDes protocol - the link width set-up is updated by hardware after power on reset, so the num-lanes property is not needed for Layerscape PCIe. The current num-lanes property was added erroneously, which actually indicates the maximum lanes the PCIe controller can support up to, instead of the lanes assigned to the PCIe controller. The link width set by SerDes protocol will be overridden by the num-lanes property, hence the subsequent re-training will fail when the assigned lanes do not match the value in the num-lanes property. Remove the property to fix the issue. Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- arch/arm/boot/dts/ls1021a.dtsi | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi index 464df4290ffc..2f6977ada447 100644 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@ -874,7 +874,6 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -899,7 +898,6 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ From 4035ff36a6e0a07a1f0b0609520e17bd69a74c8b Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 20 Aug 2019 07:29:01 +0000 Subject: [PATCH 0241/2880] arm64: dts: fsl: Remove num-lanes property from PCIe nodes Remove the num-lanes property to avoid the driver setting the link width. On FSL Layerscape SoCs, the number of lanes assigned to PCIe controller is not fixed, it is determined by the selected SerDes protocol in the RCW (Reset Configuration Word). The PCIe link training is completed automatically through the selected SerDes protocol - the link width set-up is updated by hardware after power on reset, so the num-lanes property is not needed for Layerscape PCIe. The current num-lanes property was added erroneously, which actually indicates the maximum lanes the PCIe controller can support up to, instead of the lanes assigned to the PCIe controller. The link width set by SerDes protocol will be overridden by the num-lanes property, hence the subsequent re-training will fail when the assigned lanes do not match the value in the num-lanes property. Remove the property to fix the issue Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi | 1 - arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi | 3 --- arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 6 ------ arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi | 3 --- arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 4 ---- 5 files changed, 17 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi index ec6257a5b251..119c597ca867 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi @@ -486,7 +486,6 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; - num-lanes = <4>; num-viewport = <2>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 71d9ed9ff985..c084c7a4b6a6 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -677,7 +677,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -704,7 +703,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -731,7 +729,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index b0ef08b090dd..d4c1da3d4bde 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -649,7 +649,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <8>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -671,7 +670,6 @@ reg-names = "regs", "addr_space"; num-ib-windows = <6>; num-ob-windows = <8>; - num-lanes = <2>; status = "disabled"; }; @@ -687,7 +685,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <8>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -709,7 +706,6 @@ reg-names = "regs", "addr_space"; num-ib-windows = <6>; num-ob-windows = <8>; - num-lanes = <2>; status = "disabled"; }; @@ -725,7 +721,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <8>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -747,7 +742,6 @@ reg-names = "regs", "addr_space"; num-ib-windows = <6>; num-ob-windows = <8>; - num-lanes = <2>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi index dacd8cf03a7f..ce48a2323337 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi @@ -452,7 +452,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <256>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x20 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -478,7 +477,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x28 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -504,7 +502,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <8>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x30 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index 3ace91945b72..d4993a2b404f 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -634,7 +634,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; msi-parent = <&its>; @@ -656,7 +655,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; msi-parent = <&its>; @@ -678,7 +676,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <8>; num-viewport = <256>; bus-range = <0x0 0xff>; msi-parent = <&its>; @@ -700,7 +697,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; msi-parent = <&its>; From 9d60d93198c62ac3b3e59e8bba7079646c972a4c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 26 Aug 2019 10:28:58 -0400 Subject: [PATCH 0242/2880] Deprecate nfsd fault injection This is only useful for client testing. I haven't really maintained it, and reference counting and locking are wrong at this point. You can get some of the same functionality now from nfsd/clients/. It was a good idea but I think its time has passed. In the unlikely event of users, hopefully the BROKEN dependency will prompt them to speak up. Otherwise I expect to remove it soon. Reported-by: Alex Lyakas Signed-off-by: J. Bruce Fields --- fs/nfsd/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index bff8456220e0..10cefb0c07c7 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -148,7 +148,7 @@ config NFSD_V4_SECURITY_LABEL config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" - depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS + depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS && BROKEN help This option enables support for manually injecting faults into the NFS server. This is intended to be used for From ee2f412ece32ab685921408ab1242d097557b57c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 26 Aug 2019 13:12:46 -0400 Subject: [PATCH 0243/2880] xprtrdma: Recycle MRs after disconnect The optimization done in "xprtrdma: Simplify rpcrdma_mr_pop" was a bit too optimistic. MRs left over after a reconnect still need to be recycled, not added back to the free list, since they could be in flight or actually fully registered. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 35 +++++++++++++++++++++++++-------- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 368cdf3edfc9..30065a28628c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -88,15 +88,8 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* MRs are dynamically allocated, so simply clean up and release the MR. - * A replacement MR will subsequently be allocated on demand. - */ -static void -frwr_mr_recycle_worker(struct work_struct *work) +static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_mr_recycle(mr); if (mr->mr_dir != DMA_NONE) { @@ -114,6 +107,32 @@ frwr_mr_recycle_worker(struct work_struct *work) frwr_release_mr(mr); } +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. + */ +static void +frwr_mr_recycle_worker(struct work_struct *work) +{ + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, + mr_recycle); + + frwr_mr_recycle(mr->mr_xprt, mr); +} + +/* frwr_recycle - Discard MRs + * @req: request to reset + * + * Used after a reconnect. These MRs could be in flight, we can't + * tell. Safe thing to do is release them. + */ +void frwr_recycle(struct rpcrdma_req *req) +{ + struct rpcrdma_mr *mr; + + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) + frwr_mr_recycle(mr->mr_xprt, mr); +} + /* frwr_reset - Place MRs back on the free list * @req: request to reset * diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 67e1684aee6d..19dd29a5c60d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -867,7 +867,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * chunks. Very likely the connection has been replaced, * so these registrations are invalid and unusable. */ - frwr_reset(req); + frwr_recycle(req); /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index bd1befa83d24..65e6b0eb862e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -542,6 +542,7 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ bool frwr_is_supported(struct ib_device *device); +void frwr_recycle(struct rpcrdma_req *req); void frwr_reset(struct rpcrdma_req *req); int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep); int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); From f9e1afe0fa729337309fa44921da998d2e6e6198 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 26 Aug 2019 13:12:51 -0400 Subject: [PATCH 0244/2880] xprtrdma: Clear xprt->reestablish_timeout on close Ensure that the re-establishment delay does not grow exponentially on each good reconnect. This probably should have been part of commit 675dd90ad093 ("xprtrdma: Modernize ops->connect"). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 8 ++++++-- net/sunrpc/xprtrdma/transport.c | 3 +-- net/sunrpc/xprtrdma/verbs.c | 2 ++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 19dd29a5c60d..b86b5fd62d9f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1261,8 +1261,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpc_rqst *rqst = rep->rr_rqst; int status; - xprt->reestablish_timeout = 0; - switch (rep->rr_proc) { case rdma_msg: status = rpcrdma_decode_msg(r_xprt, rep, rqst); @@ -1321,6 +1319,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; + /* Any data means we had a useful conversation, so + * then we don't need to delay the next reconnect. + */ + if (xprt->reestablish_timeout) + xprt->reestablish_timeout = 0; + /* Fixed transport header fields */ xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base, NULL); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 993b96ff6760..160558b4135e 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -423,8 +423,6 @@ void xprt_rdma_close(struct rpc_xprt *xprt) if (ep->rep_connected == -ENODEV) return; - if (ep->rep_connected > 0) - xprt->reestablish_timeout = 0; rpcrdma_ep_disconnect(ep, ia); /* Prepare @xprt for the next connection by reinitializing @@ -434,6 +432,7 @@ void xprt_rdma_close(struct rpc_xprt *xprt) xprt->cwnd = RPC_CWNDSHIFT; out: + xprt->reestablish_timeout = 0; ++xprt->connect_cookie; xprt_disconnect_done(xprt); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ac2abf4578b9..1dadc9ef504f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -731,6 +731,8 @@ retry: if (rc) goto out; + if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); if (ep->rep_connected <= 0) { if (ep->rep_connected == -EAGAIN) From 98ef77d1aaa7a2f4e1b2a721faa084222021fda7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 26 Aug 2019 13:12:57 -0400 Subject: [PATCH 0245/2880] xprtrdma: Send Queue size grows after a reconnect Eli Dorfman reports that after a series of idle disconnects, an RPC/RDMA transport becomes unusable (rdma_create_qp returns -ENOMEM). Problem was tracked down to increasing Send Queue size after each reconnect. The rdma_create_qp() API does not promise to leave its @qp_init_attr parameter unaltered. In fact, some drivers do modify one or more of its fields. Thus our calls to rdma_create_qp must use a fresh copy of ib_qp_init_attr each time. This fix is appropriate for kernels dating back to late 2007, though it will have to be adapted, as the connect code has changed over the years. Reported-by: Eli Dorfman Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1dadc9ef504f..796945751e66 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -606,10 +606,10 @@ void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) * Unlike a normal reconnection, a fresh PD and a new set * of MRs and buffers is needed. */ -static int -rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, + struct ib_qp_init_attr *qp_init_attr) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc, err; trace_xprtrdma_reinsert(r_xprt); @@ -626,7 +626,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, } rc = -ENETUNREACH; - err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr); if (err) { pr_err("rpcrdma: rdma_create_qp returned %d\n", err); goto out3; @@ -643,16 +643,16 @@ out1: return rc; } -static int -rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, - struct rpcrdma_ia *ia) +static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, + struct ib_qp_init_attr *qp_init_attr) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rdma_cm_id *id, *old; int err, rc; trace_xprtrdma_reconnect(r_xprt); - rpcrdma_ep_disconnect(ep, ia); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia); rc = -EHOSTUNREACH; id = rpcrdma_create_id(r_xprt, ia); @@ -674,7 +674,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, goto out_destroy; } - err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + err = rdma_create_qp(id, ia->ri_pd, qp_init_attr); if (err) goto out_destroy; @@ -699,25 +699,27 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct ib_qp_init_attr qp_init_attr; int rc; retry: + memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr)); switch (ep->rep_connected) { case 0: dprintk("RPC: %s: connecting...\n", __func__); - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr); if (rc) { rc = -ENETUNREACH; goto out_noupdate; } break; case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); + rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr); if (rc) goto out_noupdate; break; default: - rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); + rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr); if (rc) goto out; } From 51904045d4aa07fbccf6ff18d56d2064a7676d35 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Tue, 30 Jul 2019 10:21:22 +0800 Subject: [PATCH 0246/2880] thermal: qoriq: Add clock operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some platforms like i.MX8MQ has clock control for this module, need to add clock operations to make sure the driver is working properly. Signed-off-by: Anson Huang Reviewed-by: Guido Günther Reviewed-by: Dong Aisheng Signed-off-by: Zhang Rui --- drivers/thermal/qoriq_thermal.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c index 7b364933bfb1..28939471ce16 100644 --- a/drivers/thermal/qoriq_thermal.c +++ b/drivers/thermal/qoriq_thermal.c @@ -2,6 +2,7 @@ // // Copyright 2016 Freescale Semiconductor, Inc. +#include #include #include #include @@ -72,6 +73,7 @@ struct qoriq_sensor { struct qoriq_tmu_data { struct qoriq_tmu_regs __iomem *regs; + struct clk *clk; bool little_endian; struct qoriq_sensor *sensor[SITES_MAX]; }; @@ -209,6 +211,16 @@ static int qoriq_tmu_probe(struct platform_device *pdev) goto err_iomap; } + data->clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(data->clk)) + return PTR_ERR(data->clk); + + ret = clk_prepare_enable(data->clk); + if (ret) { + dev_err(&pdev->dev, "Failed to enable clock\n"); + return ret; + } + qoriq_tmu_init_device(data); /* TMU initialization */ ret = qoriq_tmu_calibration(pdev); /* TMU calibration */ @@ -225,6 +237,7 @@ static int qoriq_tmu_probe(struct platform_device *pdev) return 0; err_tmu: + clk_disable_unprepare(data->clk); iounmap(data->regs); err_iomap: @@ -241,6 +254,9 @@ static int qoriq_tmu_remove(struct platform_device *pdev) tmu_write(data, TMR_DISABLE, &data->regs->tmr); iounmap(data->regs); + + clk_disable_unprepare(data->clk); + platform_set_drvdata(pdev, NULL); return 0; @@ -257,14 +273,21 @@ static int qoriq_tmu_suspend(struct device *dev) tmr &= ~TMR_ME; tmu_write(data, tmr, &data->regs->tmr); + clk_disable_unprepare(data->clk); + return 0; } static int qoriq_tmu_resume(struct device *dev) { u32 tmr; + int ret; struct qoriq_tmu_data *data = dev_get_drvdata(dev); + ret = clk_prepare_enable(data->clk); + if (ret) + return ret; + /* Enable monitoring */ tmr = tmu_read(data, &data->regs->tmr); tmr |= TMR_ME; From 11f0cdc8bd621fcded06c951b05076e618c5c717 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Tue, 30 Jul 2019 10:21:23 +0800 Subject: [PATCH 0247/2880] thermal: qoriq: Fix error path of calling qoriq_tmu_register_tmu_zone fail When registering tmu zone failed, the error path should be err_tmu instead of err_iomap, as iounmap() needs to be called. Signed-off-by: Anson Huang Reviewed-by: Dong Aisheng Signed-off-by: Zhang Rui --- drivers/thermal/qoriq_thermal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c index 28939471ce16..5755a1108a1f 100644 --- a/drivers/thermal/qoriq_thermal.c +++ b/drivers/thermal/qoriq_thermal.c @@ -231,7 +231,7 @@ static int qoriq_tmu_probe(struct platform_device *pdev) if (ret < 0) { dev_err(&pdev->dev, "Failed to register sensors\n"); ret = -ENODEV; - goto err_iomap; + goto err_tmu; } return 0; From 4d82000af007acda6b46729ea368f6b2823c532c Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Tue, 30 Jul 2019 10:21:24 +0800 Subject: [PATCH 0248/2880] thermal: qoriq: Use devm_platform_ioremap_resource() instead of of_iomap() Use devm_platform_ioremap_resource() instead of of_iomap() to save the iounmap() call in error handle path; Signed-off-by: Anson Huang Reviewed-by: Dong Aisheng Signed-off-by: Zhang Rui --- drivers/thermal/qoriq_thermal.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c index 5755a1108a1f..8d19601d0ca6 100644 --- a/drivers/thermal/qoriq_thermal.c +++ b/drivers/thermal/qoriq_thermal.c @@ -204,11 +204,10 @@ static int qoriq_tmu_probe(struct platform_device *pdev) data->little_endian = of_property_read_bool(np, "little-endian"); - data->regs = of_iomap(np, 0); - if (!data->regs) { + data->regs = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(data->regs)) { dev_err(&pdev->dev, "Failed to get memory region\n"); - ret = -ENODEV; - goto err_iomap; + return PTR_ERR(data->regs); } data->clk = devm_clk_get_optional(&pdev->dev, NULL); @@ -225,22 +224,19 @@ static int qoriq_tmu_probe(struct platform_device *pdev) ret = qoriq_tmu_calibration(pdev); /* TMU calibration */ if (ret < 0) - goto err_tmu; + goto err; ret = qoriq_tmu_register_tmu_zone(pdev); if (ret < 0) { dev_err(&pdev->dev, "Failed to register sensors\n"); ret = -ENODEV; - goto err_tmu; + goto err; } return 0; -err_tmu: +err: clk_disable_unprepare(data->clk); - iounmap(data->regs); - -err_iomap: platform_set_drvdata(pdev, NULL); return ret; @@ -253,8 +249,6 @@ static int qoriq_tmu_remove(struct platform_device *pdev) /* Disable monitoring */ tmu_write(data, TMR_DISABLE, &data->regs->tmr); - iounmap(data->regs); - clk_disable_unprepare(data->clk); platform_set_drvdata(pdev, NULL); From aea591970f659bd6d792ca4c46dd0d251331a397 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Tue, 30 Jul 2019 10:21:25 +0800 Subject: [PATCH 0249/2880] thermal: qoriq: Use __maybe_unused instead of #if CONFIG_PM_SLEEP Use __maybe_unused for power management related functions instead of #if CONFIG_PM_SLEEP to simply the code. Signed-off-by: Anson Huang Reviewed-by: Dong Aisheng Signed-off-by: Zhang Rui --- drivers/thermal/qoriq_thermal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c index 8d19601d0ca6..39542c670301 100644 --- a/drivers/thermal/qoriq_thermal.c +++ b/drivers/thermal/qoriq_thermal.c @@ -256,8 +256,7 @@ static int qoriq_tmu_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP -static int qoriq_tmu_suspend(struct device *dev) +static int __maybe_unused qoriq_tmu_suspend(struct device *dev) { u32 tmr; struct qoriq_tmu_data *data = dev_get_drvdata(dev); @@ -272,7 +271,7 @@ static int qoriq_tmu_suspend(struct device *dev) return 0; } -static int qoriq_tmu_resume(struct device *dev) +static int __maybe_unused qoriq_tmu_resume(struct device *dev) { u32 tmr; int ret; @@ -289,7 +288,6 @@ static int qoriq_tmu_resume(struct device *dev) return 0; } -#endif static SIMPLE_DEV_PM_OPS(qoriq_tmu_pm_ops, qoriq_tmu_suspend, qoriq_tmu_resume); From 11f787b0840e26648b5e01fad99bd2aa35098d5d Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Tue, 30 Jul 2019 10:21:26 +0800 Subject: [PATCH 0250/2880] dt-bindings: thermal: qoriq: Add optional clocks property Some platforms like i.MX8M series SoCs have clock control for TMU, add optional clocks property to the binding doc. Signed-off-by: Anson Huang Reviewed-by: Rob Herring Reviewed-by: Dong Aisheng Signed-off-by: Zhang Rui --- Documentation/devicetree/bindings/thermal/qoriq-thermal.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt index 04cbb90a5d3e..28f2cbaf1702 100644 --- a/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt @@ -23,6 +23,7 @@ Required properties: Optional property: - little-endian : If present, the TMU registers are little endian. If absent, the default is big endian. +- clocks : the clock for clocking the TMU silicon. Example: From 9aee3713135a6733f6e58ec84c1ce24514579039 Mon Sep 17 00:00:00 2001 From: Nathan Huckleberry Date: Thu, 13 Jun 2019 11:49:23 -0700 Subject: [PATCH 0251/2880] thermal: armada: Fix -Wshift-negative-value Clang produces the following warning drivers/thermal/armada_thermal.c:270:33: warning: shifting a negative signed value is undefined [-Wshift-negative-value] 1 warning reg &= ~CONTROL1_TSEN_AVG_MASK << CONTROL1_TSEN_AVG_SHIFT; generated . ~~~~~~~~~~~~~~~~~~~~~~~ ^ CONTROL1_TSEN_AVG_SHIFT is defined to be zero. Since shifting by zero does nothing this variable can be removed. Cc: clang-built-linux@googlegroups.com Link: https://github.com/ClangBuiltLinux/linux/issues/532 Signed-off-by: Nathan Huckleberry Reviewed-by: Daniel Lezcano Acked-by: Miquel Raynal Signed-off-by: Zhang Rui --- drivers/thermal/armada_thermal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/armada_thermal.c b/drivers/thermal/armada_thermal.c index 8c07a393dc2e..709a22f455e9 100644 --- a/drivers/thermal/armada_thermal.c +++ b/drivers/thermal/armada_thermal.c @@ -53,7 +53,6 @@ #define CONTROL0_TSEN_MODE_EXTERNAL 0x2 #define CONTROL0_TSEN_MODE_MASK 0x3 -#define CONTROL1_TSEN_AVG_SHIFT 0 #define CONTROL1_TSEN_AVG_MASK 0x7 #define CONTROL1_EXT_TSEN_SW_RESET BIT(7) #define CONTROL1_EXT_TSEN_HW_RESETn BIT(8) @@ -267,8 +266,8 @@ static void armada_cp110_init(struct platform_device *pdev, /* Average the output value over 2^1 = 2 samples */ regmap_read(priv->syscon, data->syscon_control1_off, ®); - reg &= ~CONTROL1_TSEN_AVG_MASK << CONTROL1_TSEN_AVG_SHIFT; - reg |= 1 << CONTROL1_TSEN_AVG_SHIFT; + reg &= ~CONTROL1_TSEN_AVG_MASK; + reg |= 1; regmap_write(priv->syscon, data->syscon_control1_off, reg); } From b9cd1663fb49c9432c346be742f94ef25b406c89 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Mon, 8 Jul 2019 20:34:09 +0800 Subject: [PATCH 0252/2880] thermal: rcar_gen3_thermal: Replace devm_add_action() followed by failure action with devm_add_action_or_reset() devm_add_action_or_reset() is introduced as a helper function which internally calls devm_add_action(). If devm_add_action() fails then it will execute the action mentioned and return the error code. This reduce source code size (avoid writing the action twice) and reduce the likelyhood of bugs. Signed-off-by: Fuqian Huang Signed-off-by: Zhang Rui --- drivers/thermal/rcar_gen3_thermal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/thermal/rcar_gen3_thermal.c b/drivers/thermal/rcar_gen3_thermal.c index a56463308694..755d2b5bd2c2 100644 --- a/drivers/thermal/rcar_gen3_thermal.c +++ b/drivers/thermal/rcar_gen3_thermal.c @@ -443,9 +443,8 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev) if (ret) goto error_unregister; - ret = devm_add_action(dev, rcar_gen3_hwmon_action, zone); + ret = devm_add_action_or_reset(dev, rcar_gen3_hwmon_action, zone); if (ret) { - rcar_gen3_hwmon_action(zone); goto error_unregister; } From 9d6b4b871dcfd6796b46a05e002884e051687e47 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 21 Jul 2019 18:33:58 +0200 Subject: [PATCH 0253/2880] thermal: tegra: Fix a typo s/sochterm/soctherm/ Signed-off-by: Christophe JAILLET Signed-off-by: Zhang Rui --- drivers/thermal/tegra/soctherm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c index 43941eb734eb..5acaad3a594f 100644 --- a/drivers/thermal/tegra/soctherm.c +++ b/drivers/thermal/tegra/soctherm.c @@ -202,7 +202,7 @@ /* get dividend from the depth */ #define THROT_DEPTH_DIVIDEND(depth) ((256 * (100 - (depth)) / 100) - 1) -/* gk20a nv_therm interface N:3 Mapping. Levels defined in tegra124-sochterm.h +/* gk20a nv_therm interface N:3 Mapping. Levels defined in tegra124-soctherm.h * level vector * NONE 3'b000 * LOW 3'b001 From 6b8249abb093551ef173d13a25ed0044d5dd33e0 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Fri, 23 Aug 2019 10:38:35 +0100 Subject: [PATCH 0254/2880] drivers: thermal: qcom: tsens: Fix memory leak from qfprom read memory returned as part of nvmem_read via qfprom_read should be freed by the consumer once done. Existing code is not doing it so fix it. Below memory leak detected by kmemleak [] kmemleak_alloc+0x50/0x84 [] __kmalloc+0xe8/0x168 [] nvmem_cell_read+0x30/0x80 [] qfprom_read+0x4c/0x7c [] calibrate_v1+0x34/0x204 [] tsens_probe+0x164/0x258 [] platform_drv_probe+0x80/0xa0 [] really_probe+0x208/0x248 [] driver_probe_device+0x98/0xc0 [] __device_attach_driver+0x9c/0xac [] bus_for_each_drv+0x60/0x8c [] __device_attach+0x8c/0x100 [] device_initial_probe+0x20/0x28 [] bus_probe_device+0x34/0x7c [] deferred_probe_work_func+0x6c/0x98 [] process_one_work+0x160/0x2f8 Signed-off-by: Srinivas Kandagatla Acked-by: Amit Kucheria Signed-off-by: Zhang Rui --- drivers/thermal/qcom/tsens-8960.c | 2 ++ drivers/thermal/qcom/tsens-v0_1.c | 12 ++++++++++-- drivers/thermal/qcom/tsens-v1.c | 1 + drivers/thermal/qcom/tsens.h | 1 + 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c index 8d9b721dadb6..e46a4e3f25c4 100644 --- a/drivers/thermal/qcom/tsens-8960.c +++ b/drivers/thermal/qcom/tsens-8960.c @@ -229,6 +229,8 @@ static int calibrate_8960(struct tsens_priv *priv) for (i = 0; i < num_read; i++, s++) s->offset = data[i]; + kfree(data); + return 0; } diff --git a/drivers/thermal/qcom/tsens-v0_1.c b/drivers/thermal/qcom/tsens-v0_1.c index 6f26fadf4c27..055647bcee67 100644 --- a/drivers/thermal/qcom/tsens-v0_1.c +++ b/drivers/thermal/qcom/tsens-v0_1.c @@ -145,8 +145,10 @@ static int calibrate_8916(struct tsens_priv *priv) return PTR_ERR(qfprom_cdata); qfprom_csel = (u32 *)qfprom_read(priv->dev, "calib_sel"); - if (IS_ERR(qfprom_csel)) + if (IS_ERR(qfprom_csel)) { + kfree(qfprom_cdata); return PTR_ERR(qfprom_csel); + } mode = (qfprom_csel[0] & MSM8916_CAL_SEL_MASK) >> MSM8916_CAL_SEL_SHIFT; dev_dbg(priv->dev, "calibration mode is %d\n", mode); @@ -181,6 +183,8 @@ static int calibrate_8916(struct tsens_priv *priv) } compute_intercept_slope(priv, p1, p2, mode); + kfree(qfprom_cdata); + kfree(qfprom_csel); return 0; } @@ -198,8 +202,10 @@ static int calibrate_8974(struct tsens_priv *priv) return PTR_ERR(calib); bkp = (u32 *)qfprom_read(priv->dev, "calib_backup"); - if (IS_ERR(bkp)) + if (IS_ERR(bkp)) { + kfree(calib); return PTR_ERR(bkp); + } calib_redun_sel = bkp[1] & BKP_REDUN_SEL; calib_redun_sel >>= BKP_REDUN_SHIFT; @@ -313,6 +319,8 @@ static int calibrate_8974(struct tsens_priv *priv) } compute_intercept_slope(priv, p1, p2, mode); + kfree(calib); + kfree(bkp); return 0; } diff --git a/drivers/thermal/qcom/tsens-v1.c b/drivers/thermal/qcom/tsens-v1.c index 10b595d4f619..870f502f2cb6 100644 --- a/drivers/thermal/qcom/tsens-v1.c +++ b/drivers/thermal/qcom/tsens-v1.c @@ -138,6 +138,7 @@ static int calibrate_v1(struct tsens_priv *priv) } compute_intercept_slope(priv, p1, p2, mode); + kfree(qfprom_cdata); return 0; } diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index 2fd94997245b..b89083b61c38 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -17,6 +17,7 @@ #include #include +#include struct tsens_priv; From 7ce2e76a0420801fb4b53b9e6850940e6b326433 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 27 Aug 2019 11:56:20 +0200 Subject: [PATCH 0255/2880] PCI: Move ASPM declarations to linux/pci.h Move ASPM definitions and function prototypes from include/linux/pci-aspm.h to include/linux/pci.h so users only need to include : PCIE_LINK_STATE_L0S PCIE_LINK_STATE_L1 PCIE_LINK_STATE_CLKPM pci_disable_link_state() pci_disable_link_state_locked() pcie_no_aspm() No functional changes intended. Link: https://lore.kernel.org/r/20190827095620.11213-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/acpi/pci_root.c | 1 - drivers/char/xillybus/xillybus_pcie.c | 1 - drivers/net/ethernet/intel/e1000e/e1000.h | 1 - drivers/net/ethernet/jme.c | 1 - drivers/net/ethernet/realtek/r8169_main.c | 1 - drivers/net/wireless/ath/ath5k/pci.c | 1 - .../net/wireless/intel/iwlegacy/3945-mac.c | 1 - .../net/wireless/intel/iwlegacy/4965-mac.c | 1 - .../net/wireless/intel/iwlwifi/pcie/trans.c | 1 - drivers/pci/pci-acpi.c | 1 - drivers/pci/pcie/aspm.c | 1 - drivers/pci/quirks.c | 1 - drivers/scsi/aacraid/linit.c | 1 - drivers/scsi/hpsa.c | 1 - drivers/scsi/mpt3sas/mpt3sas_scsih.c | 1 - include/linux/pci-aspm.h | 36 ------------------- include/linux/pci.h | 18 ++++++++++ 17 files changed, 18 insertions(+), 51 deletions(-) delete mode 100644 include/linux/pci-aspm.h diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 314a187ed572..d1e666ef3fcc 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/xillybus/xillybus_pcie.c b/drivers/char/xillybus/xillybus_pcie.c index 02c15952b103..18b0c392bc93 100644 --- a/drivers/char/xillybus/xillybus_pcie.c +++ b/drivers/char/xillybus/xillybus_pcie.c @@ -9,7 +9,6 @@ #include #include -#include #include #include "xillybus.h" diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 34cd67951aec..6c51b1bad8c4 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 0b668357db4d..57e8aea98969 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0637c6752a78..a6d8e7f5fde7 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/wireless/ath/ath5k/pci.c b/drivers/net/wireless/ath/ath5k/pci.c index c6156cc38940..d5ee32ce9eb3 100644 --- a/drivers/net/wireless/ath/ath5k/pci.c +++ b/drivers/net/wireless/ath/ath5k/pci.c @@ -18,7 +18,6 @@ #include #include -#include #include #include #include "../ath.h" diff --git a/drivers/net/wireless/intel/iwlegacy/3945-mac.c b/drivers/net/wireless/intel/iwlegacy/3945-mac.c index b82da75a9ae3..4fbcc7fba3cc 100644 --- a/drivers/net/wireless/intel/iwlegacy/3945-mac.c +++ b/drivers/net/wireless/intel/iwlegacy/3945-mac.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c index fa2c02881939..ffb705b18fb1 100644 --- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c +++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index f5df5b370d78..4c308e33ee21 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -62,7 +62,6 @@ * *****************************************************************************/ #include -#include #include #include #include diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 45049f558860..dd520fe927db 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index e44af7f4d37f..ad6259ec51a6 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "../pci.h" #ifdef MODULE_PARAM_PREFIX diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 208aacf39329..9ac1a7564c9e 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 644f7f5c61a2..4a858789e6c5 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 43a6b5350775..148663373f7d 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 717ba0845a2a..27fdbc165446 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h deleted file mode 100644 index 67064145d76e..000000000000 --- a/include/linux/pci-aspm.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * aspm.h - * - * PCI Express ASPM defines and function prototypes - * - * Copyright (C) 2007 Intel Corp. - * Zhang Yanmin (yanmin.zhang@intel.com) - * Shaohua Li (shaohua.li@intel.com) - * - * For more information, please consult the following manuals (look at - * http://www.pcisig.com/ for how to get them): - * - * PCI Express Specification - */ - -#ifndef LINUX_ASPM_H -#define LINUX_ASPM_H - -#include - -#define PCIE_LINK_STATE_L0S 1 -#define PCIE_LINK_STATE_L1 2 -#define PCIE_LINK_STATE_CLKPM 4 - -#ifdef CONFIG_PCIEASPM -int pci_disable_link_state(struct pci_dev *pdev, int state); -int pci_disable_link_state_locked(struct pci_dev *pdev, int state); -void pcie_no_aspm(void); -#else -static inline int pci_disable_link_state(struct pci_dev *pdev, int state) -{ return 0; } -static inline void pcie_no_aspm(void) { } -#endif - -#endif /* LINUX_ASPM_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..b4e479921e07 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -6,12 +6,18 @@ * Copyright 1994, Drew Eckhardt * Copyright 1997--1999 Martin Mares * + * PCI Express ASPM defines and function prototypes + * Copyright (c) 2007 Intel Corp. + * Zhang Yanmin (yanmin.zhang@intel.com) + * Shaohua Li (shaohua.li@intel.com) + * * For more information, please consult the following manuals (look at * http://www.pcisig.com/ for how to get them): * * PCI BIOS Specification * PCI Local Bus Specification * PCI to PCI Bridge Specification + * PCI Express Specification * PCI System Design Guide */ #ifndef LINUX_PCI_H @@ -1565,9 +1571,21 @@ extern bool pcie_ports_native; #define pcie_ports_native false #endif +#define PCIE_LINK_STATE_L0S 1 +#define PCIE_LINK_STATE_L1 2 +#define PCIE_LINK_STATE_CLKPM 4 + #ifdef CONFIG_PCIEASPM +int pci_disable_link_state(struct pci_dev *pdev, int state); +int pci_disable_link_state_locked(struct pci_dev *pdev, int state); +void pcie_no_aspm(void); bool pcie_aspm_support_enabled(void); #else +static inline int pci_disable_link_state(struct pci_dev *pdev, int state) +{ return 0; } +static inline int pci_disable_link_state_locked(struct pci_dev *pdev, int state) +{ return 0; } +static inline void pcie_no_aspm(void) { } static inline bool pcie_aspm_support_enabled(void) { return false; } #endif From 556d971bdae643de4cd7e2976e14f70ca2a3061d Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Wed, 7 Aug 2019 21:43:18 -0300 Subject: [PATCH 0256/2880] ima: Fix use after free in ima_read_modsig() If we can't parse the PKCS7 in the appended modsig, we will free the modsig structure and then access one of its members to determine the error value. Fixes: 39b07096364a ("ima: Implement support for module-style appended signatures") Reported-by: kbuild test robot Reported-by: Julia Lawall Reported-by: Dan Carpenter Signed-off-by: Thiago Jung Bauermann Reviewed-by: Gustavo A. R. Silva Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_modsig.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c index c412e31d1714..d106885cc495 100644 --- a/security/integrity/ima/ima_modsig.c +++ b/security/integrity/ima/ima_modsig.c @@ -91,8 +91,9 @@ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len); if (IS_ERR(hdr->pkcs7_msg)) { + rc = PTR_ERR(hdr->pkcs7_msg); kfree(hdr); - return PTR_ERR(hdr->pkcs7_msg); + return rc; } memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len); From e2797ad31fb40f4ff59ebc4314d6f000d713bad9 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 27 Aug 2019 11:49:49 +0200 Subject: [PATCH 0257/2880] PCI/ACPI: Rename _HPX structs from hpp_* to hpx_* The names of the hpp_type0, hpp_type1 and hpp_type2 structs suggest that they're related to _HPP, when in fact they're related to _HPX. The struct hpp_type0 denotes an _HPX Type 0 setting record that supersedes the _HPP setting record, and it has been used interchangeably for _HPP as per the ACPI specification (see version 6.3, section 6.2.9.1) which states that it should be applied to PCI, PCI-X and PCI Express devices, with settings being ignored if they are not applicable. Rename them to hpx_type0, hpx_type1 and hpx_type2 to reflect their relation to _HPX rather than _HPP. Link: https://lore.kernel.org/r/20190827094951.10613-2-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-acpi.c | 28 +++++++-------- drivers/pci/probe.c | 72 ++++++++++++++++++------------------- include/linux/pci_hotplug.h | 30 ++++++++-------- 3 files changed, 64 insertions(+), 66 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 45049f558860..02addc47edae 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -119,7 +119,7 @@ phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle) } static acpi_status decode_type0_hpx_record(union acpi_object *record, - struct hpp_type0 *hpx0) + struct hpx_type0 *hpx0) { int i; union acpi_object *fields = record->package.elements; @@ -147,7 +147,7 @@ static acpi_status decode_type0_hpx_record(union acpi_object *record, } static acpi_status decode_type1_hpx_record(union acpi_object *record, - struct hpp_type1 *hpx1) + struct hpx_type1 *hpx1) { int i; union acpi_object *fields = record->package.elements; @@ -174,7 +174,7 @@ static acpi_status decode_type1_hpx_record(union acpi_object *record, } static acpi_status decode_type2_hpx_record(union acpi_object *record, - struct hpp_type2 *hpx2) + struct hpx_type2 *hpx2) { int i; union acpi_object *fields = record->package.elements; @@ -277,9 +277,9 @@ static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, acpi_status status; struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; union acpi_object *package, *record, *fields; - struct hpp_type0 hpx0; - struct hpp_type1 hpx1; - struct hpp_type2 hpx2; + struct hpx_type0 hpx0; + struct hpx_type1 hpx1; + struct hpx_type2 hpx2; u32 type; int i; @@ -353,10 +353,10 @@ static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, acpi_status status; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *package, *fields; - struct hpp_type0 hpp0; + struct hpx_type0 hpx0; int i; - memset(&hpp0, 0, sizeof(hpp0)); + memset(&hpx0, 0, sizeof(hpx0)); status = acpi_evaluate_object(handle, "_HPP", NULL, &buffer); if (ACPI_FAILURE(status)) @@ -377,13 +377,13 @@ static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, } } - hpp0.revision = 1; - hpp0.cache_line_size = fields[0].integer.value; - hpp0.latency_timer = fields[1].integer.value; - hpp0.enable_serr = fields[2].integer.value; - hpp0.enable_perr = fields[3].integer.value; + hpx0.revision = 1; + hpx0.cache_line_size = fields[0].integer.value; + hpx0.latency_timer = fields[1].integer.value; + hpx0.enable_serr = fields[2].integer.value; + hpx0.enable_perr = fields[3].integer.value; - hp_ops->program_type0(dev, &hpp0); + hp_ops->program_type0(dev, &hpx0); exit: kfree(buffer.pointer); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index a3c7338fad86..120c70b5003b 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1920,7 +1920,7 @@ static void pci_configure_mps(struct pci_dev *dev) p_mps, mps, mpss); } -static struct hpp_type0 pci_default_type0 = { +static struct hpx_type0 pci_default_type0 = { .revision = 1, .cache_line_size = 8, .latency_timer = 0x40, @@ -1928,44 +1928,44 @@ static struct hpp_type0 pci_default_type0 = { .enable_perr = 0, }; -static void program_hpp_type0(struct pci_dev *dev, struct hpp_type0 *hpp) +static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) { u16 pci_cmd, pci_bctl; - if (!hpp) - hpp = &pci_default_type0; + if (!hpx) + hpx = &pci_default_type0; - if (hpp->revision > 1) { + if (hpx->revision > 1) { pci_warn(dev, "PCI settings rev %d not supported; using defaults\n", - hpp->revision); - hpp = &pci_default_type0; + hpx->revision); + hpx = &pci_default_type0; } - pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpp->cache_line_size); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpp->latency_timer); + pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer); pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); - if (hpp->enable_serr) + if (hpx->enable_serr) pci_cmd |= PCI_COMMAND_SERR; - if (hpp->enable_perr) + if (hpx->enable_perr) pci_cmd |= PCI_COMMAND_PARITY; pci_write_config_word(dev, PCI_COMMAND, pci_cmd); /* Program bridge control value */ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, - hpp->latency_timer); + hpx->latency_timer); pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl); - if (hpp->enable_perr) + if (hpx->enable_perr) pci_bctl |= PCI_BRIDGE_CTL_PARITY; pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl); } } -static void program_hpp_type1(struct pci_dev *dev, struct hpp_type1 *hpp) +static void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) { int pos; - if (!hpp) + if (!hpx) return; pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); @@ -1990,20 +1990,20 @@ static bool pcie_root_rcb_set(struct pci_dev *dev) return false; } -static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) +static void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) { int pos; u32 reg32; - if (!hpp) + if (!hpx) return; if (!pci_is_pcie(dev)) return; - if (hpp->revision > 1) { + if (hpx->revision > 1) { pci_warn(dev, "PCIe settings rev %d not supported\n", - hpp->revision); + hpx->revision); return; } @@ -2012,14 +2012,14 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) * those to make sure they're consistent with the rest of the * platform. */ - hpp->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | + hpx->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ; - hpp->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | + hpx->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ); /* Initialize Device Control Register */ pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, - ~hpp->pci_exp_devctl_and, hpp->pci_exp_devctl_or); + ~hpx->pci_exp_devctl_and, hpx->pci_exp_devctl_or); /* Initialize Link Control Register */ if (pcie_cap_has_lnkctl(dev)) { @@ -2028,13 +2028,13 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) * If the Root Port supports Read Completion Boundary of * 128, set RCB to 128. Otherwise, clear it. */ - hpp->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB; - hpp->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB; + hpx->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB; + hpx->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB; if (pcie_root_rcb_set(dev)) - hpp->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB; + hpx->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB; pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL, - ~hpp->pci_exp_lnkctl_and, hpp->pci_exp_lnkctl_or); + ~hpx->pci_exp_lnkctl_and, hpx->pci_exp_lnkctl_or); } /* Find Advanced Error Reporting Enhanced Capability */ @@ -2044,22 +2044,22 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) /* Initialize Uncorrectable Error Mask Register */ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, ®32); - reg32 = (reg32 & hpp->unc_err_mask_and) | hpp->unc_err_mask_or; + reg32 = (reg32 & hpx->unc_err_mask_and) | hpx->unc_err_mask_or; pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32); /* Initialize Uncorrectable Error Severity Register */ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, ®32); - reg32 = (reg32 & hpp->unc_err_sever_and) | hpp->unc_err_sever_or; + reg32 = (reg32 & hpx->unc_err_sever_and) | hpx->unc_err_sever_or; pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32); /* Initialize Correctable Error Mask Register */ pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, ®32); - reg32 = (reg32 & hpp->cor_err_mask_and) | hpp->cor_err_mask_or; + reg32 = (reg32 & hpx->cor_err_mask_and) | hpx->cor_err_mask_or; pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32); /* Initialize Advanced Error Capabilities and Control Register */ pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); - reg32 = (reg32 & hpp->adv_err_cap_and) | hpp->adv_err_cap_or; + reg32 = (reg32 & hpx->adv_err_cap_and) | hpx->adv_err_cap_or; /* Don't enable ECRC generation or checking if unsupported */ if (!(reg32 & PCI_ERR_CAP_ECRC_GENC)) @@ -2178,15 +2178,15 @@ static void program_hpx_type3_register(struct pci_dev *dev, pos, orig_value, write_reg); } -static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx3) +static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) { - if (!hpx3) + if (!hpx) return; if (!pci_is_pcie(dev)) return; - program_hpx_type3_register(dev, hpx3); + program_hpx_type3_register(dev, hpx); } int pci_configure_extended_tags(struct pci_dev *dev, void *ign) @@ -2370,9 +2370,9 @@ static void pci_configure_serr(struct pci_dev *dev) static void pci_configure_device(struct pci_dev *dev) { static const struct hotplug_program_ops hp_ops = { - .program_type0 = program_hpp_type0, - .program_type1 = program_hpp_type1, - .program_type2 = program_hpp_type2, + .program_type0 = program_hpx_type0, + .program_type1 = program_hpx_type1, + .program_type2 = program_hpx_type2, .program_type3 = program_hpx_type3, }; diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index f694eb2ca978..247a49ee27b3 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -86,25 +86,25 @@ void pci_hp_deregister(struct hotplug_slot *slot); #define pci_hp_initialize(slot, bus, nr, name) \ __pci_hp_initialize(slot, bus, nr, name, THIS_MODULE, KBUILD_MODNAME) -/* PCI Setting Record (Type 0) */ -struct hpp_type0 { - u32 revision; - u8 cache_line_size; - u8 latency_timer; +/* _HPX PCI Setting Record (Type 0); same as _HPP */ +struct hpx_type0 { + u32 revision; /* Not present in _HPP */ + u8 cache_line_size; /* Not applicable to PCIe */ + u8 latency_timer; /* Not applicable to PCIe */ u8 enable_serr; u8 enable_perr; }; -/* PCI-X Setting Record (Type 1) */ -struct hpp_type1 { +/* _HPX PCI-X Setting Record (Type 1) */ +struct hpx_type1 { u32 revision; u8 max_mem_read; u8 avg_max_split; u16 tot_max_split; }; -/* PCI Express Setting Record (Type 2) */ -struct hpp_type2 { +/* _HPX PCI Express Setting Record (Type 2) */ +struct hpx_type2 { u32 revision; u32 unc_err_mask_and; u32 unc_err_mask_or; @@ -124,9 +124,7 @@ struct hpp_type2 { u32 sec_unc_err_mask_or; }; -/* - * _HPX PCI Express Setting Record (Type 3) - */ +/* _HPX PCI Express Setting Record (Type 3) */ struct hpx_type3 { u16 device_type; u16 function_type; @@ -145,10 +143,10 @@ struct hpx_type3 { }; struct hotplug_program_ops { - void (*program_type0)(struct pci_dev *dev, struct hpp_type0 *hpp); - void (*program_type1)(struct pci_dev *dev, struct hpp_type1 *hpp); - void (*program_type2)(struct pci_dev *dev, struct hpp_type2 *hpp); - void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpp); + void (*program_type0)(struct pci_dev *dev, struct hpx_type0 *hpx); + void (*program_type1)(struct pci_dev *dev, struct hpx_type1 *hpx); + void (*program_type2)(struct pci_dev *dev, struct hpx_type2 *hpx); + void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpx); }; enum hpx_type3_dev_type { From 8c3aac6e1b6146ce771b1cabd78e593136d3e5f2 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 27 Aug 2019 11:49:50 +0200 Subject: [PATCH 0258/2880] PCI/ACPI: Move _HPP & _HPX functions to pci-acpi.c Move program_hpx_type0(), program_hpx_type1(), etc., and enums hpx_type3_dev_type, hpx_type3_fn_type and hpx_type3_cfg_loc to drivers/pci/pci-acpi.c as these functions and enums are ACPI-specific. Move structs hpx_type0, hpx_type1, hpx_type2 and hpx_type3 to drivers/pci/pci.h as these are shared between drivers/pci/pci-acpi.c and drivers/pci/probe.c. Link: https://lore.kernel.org/r/20190827094951.10613-3-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-acpi.c | 296 ++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 79 ++++++++++ drivers/pci/probe.c | 269 -------------------------------- include/linux/pci_hotplug.h | 98 ------------ 4 files changed, 375 insertions(+), 367 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 02addc47edae..0bfabb3f9931 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -118,6 +118,47 @@ phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle) return (phys_addr_t)mcfg_addr; } +static struct hpx_type0 pci_default_type0 = { + .revision = 1, + .cache_line_size = 8, + .latency_timer = 0x40, + .enable_serr = 0, + .enable_perr = 0, +}; + +void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) +{ + u16 pci_cmd, pci_bctl; + + if (!hpx) + hpx = &pci_default_type0; + + if (hpx->revision > 1) { + pci_warn(dev, "PCI settings rev %d not supported; using defaults\n", + hpx->revision); + hpx = &pci_default_type0; + } + + pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer); + pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); + if (hpx->enable_serr) + pci_cmd |= PCI_COMMAND_SERR; + if (hpx->enable_perr) + pci_cmd |= PCI_COMMAND_PARITY; + pci_write_config_word(dev, PCI_COMMAND, pci_cmd); + + /* Program bridge control value */ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { + pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, + hpx->latency_timer); + pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl); + if (hpx->enable_perr) + pci_bctl |= PCI_BRIDGE_CTL_PARITY; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl); + } +} + static acpi_status decode_type0_hpx_record(union acpi_object *record, struct hpx_type0 *hpx0) { @@ -146,6 +187,20 @@ static acpi_status decode_type0_hpx_record(union acpi_object *record, return AE_OK; } +void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) +{ + int pos; + + if (!hpx) + return; + + pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (!pos) + return; + + pci_warn(dev, "PCI-X settings not supported\n"); +} + static acpi_status decode_type1_hpx_record(union acpi_object *record, struct hpx_type1 *hpx1) { @@ -173,6 +228,107 @@ static acpi_status decode_type1_hpx_record(union acpi_object *record, return AE_OK; } +static bool pcie_root_rcb_set(struct pci_dev *dev) +{ + struct pci_dev *rp = pcie_find_root_port(dev); + u16 lnkctl; + + if (!rp) + return false; + + pcie_capability_read_word(rp, PCI_EXP_LNKCTL, &lnkctl); + if (lnkctl & PCI_EXP_LNKCTL_RCB) + return true; + + return false; +} + +void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) +{ + int pos; + u32 reg32; + + if (!hpx) + return; + + if (!pci_is_pcie(dev)) + return; + + if (hpx->revision > 1) { + pci_warn(dev, "PCIe settings rev %d not supported\n", + hpx->revision); + return; + } + + /* + * Don't allow _HPX to change MPS or MRRS settings. We manage + * those to make sure they're consistent with the rest of the + * platform. + */ + hpx->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | + PCI_EXP_DEVCTL_READRQ; + hpx->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | + PCI_EXP_DEVCTL_READRQ); + + /* Initialize Device Control Register */ + pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, + ~hpx->pci_exp_devctl_and, hpx->pci_exp_devctl_or); + + /* Initialize Link Control Register */ + if (pcie_cap_has_lnkctl(dev)) { + + /* + * If the Root Port supports Read Completion Boundary of + * 128, set RCB to 128. Otherwise, clear it. + */ + hpx->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB; + hpx->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB; + if (pcie_root_rcb_set(dev)) + hpx->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB; + + pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL, + ~hpx->pci_exp_lnkctl_and, hpx->pci_exp_lnkctl_or); + } + + /* Find Advanced Error Reporting Enhanced Capability */ + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + if (!pos) + return; + + /* Initialize Uncorrectable Error Mask Register */ + pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, ®32); + reg32 = (reg32 & hpx->unc_err_mask_and) | hpx->unc_err_mask_or; + pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32); + + /* Initialize Uncorrectable Error Severity Register */ + pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, ®32); + reg32 = (reg32 & hpx->unc_err_sever_and) | hpx->unc_err_sever_or; + pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32); + + /* Initialize Correctable Error Mask Register */ + pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, ®32); + reg32 = (reg32 & hpx->cor_err_mask_and) | hpx->cor_err_mask_or; + pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32); + + /* Initialize Advanced Error Capabilities and Control Register */ + pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); + reg32 = (reg32 & hpx->adv_err_cap_and) | hpx->adv_err_cap_or; + + /* Don't enable ECRC generation or checking if unsupported */ + if (!(reg32 & PCI_ERR_CAP_ECRC_GENC)) + reg32 &= ~PCI_ERR_CAP_ECRC_GENE; + if (!(reg32 & PCI_ERR_CAP_ECRC_CHKC)) + reg32 &= ~PCI_ERR_CAP_ECRC_CHKE; + pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + + /* + * FIXME: The following two registers are not supported yet. + * + * o Secondary Uncorrectable Error Severity Register + * o Secondary Uncorrectable Error Mask Register + */ +} + static acpi_status decode_type2_hpx_record(union acpi_object *record, struct hpx_type2 *hpx2) { @@ -213,6 +369,146 @@ static acpi_status decode_type2_hpx_record(union acpi_object *record, return AE_OK; } +enum hpx_type3_dev_type { + HPX_TYPE_ENDPOINT = BIT(0), + HPX_TYPE_LEG_END = BIT(1), + HPX_TYPE_RC_END = BIT(2), + HPX_TYPE_RC_EC = BIT(3), + HPX_TYPE_ROOT_PORT = BIT(4), + HPX_TYPE_UPSTREAM = BIT(5), + HPX_TYPE_DOWNSTREAM = BIT(6), + HPX_TYPE_PCI_BRIDGE = BIT(7), + HPX_TYPE_PCIE_BRIDGE = BIT(8), +}; + +static u16 hpx3_device_type(struct pci_dev *dev) +{ + u16 pcie_type = pci_pcie_type(dev); + const int pcie_to_hpx3_type[] = { + [PCI_EXP_TYPE_ENDPOINT] = HPX_TYPE_ENDPOINT, + [PCI_EXP_TYPE_LEG_END] = HPX_TYPE_LEG_END, + [PCI_EXP_TYPE_RC_END] = HPX_TYPE_RC_END, + [PCI_EXP_TYPE_RC_EC] = HPX_TYPE_RC_EC, + [PCI_EXP_TYPE_ROOT_PORT] = HPX_TYPE_ROOT_PORT, + [PCI_EXP_TYPE_UPSTREAM] = HPX_TYPE_UPSTREAM, + [PCI_EXP_TYPE_DOWNSTREAM] = HPX_TYPE_DOWNSTREAM, + [PCI_EXP_TYPE_PCI_BRIDGE] = HPX_TYPE_PCI_BRIDGE, + [PCI_EXP_TYPE_PCIE_BRIDGE] = HPX_TYPE_PCIE_BRIDGE, + }; + + if (pcie_type >= ARRAY_SIZE(pcie_to_hpx3_type)) + return 0; + + return pcie_to_hpx3_type[pcie_type]; +} + +enum hpx_type3_fn_type { + HPX_FN_NORMAL = BIT(0), + HPX_FN_SRIOV_PHYS = BIT(1), + HPX_FN_SRIOV_VIRT = BIT(2), +}; + +static u8 hpx3_function_type(struct pci_dev *dev) +{ + if (dev->is_virtfn) + return HPX_FN_SRIOV_VIRT; + else if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV) > 0) + return HPX_FN_SRIOV_PHYS; + else + return HPX_FN_NORMAL; +} + +static bool hpx3_cap_ver_matches(u8 pcie_cap_id, u8 hpx3_cap_id) +{ + u8 cap_ver = hpx3_cap_id & 0xf; + + if ((hpx3_cap_id & BIT(4)) && cap_ver >= pcie_cap_id) + return true; + else if (cap_ver == pcie_cap_id) + return true; + + return false; +} + +enum hpx_type3_cfg_loc { + HPX_CFG_PCICFG = 0, + HPX_CFG_PCIE_CAP = 1, + HPX_CFG_PCIE_CAP_EXT = 2, + HPX_CFG_VEND_CAP = 3, + HPX_CFG_DVSEC = 4, + HPX_CFG_MAX, +}; + +static void program_hpx_type3_register(struct pci_dev *dev, + const struct hpx_type3 *reg) +{ + u32 match_reg, write_reg, header, orig_value; + u16 pos; + + if (!(hpx3_device_type(dev) & reg->device_type)) + return; + + if (!(hpx3_function_type(dev) & reg->function_type)) + return; + + switch (reg->config_space_location) { + case HPX_CFG_PCICFG: + pos = 0; + break; + case HPX_CFG_PCIE_CAP: + pos = pci_find_capability(dev, reg->pci_exp_cap_id); + if (pos == 0) + return; + + break; + case HPX_CFG_PCIE_CAP_EXT: + pos = pci_find_ext_capability(dev, reg->pci_exp_cap_id); + if (pos == 0) + return; + + pci_read_config_dword(dev, pos, &header); + if (!hpx3_cap_ver_matches(PCI_EXT_CAP_VER(header), + reg->pci_exp_cap_ver)) + return; + + break; + case HPX_CFG_VEND_CAP: /* Fall through */ + case HPX_CFG_DVSEC: /* Fall through */ + default: + pci_warn(dev, "Encountered _HPX type 3 with unsupported config space location"); + return; + } + + pci_read_config_dword(dev, pos + reg->match_offset, &match_reg); + + if ((match_reg & reg->match_mask_and) != reg->match_value) + return; + + pci_read_config_dword(dev, pos + reg->reg_offset, &write_reg); + orig_value = write_reg; + write_reg &= reg->reg_mask_and; + write_reg |= reg->reg_mask_or; + + if (orig_value == write_reg) + return; + + pci_write_config_dword(dev, pos + reg->reg_offset, write_reg); + + pci_dbg(dev, "Applied _HPX3 at [0x%x]: 0x%08x -> 0x%08x", + pos, orig_value, write_reg); +} + +void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) +{ + if (!hpx) + return; + + if (!pci_is_pcie(dev)) + return; + + program_hpx_type3_register(dev, hpx); +} + static void parse_hpx3_register(struct hpx_type3 *hpx3_reg, union acpi_object *reg_fields) { diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1be03a97cb92..dad43c64b350 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -608,4 +608,83 @@ static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } #endif +/* _HPX PCI Setting Record (Type 0); same as _HPP */ +struct hpx_type0 { + u32 revision; /* Not present in _HPP */ + u8 cache_line_size; /* Not applicable to PCIe */ + u8 latency_timer; /* Not applicable to PCIe */ + u8 enable_serr; + u8 enable_perr; +}; + +/* _HPX PCI-X Setting Record (Type 1) */ +struct hpx_type1 { + u32 revision; + u8 max_mem_read; + u8 avg_max_split; + u16 tot_max_split; +}; + +/* _HPX PCI Express Setting Record (Type 2) */ +struct hpx_type2 { + u32 revision; + u32 unc_err_mask_and; + u32 unc_err_mask_or; + u32 unc_err_sever_and; + u32 unc_err_sever_or; + u32 cor_err_mask_and; + u32 cor_err_mask_or; + u32 adv_err_cap_and; + u32 adv_err_cap_or; + u16 pci_exp_devctl_and; + u16 pci_exp_devctl_or; + u16 pci_exp_lnkctl_and; + u16 pci_exp_lnkctl_or; + u32 sec_unc_err_sever_and; + u32 sec_unc_err_sever_or; + u32 sec_unc_err_mask_and; + u32 sec_unc_err_mask_or; +}; + +/* _HPX PCI Express Setting Record (Type 3) */ +struct hpx_type3 { + u16 device_type; + u16 function_type; + u16 config_space_location; + u16 pci_exp_cap_id; + u16 pci_exp_cap_ver; + u16 pci_exp_vendor_id; + u16 dvsec_id; + u16 dvsec_rev; + u16 match_offset; + u32 match_mask_and; + u32 match_value; + u16 reg_offset; + u32 reg_mask_and; + u32 reg_mask_or; +}; + +void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx); +void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx); +void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx); +void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx); + +struct hotplug_program_ops { + void (*program_type0)(struct pci_dev *dev, struct hpx_type0 *hpx); + void (*program_type1)(struct pci_dev *dev, struct hpx_type1 *hpx); + void (*program_type2)(struct pci_dev *dev, struct hpx_type2 *hpx); + void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpx); +}; + +#ifdef CONFIG_ACPI +int pci_acpi_program_hp_params(struct pci_dev *dev, + const struct hotplug_program_ops *hp_ops); +#else +static inline int pci_acpi_program_hp_params(struct pci_dev *dev, + const struct hotplug_program_ops *hp_ops) +{ + return -ENODEV; +} +#endif + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 120c70b5003b..5847b557dc9a 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1920,275 +1920,6 @@ static void pci_configure_mps(struct pci_dev *dev) p_mps, mps, mpss); } -static struct hpx_type0 pci_default_type0 = { - .revision = 1, - .cache_line_size = 8, - .latency_timer = 0x40, - .enable_serr = 0, - .enable_perr = 0, -}; - -static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) -{ - u16 pci_cmd, pci_bctl; - - if (!hpx) - hpx = &pci_default_type0; - - if (hpx->revision > 1) { - pci_warn(dev, "PCI settings rev %d not supported; using defaults\n", - hpx->revision); - hpx = &pci_default_type0; - } - - pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer); - pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); - if (hpx->enable_serr) - pci_cmd |= PCI_COMMAND_SERR; - if (hpx->enable_perr) - pci_cmd |= PCI_COMMAND_PARITY; - pci_write_config_word(dev, PCI_COMMAND, pci_cmd); - - /* Program bridge control value */ - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { - pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, - hpx->latency_timer); - pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl); - if (hpx->enable_perr) - pci_bctl |= PCI_BRIDGE_CTL_PARITY; - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl); - } -} - -static void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) -{ - int pos; - - if (!hpx) - return; - - pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (!pos) - return; - - pci_warn(dev, "PCI-X settings not supported\n"); -} - -static bool pcie_root_rcb_set(struct pci_dev *dev) -{ - struct pci_dev *rp = pcie_find_root_port(dev); - u16 lnkctl; - - if (!rp) - return false; - - pcie_capability_read_word(rp, PCI_EXP_LNKCTL, &lnkctl); - if (lnkctl & PCI_EXP_LNKCTL_RCB) - return true; - - return false; -} - -static void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) -{ - int pos; - u32 reg32; - - if (!hpx) - return; - - if (!pci_is_pcie(dev)) - return; - - if (hpx->revision > 1) { - pci_warn(dev, "PCIe settings rev %d not supported\n", - hpx->revision); - return; - } - - /* - * Don't allow _HPX to change MPS or MRRS settings. We manage - * those to make sure they're consistent with the rest of the - * platform. - */ - hpx->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | - PCI_EXP_DEVCTL_READRQ; - hpx->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | - PCI_EXP_DEVCTL_READRQ); - - /* Initialize Device Control Register */ - pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, - ~hpx->pci_exp_devctl_and, hpx->pci_exp_devctl_or); - - /* Initialize Link Control Register */ - if (pcie_cap_has_lnkctl(dev)) { - - /* - * If the Root Port supports Read Completion Boundary of - * 128, set RCB to 128. Otherwise, clear it. - */ - hpx->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB; - hpx->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB; - if (pcie_root_rcb_set(dev)) - hpx->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB; - - pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL, - ~hpx->pci_exp_lnkctl_and, hpx->pci_exp_lnkctl_or); - } - - /* Find Advanced Error Reporting Enhanced Capability */ - pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); - if (!pos) - return; - - /* Initialize Uncorrectable Error Mask Register */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, ®32); - reg32 = (reg32 & hpx->unc_err_mask_and) | hpx->unc_err_mask_or; - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32); - - /* Initialize Uncorrectable Error Severity Register */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, ®32); - reg32 = (reg32 & hpx->unc_err_sever_and) | hpx->unc_err_sever_or; - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32); - - /* Initialize Correctable Error Mask Register */ - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, ®32); - reg32 = (reg32 & hpx->cor_err_mask_and) | hpx->cor_err_mask_or; - pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32); - - /* Initialize Advanced Error Capabilities and Control Register */ - pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); - reg32 = (reg32 & hpx->adv_err_cap_and) | hpx->adv_err_cap_or; - - /* Don't enable ECRC generation or checking if unsupported */ - if (!(reg32 & PCI_ERR_CAP_ECRC_GENC)) - reg32 &= ~PCI_ERR_CAP_ECRC_GENE; - if (!(reg32 & PCI_ERR_CAP_ECRC_CHKC)) - reg32 &= ~PCI_ERR_CAP_ECRC_CHKE; - pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); - - /* - * FIXME: The following two registers are not supported yet. - * - * o Secondary Uncorrectable Error Severity Register - * o Secondary Uncorrectable Error Mask Register - */ -} - -static u16 hpx3_device_type(struct pci_dev *dev) -{ - u16 pcie_type = pci_pcie_type(dev); - const int pcie_to_hpx3_type[] = { - [PCI_EXP_TYPE_ENDPOINT] = HPX_TYPE_ENDPOINT, - [PCI_EXP_TYPE_LEG_END] = HPX_TYPE_LEG_END, - [PCI_EXP_TYPE_RC_END] = HPX_TYPE_RC_END, - [PCI_EXP_TYPE_RC_EC] = HPX_TYPE_RC_EC, - [PCI_EXP_TYPE_ROOT_PORT] = HPX_TYPE_ROOT_PORT, - [PCI_EXP_TYPE_UPSTREAM] = HPX_TYPE_UPSTREAM, - [PCI_EXP_TYPE_DOWNSTREAM] = HPX_TYPE_DOWNSTREAM, - [PCI_EXP_TYPE_PCI_BRIDGE] = HPX_TYPE_PCI_BRIDGE, - [PCI_EXP_TYPE_PCIE_BRIDGE] = HPX_TYPE_PCIE_BRIDGE, - }; - - if (pcie_type >= ARRAY_SIZE(pcie_to_hpx3_type)) - return 0; - - return pcie_to_hpx3_type[pcie_type]; -} - -static u8 hpx3_function_type(struct pci_dev *dev) -{ - if (dev->is_virtfn) - return HPX_FN_SRIOV_VIRT; - else if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV) > 0) - return HPX_FN_SRIOV_PHYS; - else - return HPX_FN_NORMAL; -} - -static bool hpx3_cap_ver_matches(u8 pcie_cap_id, u8 hpx3_cap_id) -{ - u8 cap_ver = hpx3_cap_id & 0xf; - - if ((hpx3_cap_id & BIT(4)) && cap_ver >= pcie_cap_id) - return true; - else if (cap_ver == pcie_cap_id) - return true; - - return false; -} - -static void program_hpx_type3_register(struct pci_dev *dev, - const struct hpx_type3 *reg) -{ - u32 match_reg, write_reg, header, orig_value; - u16 pos; - - if (!(hpx3_device_type(dev) & reg->device_type)) - return; - - if (!(hpx3_function_type(dev) & reg->function_type)) - return; - - switch (reg->config_space_location) { - case HPX_CFG_PCICFG: - pos = 0; - break; - case HPX_CFG_PCIE_CAP: - pos = pci_find_capability(dev, reg->pci_exp_cap_id); - if (pos == 0) - return; - - break; - case HPX_CFG_PCIE_CAP_EXT: - pos = pci_find_ext_capability(dev, reg->pci_exp_cap_id); - if (pos == 0) - return; - - pci_read_config_dword(dev, pos, &header); - if (!hpx3_cap_ver_matches(PCI_EXT_CAP_VER(header), - reg->pci_exp_cap_ver)) - return; - - break; - case HPX_CFG_VEND_CAP: /* Fall through */ - case HPX_CFG_DVSEC: /* Fall through */ - default: - pci_warn(dev, "Encountered _HPX type 3 with unsupported config space location"); - return; - } - - pci_read_config_dword(dev, pos + reg->match_offset, &match_reg); - - if ((match_reg & reg->match_mask_and) != reg->match_value) - return; - - pci_read_config_dword(dev, pos + reg->reg_offset, &write_reg); - orig_value = write_reg; - write_reg &= reg->reg_mask_and; - write_reg |= reg->reg_mask_or; - - if (orig_value == write_reg) - return; - - pci_write_config_dword(dev, pos + reg->reg_offset, write_reg); - - pci_dbg(dev, "Applied _HPX3 at [0x%x]: 0x%08x -> 0x%08x", - pos, orig_value, write_reg); -} - -static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) -{ - if (!hpx) - return; - - if (!pci_is_pcie(dev)) - return; - - program_hpx_type3_register(dev, hpx); -} - int pci_configure_extended_tags(struct pci_dev *dev, void *ign) { struct pci_host_bridge *host; diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 247a49ee27b3..b482e42d7153 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -86,112 +86,14 @@ void pci_hp_deregister(struct hotplug_slot *slot); #define pci_hp_initialize(slot, bus, nr, name) \ __pci_hp_initialize(slot, bus, nr, name, THIS_MODULE, KBUILD_MODNAME) -/* _HPX PCI Setting Record (Type 0); same as _HPP */ -struct hpx_type0 { - u32 revision; /* Not present in _HPP */ - u8 cache_line_size; /* Not applicable to PCIe */ - u8 latency_timer; /* Not applicable to PCIe */ - u8 enable_serr; - u8 enable_perr; -}; - -/* _HPX PCI-X Setting Record (Type 1) */ -struct hpx_type1 { - u32 revision; - u8 max_mem_read; - u8 avg_max_split; - u16 tot_max_split; -}; - -/* _HPX PCI Express Setting Record (Type 2) */ -struct hpx_type2 { - u32 revision; - u32 unc_err_mask_and; - u32 unc_err_mask_or; - u32 unc_err_sever_and; - u32 unc_err_sever_or; - u32 cor_err_mask_and; - u32 cor_err_mask_or; - u32 adv_err_cap_and; - u32 adv_err_cap_or; - u16 pci_exp_devctl_and; - u16 pci_exp_devctl_or; - u16 pci_exp_lnkctl_and; - u16 pci_exp_lnkctl_or; - u32 sec_unc_err_sever_and; - u32 sec_unc_err_sever_or; - u32 sec_unc_err_mask_and; - u32 sec_unc_err_mask_or; -}; - -/* _HPX PCI Express Setting Record (Type 3) */ -struct hpx_type3 { - u16 device_type; - u16 function_type; - u16 config_space_location; - u16 pci_exp_cap_id; - u16 pci_exp_cap_ver; - u16 pci_exp_vendor_id; - u16 dvsec_id; - u16 dvsec_rev; - u16 match_offset; - u32 match_mask_and; - u32 match_value; - u16 reg_offset; - u32 reg_mask_and; - u32 reg_mask_or; -}; - -struct hotplug_program_ops { - void (*program_type0)(struct pci_dev *dev, struct hpx_type0 *hpx); - void (*program_type1)(struct pci_dev *dev, struct hpx_type1 *hpx); - void (*program_type2)(struct pci_dev *dev, struct hpx_type2 *hpx); - void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpx); -}; - -enum hpx_type3_dev_type { - HPX_TYPE_ENDPOINT = BIT(0), - HPX_TYPE_LEG_END = BIT(1), - HPX_TYPE_RC_END = BIT(2), - HPX_TYPE_RC_EC = BIT(3), - HPX_TYPE_ROOT_PORT = BIT(4), - HPX_TYPE_UPSTREAM = BIT(5), - HPX_TYPE_DOWNSTREAM = BIT(6), - HPX_TYPE_PCI_BRIDGE = BIT(7), - HPX_TYPE_PCIE_BRIDGE = BIT(8), -}; - -enum hpx_type3_fn_type { - HPX_FN_NORMAL = BIT(0), - HPX_FN_SRIOV_PHYS = BIT(1), - HPX_FN_SRIOV_VIRT = BIT(2), -}; - -enum hpx_type3_cfg_loc { - HPX_CFG_PCICFG = 0, - HPX_CFG_PCIE_CAP = 1, - HPX_CFG_PCIE_CAP_EXT = 2, - HPX_CFG_VEND_CAP = 3, - HPX_CFG_DVSEC = 4, - HPX_CFG_MAX, -}; - #ifdef CONFIG_ACPI #include -int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops); bool pciehp_is_native(struct pci_dev *bridge); int acpi_get_hp_hw_control_from_firmware(struct pci_dev *bridge); bool shpchp_is_native(struct pci_dev *bridge); int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle); int acpi_pci_detect_ejectable(acpi_handle handle); #else -static inline int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops) -{ - return -ENODEV; -} - static inline int acpi_get_hp_hw_control_from_firmware(struct pci_dev *bridge) { return 0; From 4a2dbeddd3d54484d2e9c12965e4f9bfa1a0ee36 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 27 Aug 2019 11:49:51 +0200 Subject: [PATCH 0259/2880] PCI/ACPI: Remove unnecessary struct hotplug_program_ops Move the ACPI-specific structs hpx_type0, hpx_type1, hpx_type2 and hpx_type3 to drivers/pci/pci-acpi.c as they are not used anywhere else. Then remove the struct hotplug_program_ops that has been shared between drivers/pci/probe.c and drivers/pci/pci-acpi.c from drivers/pci/pci.h as it is no longer needed. The struct hotplug_program_ops was added by 87fcf12e846a ("PCI/ACPI: Remove the need for 'struct hotplug_params'") and replaced previously used struct hotplug_params enabling the support for the _HPX Type 3 Setting Record that was added by f873c51a155a ("PCI/ACPI: Implement _HPX Type 3 Setting Record"). The new struct allowed for the static functions such program_hpx_type0(), program_hpx_type1(), etc., from the drivers/pci/probe.c to be called from the function pci_acpi_program_hp_params() in the drivers/pci/pci-acpi.c. Previously a programming of _HPX Type 0 was as follows: drivers/pci/probe.c: program_hpx_type0() ... pci_configure_device() hp_ops = { .program_type0 = program_hpx_type0, ... } pci_acpi_program_hp_params(&hp_ops) drivers/pci/pci-acpi.c: pci_acpi_program_hp_params(&hp_ops) acpi_run_hpx(hp_ops) decode_type0_hpx_record() hp_ops->program_type0 # program_hpx_type0() called via hp_ops After the ACPI-specific functions, structs, enums, etc., have been moved to drivers/pci/pci-acpi.c there is no need for the hotplug_program_ops as all of the _HPX Type 0, 1, 2 and 3 are directly accessible. Link: https://lore.kernel.org/r/20190827094951.10613-4-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-acpi.c | 95 ++++++++++++++++++++++++++++++++---------- drivers/pci/pci.h | 74 +------------------------------- drivers/pci/probe.c | 9 +--- 3 files changed, 76 insertions(+), 102 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 0bfabb3f9931..6e65d1ff93da 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -118,6 +118,15 @@ phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle) return (phys_addr_t)mcfg_addr; } +/* _HPX PCI Setting Record (Type 0); same as _HPP */ +struct hpx_type0 { + u32 revision; /* Not present in _HPP */ + u8 cache_line_size; /* Not applicable to PCIe */ + u8 latency_timer; /* Not applicable to PCIe */ + u8 enable_serr; + u8 enable_perr; +}; + static struct hpx_type0 pci_default_type0 = { .revision = 1, .cache_line_size = 8, @@ -126,7 +135,7 @@ static struct hpx_type0 pci_default_type0 = { .enable_perr = 0, }; -void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) +static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) { u16 pci_cmd, pci_bctl; @@ -187,7 +196,15 @@ static acpi_status decode_type0_hpx_record(union acpi_object *record, return AE_OK; } -void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) +/* _HPX PCI-X Setting Record (Type 1) */ +struct hpx_type1 { + u32 revision; + u8 max_mem_read; + u8 avg_max_split; + u16 tot_max_split; +}; + +static void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) { int pos; @@ -243,7 +260,28 @@ static bool pcie_root_rcb_set(struct pci_dev *dev) return false; } -void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) +/* _HPX PCI Express Setting Record (Type 2) */ +struct hpx_type2 { + u32 revision; + u32 unc_err_mask_and; + u32 unc_err_mask_or; + u32 unc_err_sever_and; + u32 unc_err_sever_or; + u32 cor_err_mask_and; + u32 cor_err_mask_or; + u32 adv_err_cap_and; + u32 adv_err_cap_or; + u16 pci_exp_devctl_and; + u16 pci_exp_devctl_or; + u16 pci_exp_lnkctl_and; + u16 pci_exp_lnkctl_or; + u32 sec_unc_err_sever_and; + u32 sec_unc_err_sever_or; + u32 sec_unc_err_mask_and; + u32 sec_unc_err_mask_or; +}; + +static void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) { int pos; u32 reg32; @@ -369,6 +407,24 @@ static acpi_status decode_type2_hpx_record(union acpi_object *record, return AE_OK; } +/* _HPX PCI Express Setting Record (Type 3) */ +struct hpx_type3 { + u16 device_type; + u16 function_type; + u16 config_space_location; + u16 pci_exp_cap_id; + u16 pci_exp_cap_ver; + u16 pci_exp_vendor_id; + u16 dvsec_id; + u16 dvsec_rev; + u16 match_offset; + u32 match_mask_and; + u32 match_value; + u16 reg_offset; + u32 reg_mask_and; + u32 reg_mask_or; +}; + enum hpx_type3_dev_type { HPX_TYPE_ENDPOINT = BIT(0), HPX_TYPE_LEG_END = BIT(1), @@ -498,7 +554,7 @@ static void program_hpx_type3_register(struct pci_dev *dev, pos, orig_value, write_reg); } -void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) +static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) { if (!hpx) return; @@ -529,8 +585,7 @@ static void parse_hpx3_register(struct hpx_type3 *hpx3_reg, } static acpi_status program_type3_hpx_record(struct pci_dev *dev, - union acpi_object *record, - const struct hotplug_program_ops *hp_ops) + union acpi_object *record) { union acpi_object *fields = record->package.elements; u32 desc_count, expected_length, revision; @@ -554,7 +609,7 @@ static acpi_status program_type3_hpx_record(struct pci_dev *dev, for (i = 0; i < desc_count; i++) { reg_fields = fields + 3 + i * 14; parse_hpx3_register(&hpx3, reg_fields); - hp_ops->program_type3(dev, &hpx3); + program_hpx_type3(dev, &hpx3); } break; @@ -567,8 +622,7 @@ static acpi_status program_type3_hpx_record(struct pci_dev *dev, return AE_OK; } -static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, - const struct hotplug_program_ops *hp_ops) +static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle) { acpi_status status; struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; @@ -610,24 +664,24 @@ static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, status = decode_type0_hpx_record(record, &hpx0); if (ACPI_FAILURE(status)) goto exit; - hp_ops->program_type0(dev, &hpx0); + program_hpx_type0(dev, &hpx0); break; case 1: memset(&hpx1, 0, sizeof(hpx1)); status = decode_type1_hpx_record(record, &hpx1); if (ACPI_FAILURE(status)) goto exit; - hp_ops->program_type1(dev, &hpx1); + program_hpx_type1(dev, &hpx1); break; case 2: memset(&hpx2, 0, sizeof(hpx2)); status = decode_type2_hpx_record(record, &hpx2); if (ACPI_FAILURE(status)) goto exit; - hp_ops->program_type2(dev, &hpx2); + program_hpx_type2(dev, &hpx2); break; case 3: - status = program_type3_hpx_record(dev, record, hp_ops); + status = program_type3_hpx_record(dev, record); if (ACPI_FAILURE(status)) goto exit; break; @@ -643,8 +697,7 @@ static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, return status; } -static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, - const struct hotplug_program_ops *hp_ops) +static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle) { acpi_status status; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -679,20 +732,18 @@ static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, hpx0.enable_serr = fields[2].integer.value; hpx0.enable_perr = fields[3].integer.value; - hp_ops->program_type0(dev, &hpx0); + program_hpx_type0(dev, &hpx0); exit: kfree(buffer.pointer); return status; } -/* pci_get_hp_params +/* pci_acpi_program_hp_params * * @dev - the pci_dev for which we want parameters - * @hpp - allocated by the caller */ -int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops) +int pci_acpi_program_hp_params(struct pci_dev *dev) { acpi_status status; acpi_handle handle, phandle; @@ -715,10 +766,10 @@ int pci_acpi_program_hp_params(struct pci_dev *dev, * this pci dev. */ while (handle) { - status = acpi_run_hpx(dev, handle, hp_ops); + status = acpi_run_hpx(dev, handle); if (ACPI_SUCCESS(status)) return 0; - status = acpi_run_hpp(dev, handle, hp_ops); + status = acpi_run_hpp(dev, handle); if (ACPI_SUCCESS(status)) return 0; if (acpi_is_root_bridge(handle)) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index dad43c64b350..97180274c60b 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -608,80 +608,10 @@ static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } #endif -/* _HPX PCI Setting Record (Type 0); same as _HPP */ -struct hpx_type0 { - u32 revision; /* Not present in _HPP */ - u8 cache_line_size; /* Not applicable to PCIe */ - u8 latency_timer; /* Not applicable to PCIe */ - u8 enable_serr; - u8 enable_perr; -}; - -/* _HPX PCI-X Setting Record (Type 1) */ -struct hpx_type1 { - u32 revision; - u8 max_mem_read; - u8 avg_max_split; - u16 tot_max_split; -}; - -/* _HPX PCI Express Setting Record (Type 2) */ -struct hpx_type2 { - u32 revision; - u32 unc_err_mask_and; - u32 unc_err_mask_or; - u32 unc_err_sever_and; - u32 unc_err_sever_or; - u32 cor_err_mask_and; - u32 cor_err_mask_or; - u32 adv_err_cap_and; - u32 adv_err_cap_or; - u16 pci_exp_devctl_and; - u16 pci_exp_devctl_or; - u16 pci_exp_lnkctl_and; - u16 pci_exp_lnkctl_or; - u32 sec_unc_err_sever_and; - u32 sec_unc_err_sever_or; - u32 sec_unc_err_mask_and; - u32 sec_unc_err_mask_or; -}; - -/* _HPX PCI Express Setting Record (Type 3) */ -struct hpx_type3 { - u16 device_type; - u16 function_type; - u16 config_space_location; - u16 pci_exp_cap_id; - u16 pci_exp_cap_ver; - u16 pci_exp_vendor_id; - u16 dvsec_id; - u16 dvsec_rev; - u16 match_offset; - u32 match_mask_and; - u32 match_value; - u16 reg_offset; - u32 reg_mask_and; - u32 reg_mask_or; -}; - -void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx); -void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx); -void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx); -void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx); - -struct hotplug_program_ops { - void (*program_type0)(struct pci_dev *dev, struct hpx_type0 *hpx); - void (*program_type1)(struct pci_dev *dev, struct hpx_type1 *hpx); - void (*program_type2)(struct pci_dev *dev, struct hpx_type2 *hpx); - void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpx); -}; - #ifdef CONFIG_ACPI -int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops); +int pci_acpi_program_hp_params(struct pci_dev *dev); #else -static inline int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops) +static inline int pci_acpi_program_hp_params(struct pci_dev *dev) { return -ENODEV; } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 5847b557dc9a..cdf34a3c531a 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2100,13 +2100,6 @@ static void pci_configure_serr(struct pci_dev *dev) static void pci_configure_device(struct pci_dev *dev) { - static const struct hotplug_program_ops hp_ops = { - .program_type0 = program_hpx_type0, - .program_type1 = program_hpx_type1, - .program_type2 = program_hpx_type2, - .program_type3 = program_hpx_type3, - }; - pci_configure_mps(dev); pci_configure_extended_tags(dev, NULL); pci_configure_relaxed_ordering(dev); @@ -2114,7 +2107,7 @@ static void pci_configure_device(struct pci_dev *dev) pci_configure_eetlp_prefix(dev); pci_configure_serr(dev); - pci_acpi_program_hp_params(dev, &hp_ops); + pci_acpi_program_hp_params(dev); } static void pci_release_capabilities(struct pci_dev *dev) From 2b86e3aaf993a3ea6c73dfcf86143061a40c62e6 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 28 Aug 2019 21:13:45 -0400 Subject: [PATCH 0260/2880] nfsd: eliminate an unnecessary acl size limit We're unnecessarily limiting the size of an ACL to less than what most filesystems will support. Some users do hit the limit and it's confusing and unnecessary. It still seems prudent to impose some limit on the number of ACEs the client gives us before passing it straight to kmalloc(). So, let's just limit it to the maximum number that would be possible given the amount of data left in the argument buffer. That will still leave one limit beyond whatever the filesystem imposes: the client and server negotiate a limit on the size of a request, which we have to respect. But we're no longer imposing any additional arbitrary limit. struct nfs4_ace is 20 bytes on my system and the maximum call size we'll negotiate is about a megabyte, so in practice this is limiting the allocation here to about a megabyte. Reported-by: "de Vandiere, Louis" Signed-off-by: J. Bruce Fields --- fs/nfsd/acl.h | 8 -------- fs/nfsd/nfs4xdr.c | 14 +++++++++++++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 4cd7c69a6cb9..ba14d2f4b64f 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -39,14 +39,6 @@ struct nfs4_acl; struct svc_fh; struct svc_rqst; -/* - * Maximum ACL we'll accept from a client; chosen (somewhat - * arbitrarily) so that kmalloc'ing the ACL shouldn't require a - * high-order allocation. This allows 204 ACEs on x86_64: - */ -#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ - / sizeof(struct nfs4_ace)) - int nfs4_acl_bytes(int entries); int nfs4_acl_get_whotype(char *, u32); __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 565d2169902c..c1fc2641e3e7 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -204,6 +204,13 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) return p; } +static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp) +{ + unsigned int this = (char *)argp->end - (char *)argp->p; + + return this + argp->pagelen; +} + static int zero_clientid(clientid_t *clid) { return (clid->cl_boot == 0) && (clid->cl_id == 0); @@ -348,7 +355,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(4); len += 4; nace = be32_to_cpup(p++); - if (nace > NFS4_ACL_MAX) + if (nace > compoundargs_bytes_left(argp)/20) + /* + * Even with 4-byte names there wouldn't be + * space for that many aces; something fishy is + * going on: + */ return nfserr_fbig; *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); From cbc0425d3dd370a6f0bf23589dc7b6955a53a9ce Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 25 Aug 2019 09:58:16 -0400 Subject: [PATCH 0261/2880] sefltest/ima: support appended signatures (modsig) In addition to the PE/COFF and IMA xattr signatures, the kexec kernel image can be signed with an appended signature, using the same scripts/sign-file tool that is used to sign kernel modules. This patch adds support for detecting a kernel image signed with an appended signature and updates the existing test messages appropriately. Reviewed-by: Petr Vorel Acked-by: Shuah Khan Reviewed-by: Thiago Jung Bauermann Reviewed-by: Jordan Hand (x86_64 QEMU) Tested-by: Jordan Hand (x86_64 QEMU) Signed-off-by: Mimi Zohar --- .../selftests/kexec/test_kexec_file_load.sh | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kexec/test_kexec_file_load.sh b/tools/testing/selftests/kexec/test_kexec_file_load.sh index fa7c24e8eefb..2ff600388c30 100755 --- a/tools/testing/selftests/kexec/test_kexec_file_load.sh +++ b/tools/testing/selftests/kexec/test_kexec_file_load.sh @@ -37,11 +37,20 @@ is_ima_sig_required() # sequentially. As a result, a policy rule may be defined, but # might not necessarily be used. This test assumes if a policy # rule is specified, that is the intent. + + # First check for appended signature (modsig), then xattr if [ $ima_read_policy -eq 1 ]; then check_ima_policy "appraise" "func=KEXEC_KERNEL_CHECK" \ - "appraise_type=imasig" + "appraise_type=imasig|modsig" ret=$? - [ $ret -eq 1 ] && log_info "IMA signature required"; + if [ $ret -eq 1 ]; then + log_info "IMA or appended(modsig) signature required" + else + check_ima_policy "appraise" "func=KEXEC_KERNEL_CHECK" \ + "appraise_type=imasig" + ret=$? + [ $ret -eq 1 ] && log_info "IMA signature required"; + fi fi return $ret } @@ -84,6 +93,22 @@ check_for_imasig() return $ret } +# Return 1 for appended signature (modsig) found and 0 for not found. +check_for_modsig() +{ + local module_sig_string="~Module signature appended~" + local sig="$(tail --bytes $((${#module_sig_string} + 1)) $KERNEL_IMAGE)" + local ret=0 + + if [ "$sig" == "$module_sig_string" ]; then + ret=1 + log_info "kexec kernel image modsig signed" + else + log_info "kexec kernel image not modsig signed" + fi + return $ret +} + kexec_file_load_test() { local succeed_msg="kexec_file_load succeeded" @@ -98,7 +123,8 @@ kexec_file_load_test() # In secureboot mode with an architecture specific # policy, make sure either an IMA or PE signature exists. if [ $secureboot -eq 1 ] && [ $arch_policy -eq 1 ] && \ - [ $ima_signed -eq 0 ] && [ $pe_signed -eq 0 ]; then + [ $ima_signed -eq 0 ] && [ $pe_signed -eq 0 ] \ + && [ $ima_modsig -eq 0 ]; then log_fail "$succeed_msg (missing sig)" fi @@ -107,7 +133,8 @@ kexec_file_load_test() log_fail "$succeed_msg (missing PE sig)" fi - if [ $ima_sig_required -eq 1 ] && [ $ima_signed -eq 0 ]; then + if [ $ima_sig_required -eq 1 ] && [ $ima_signed -eq 0 ] \ + && [ $ima_modsig -eq 0 ]; then log_fail "$succeed_msg (missing IMA sig)" fi @@ -204,5 +231,8 @@ pe_signed=$? check_for_imasig ima_signed=$? +check_for_modsig +ima_modsig=$? + # Test loading the kernel image via kexec_file_load syscall kexec_file_load_test From fa5b57175364431245b006c2afcbf94dc2b15400 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 29 May 2019 11:53:43 -0500 Subject: [PATCH 0262/2880] ima: use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; struct boo entry[]; }; instance = kzalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_template.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index f5b950e0a955..6aa6408603e3 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -306,9 +306,8 @@ static int ima_restore_template_data(struct ima_template_desc *template_desc, int ret = 0; int i; - *entry = kzalloc(sizeof(**entry) + - template_desc->num_fields * sizeof(struct ima_field_data), - GFP_NOFS); + *entry = kzalloc(struct_size(*entry, template_data, + template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; From 2a7f0e53daf29ca6dc9fbe2a27158f13474ec1b5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 29 Aug 2019 12:29:16 -0500 Subject: [PATCH 0263/2880] ima: ima_api: Use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct ima_template_entry { ... struct ima_field_data template_data[0]; /* template related data */ }; instance = kzalloc(sizeof(struct ima_template_entry) + count * sizeof(struct ima_field_data), GFP_NOFS); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_NOFS); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index 65224474675b..610759fe63b8 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -45,8 +45,8 @@ int ima_alloc_init_template(struct ima_event_data *event_data, else template_desc = ima_template_desc_current(); - *entry = kzalloc(sizeof(**entry) + template_desc->num_fields * - sizeof(struct ima_field_data), GFP_NOFS); + *entry = kzalloc(struct_size(*entry, template_data, + template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; From ca85ee7457dc60771ec0173bfed58671177a74ab Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 27 Aug 2019 19:04:14 +0200 Subject: [PATCH 0264/2880] dt-bindings: i2c: bcm2835: Add brcm,bcm2711 compatible Add a new compatible for the BCM2711, which hasn't the clock stretch bug. Signed-off-by: Stefan Wahren Reviewed-by: Eric Anholt Reviewed-by: Rob Herring Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt b/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt index e9de3756752b..c9a6587fe4bb 100644 --- a/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt +++ b/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt @@ -1,7 +1,9 @@ Broadcom BCM2835 I2C controller Required properties: -- compatible : Should be "brcm,bcm2835-i2c". +- compatible : Should be one of: + "brcm,bcm2711-i2c" + "brcm,bcm2835-i2c" - reg: Should contain register location and length. - interrupts: Should contain interrupt. - clocks : The clock feeding the I2C controller. From 67de10fbaa12dc7d46412c5ab80f784ea8b4d7f8 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 27 Aug 2019 19:04:15 +0200 Subject: [PATCH 0265/2880] i2c: bcm2835: Avoid clk stretch quirk for BCM2711 The I2C block on the BCM2711 isn't affected by the clk stretching bug. So there is no need to apply the corresponding quirk. Signed-off-by: Stefan Wahren Reviewed-by: Eric Anholt Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm2835.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c index 67752f7b0371..ab5502f11109 100644 --- a/drivers/i2c/busses/i2c-bcm2835.c +++ b/drivers/i2c/busses/i2c-bcm2835.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -389,7 +390,7 @@ static const struct i2c_algorithm bcm2835_i2c_algo = { }; /* - * This HW was reported to have problems with clock stretching: + * The BCM2835 was reported to have problems with clock stretching: * http://www.advamation.com/knowhow/raspberrypi/rpi-i2c-bug.html * https://www.raspberrypi.org/forums/viewtopic.php?p=146272 */ @@ -475,7 +476,7 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) adap->algo = &bcm2835_i2c_algo; adap->dev.parent = &pdev->dev; adap->dev.of_node = pdev->dev.of_node; - adap->quirks = &bcm2835_i2c_quirks; + adap->quirks = of_device_get_match_data(&pdev->dev); bcm2835_i2c_writel(i2c_dev, BCM2835_I2C_C, 0); @@ -501,7 +502,8 @@ static int bcm2835_i2c_remove(struct platform_device *pdev) } static const struct of_device_id bcm2835_i2c_of_match[] = { - { .compatible = "brcm,bcm2835-i2c" }, + { .compatible = "brcm,bcm2711-i2c" }, + { .compatible = "brcm,bcm2835-i2c", .data = &bcm2835_i2c_quirks }, {}, }; MODULE_DEVICE_TABLE(of, bcm2835_i2c_of_match); From 250212b59a8e33f1b47f15ad9afa3db19ba0fc8d Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 27 Aug 2019 19:04:16 +0200 Subject: [PATCH 0266/2880] i2c: bcm2835: Add full name of devicetree node to adapter name Inspired by Lori Hikichi's patch for iproc, this adds the full name of the devicetree node to the adapter name. With the introduction of BCM2711 it's very difficult to distinguish between the multiple instances. Signed-off-by: Stefan Wahren Acked-by: Scott Branden Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm2835.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c index ab5502f11109..e01b2b57e724 100644 --- a/drivers/i2c/busses/i2c-bcm2835.c +++ b/drivers/i2c/busses/i2c-bcm2835.c @@ -472,7 +472,8 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) i2c_set_adapdata(adap, i2c_dev); adap->owner = THIS_MODULE; adap->class = I2C_CLASS_DEPRECATED; - strlcpy(adap->name, "bcm2835 I2C adapter", sizeof(adap->name)); + snprintf(adap->name, sizeof(adap->name), "bcm2835 (%s)", + of_node_full_name(pdev->dev.of_node)); adap->algo = &bcm2835_i2c_algo; adap->dev.parent = &pdev->dev; adap->dev.of_node = pdev->dev.of_node; From 4768e90ecaec6b503ff64229bda5d91186d2edd3 Mon Sep 17 00:00:00 2001 From: Max Staudt Date: Tue, 20 Aug 2019 11:27:39 +0200 Subject: [PATCH 0267/2880] i2c: Add i2c-icy for I2C on m68k/Amiga This is the i2c-icy driver for the ICY board for Amiga computers. It connects a PCF8584 I2C controller to the Zorro bus, providing I2C connectivity. The original documentation can be found on Aminet: https://aminet.net/package/docs/hard/icy IRQ support is currently not implemented, as i2c-algo-pcf is built for the ISA bus and a straight implementation of the same stack locks up a Zorro machine. Signed-off-by: Max Staudt Reviewed-by: Geert Uytterhoeven [wsa: added a missing newline reported by checkpatch] Signed-off-by: Wolfram Sang --- MAINTAINERS | 6 ++ drivers/i2c/busses/Kconfig | 11 +++ drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-icy.c | 173 +++++++++++++++++++++++++++++++++++ 4 files changed, 191 insertions(+) create mode 100644 drivers/i2c/busses/i2c-icy.c diff --git a/MAINTAINERS b/MAINTAINERS index f89f27a2d09a..97d35f4e2068 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7750,6 +7750,12 @@ S: Maintained F: drivers/mfd/lpc_ich.c F: drivers/gpio/gpio-ich.c +ICY I2C DRIVER +M: Max Staudt +L: linux-i2c@vger.kernel.org +S: Maintained +F: drivers/i2c/busses/i2c-icy.c + IDE SUBSYSTEM M: "David S. Miller" L: linux-ide@vger.kernel.org diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 69f19312a574..8b97ba4a2c0c 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -1309,6 +1309,17 @@ config I2C_ELEKTOR This support is also available as a module. If so, the module will be called i2c-elektor. +config I2C_ICY + tristate "ICY Zorro card" + depends on ZORRO + select I2C_ALGOPCF + help + This supports the PCF8584 Zorro bus I2C adapter, known as ICY. + Say Y if you own such an adapter. + + This support is also available as a module. If so, the module + will be called i2c-icy. + config I2C_MLXCPLD tristate "Mellanox I2C driver" depends on X86_64 diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 80c23895eaaf..3ab8aebc39c9 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -140,6 +140,7 @@ obj-$(CONFIG_I2C_BCM_KONA) += i2c-bcm-kona.o obj-$(CONFIG_I2C_BRCMSTB) += i2c-brcmstb.o obj-$(CONFIG_I2C_CROS_EC_TUNNEL) += i2c-cros-ec-tunnel.o obj-$(CONFIG_I2C_ELEKTOR) += i2c-elektor.o +obj-$(CONFIG_I2C_ICY) += i2c-icy.o obj-$(CONFIG_I2C_MLXCPLD) += i2c-mlxcpld.o obj-$(CONFIG_I2C_OPAL) += i2c-opal.o obj-$(CONFIG_I2C_PCA_ISA) += i2c-pca-isa.o diff --git a/drivers/i2c/busses/i2c-icy.c b/drivers/i2c/busses/i2c-icy.c new file mode 100644 index 000000000000..0d61936dfc0b --- /dev/null +++ b/drivers/i2c/busses/i2c-icy.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I2C driver for stand-alone PCF8584 style adapters on Zorro cards + * + * Original ICY documentation can be found on Aminet: + * https://aminet.net/package/docs/hard/icy + * + * There has been a modern community re-print of this design in 2019: + * https://www.a1k.org/forum/index.php?threads/70106/ + * + * The card is basically a Philips PCF8584 connected straight to the + * beginning of the AutoConfig'd address space (register S1 on base+2), + * with /INT on /INT2 on the Zorro bus. + * + * Copyright (c) 2019 Max Staudt + * + * This started as a fork of i2c-elektor.c and has evolved since. + * Thanks go to its authors for providing a base to grow on. + * + * + * IRQ support is currently not implemented. + * + * As it turns out, i2c-algo-pcf is really written with i2c-elektor's + * edge-triggered ISA interrupts in mind, while the Amiga's Zorro bus has + * level-triggered interrupts. This means that once an interrupt occurs, we + * have to tell the PCF8584 to shut up immediately, or it will keep the + * interrupt line busy and cause an IRQ storm. + + * However, because of the PCF8584's host-side protocol, there is no good + * way to just quieten it without side effects. Rather, we have to perform + * the next read/write operation straight away, which will reset the /INT + * pin. This entails re-designing the core of i2c-algo-pcf in the future. + * For now, we never request an IRQ from the PCF8584, and poll it instead. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "../algos/i2c-algo-pcf.h" + +struct icy_i2c { + struct i2c_adapter adapter; + + void __iomem *reg_s0; + void __iomem *reg_s1; +}; + +/* + * Functions called by i2c-algo-pcf + */ +static void icy_pcf_setpcf(void *data, int ctl, int val) +{ + struct icy_i2c *i2c = (struct icy_i2c *)data; + + u8 __iomem *address = ctl ? i2c->reg_s1 : i2c->reg_s0; + + z_writeb(val, address); +} + +static int icy_pcf_getpcf(void *data, int ctl) +{ + struct icy_i2c *i2c = (struct icy_i2c *)data; + + u8 __iomem *address = ctl ? i2c->reg_s1 : i2c->reg_s0; + + return z_readb(address); +} + +static int icy_pcf_getown(void *data) +{ + return 0x55; +} + +static int icy_pcf_getclock(void *data) +{ + return 0x1c; +} + +static void icy_pcf_waitforpin(void *data) +{ + usleep_range(50, 150); +} + +/* + * Main i2c-icy part + */ +static int icy_probe(struct zorro_dev *z, + const struct zorro_device_id *ent) +{ + struct icy_i2c *i2c; + struct i2c_algo_pcf_data *algo_data; + + i2c = devm_kzalloc(&z->dev, sizeof(*i2c), GFP_KERNEL); + if (!i2c) + return -ENOMEM; + + algo_data = devm_kzalloc(&z->dev, sizeof(*algo_data), GFP_KERNEL); + if (!algo_data) + return -ENOMEM; + + dev_set_drvdata(&z->dev, i2c); + i2c->adapter.dev.parent = &z->dev; + i2c->adapter.owner = THIS_MODULE; + /* i2c->adapter.algo assigned by i2c_pcf_add_bus() */ + i2c->adapter.algo_data = algo_data; + strlcpy(i2c->adapter.name, "ICY I2C Zorro adapter", + sizeof(i2c->adapter.name)); + + if (!devm_request_mem_region(&z->dev, + z->resource.start, + 4, i2c->adapter.name)) + return -ENXIO; + + /* Driver private data */ + i2c->reg_s0 = ZTWO_VADDR(z->resource.start); + i2c->reg_s1 = ZTWO_VADDR(z->resource.start + 2); + + algo_data->data = i2c; + algo_data->setpcf = icy_pcf_setpcf; + algo_data->getpcf = icy_pcf_getpcf; + algo_data->getown = icy_pcf_getown; + algo_data->getclock = icy_pcf_getclock; + algo_data->waitforpin = icy_pcf_waitforpin; + + if (i2c_pcf_add_bus(&i2c->adapter)) { + dev_err(&z->dev, "i2c_pcf_add_bus() failed\n"); + return -ENXIO; + } + + dev_info(&z->dev, "ICY I2C controller at %pa, IRQ not implemented\n", + &z->resource.start); + + return 0; +} + +static void icy_remove(struct zorro_dev *z) +{ + struct icy_i2c *i2c = dev_get_drvdata(&z->dev); + + i2c_del_adapter(&i2c->adapter); +} + +static const struct zorro_device_id icy_zorro_tbl[] = { + { ZORRO_ID(VMC, 15, 0), }, + { 0 } +}; + +MODULE_DEVICE_TABLE(zorro, icy_zorro_tbl); + +static struct zorro_driver icy_driver = { + .name = "i2c-icy", + .id_table = icy_zorro_tbl, + .probe = icy_probe, + .remove = icy_remove, +}; + +module_driver(icy_driver, + zorro_register_driver, + zorro_unregister_driver); + +MODULE_AUTHOR("Max Staudt "); +MODULE_DESCRIPTION("I2C bus via PCF8584 on ICY Zorro card"); +MODULE_LICENSE("GPL v2"); From 724041ae15ed9639b72bb54dabfac3279c6b4e55 Mon Sep 17 00:00:00 2001 From: Max Staudt Date: Mon, 19 Aug 2019 14:16:18 +0200 Subject: [PATCH 0268/2880] i2c: icy: Add LTC2990 present on 2019 board revision Since the 2019 a1k.org community re-print of these PCBs sports an LTC2990 hwmon chip as an example use case, let this driver autoprobe for that as well. If it is present, modprobing ltc2990 is sufficient. The property_entry enables the three additional inputs available on this particular board: in1 will be the voltage of the 5V rail, divided by 2. in2 will be the voltage of the 12V rail, divided by 4. temp3 will be measured using a PCB loop next the chip. Signed-off-by: Max Staudt Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 3 ++ drivers/i2c/busses/i2c-icy.c | 57 ++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 8b97ba4a2c0c..ee7f415a625a 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -1320,6 +1320,9 @@ config I2C_ICY This support is also available as a module. If so, the module will be called i2c-icy. + If you have a 2019 edition board with an LTC2990 sensor at address + 0x4c, loading the module 'ltc2990' is sufficient to enable it. + config I2C_MLXCPLD tristate "Mellanox I2C driver" depends on X86_64 diff --git a/drivers/i2c/busses/i2c-icy.c b/drivers/i2c/busses/i2c-icy.c index 0d61936dfc0b..8382eb64b424 100644 --- a/drivers/i2c/busses/i2c-icy.c +++ b/drivers/i2c/busses/i2c-icy.c @@ -53,6 +53,8 @@ struct icy_i2c { void __iomem *reg_s0; void __iomem *reg_s1; + struct fwnode_handle *ltc2990_fwnode; + struct i2c_client *ltc2990_client; }; /* @@ -94,11 +96,34 @@ static void icy_pcf_waitforpin(void *data) /* * Main i2c-icy part */ +static unsigned short const icy_ltc2990_addresses[] = { + 0x4c, 0x4d, 0x4e, 0x4f, I2C_CLIENT_END +}; + +/* + * Additional sensors exposed once this property is applied: + * + * in1 will be the voltage of the 5V rail, divided by 2. + * in2 will be the voltage of the 12V rail, divided by 4. + * temp3 will be measured using a PCB loop next the chip. + */ +static const u32 icy_ltc2990_meas_mode[] = {0, 3}; + +static const struct property_entry icy_ltc2990_props[] = { + PROPERTY_ENTRY_U32_ARRAY("lltc,meas-mode", icy_ltc2990_meas_mode), + { } +}; + static int icy_probe(struct zorro_dev *z, const struct zorro_device_id *ent) { struct icy_i2c *i2c; struct i2c_algo_pcf_data *algo_data; + struct fwnode_handle *new_fwnode; + struct i2c_board_info ltc2990_info = { + .type = "ltc2990", + .addr = 0x4c, + }; i2c = devm_kzalloc(&z->dev, sizeof(*i2c), GFP_KERNEL); if (!i2c) @@ -140,6 +165,35 @@ static int icy_probe(struct zorro_dev *z, dev_info(&z->dev, "ICY I2C controller at %pa, IRQ not implemented\n", &z->resource.start); + /* + * The 2019 a1k.org PCBs have an LTC2990 at 0x4c, so start + * it automatically once ltc2990 is modprobed. + * + * in0 is the voltage of the internal 5V power supply. + * temp1 is the temperature inside the chip. + * + * See property_entry above for in1, in2, temp3. + */ + new_fwnode = fwnode_create_software_node(icy_ltc2990_props, NULL); + if (IS_ERR(new_fwnode)) { + dev_info(&z->dev, "Failed to create fwnode for LTC2990, error: %ld\n", + PTR_ERR(new_fwnode)); + } else { + /* + * Store the fwnode so we can destroy it on .remove(). + * Only store it on success, as fwnode_remove_software_node() + * is NULL safe, but not PTR_ERR safe. + */ + i2c->ltc2990_fwnode = new_fwnode; + ltc2990_info.fwnode = new_fwnode; + + i2c->ltc2990_client = + i2c_new_probed_device(&i2c->adapter, + <c2990_info, + icy_ltc2990_addresses, + NULL); + } + return 0; } @@ -147,6 +201,9 @@ static void icy_remove(struct zorro_dev *z) { struct icy_i2c *i2c = dev_get_drvdata(&z->dev); + i2c_unregister_device(i2c->ltc2990_client); + fwnode_remove_software_node(i2c->ltc2990_fwnode); + i2c_del_adapter(&i2c->adapter); } From f0b576801d83cecab29888010017333babd61ede Mon Sep 17 00:00:00 2001 From: "Adamski, Krzysztof (Nokia - PL/Wroclaw)" Date: Mon, 19 Aug 2019 09:07:07 +0000 Subject: [PATCH 0269/2880] i2c: axxia: support slave mode This device contains both master and slave controllers which can be enabled simultaneously. Both controllers share the same SDA/SCL lines and interrupt source but has separate control and status registers. Controllers also works in loopback mode - slave device can communicate with its own master controller internally. The controller can handle up to two addresses, both of which may be 10 bit. Most of the logic (sending (N)ACK, handling repeated start or switching between write/read) is handled automatically which makes working with this controller quite easy. For simplicity, this patch adds basic support, limiting to only one slave address. Support for the 2nd device may be added in the future. Note that synchronize_irq() is used to ensure any running slave interrupt is finished to make sure slave i2c_client structure can be safely used by i2c_slave_event. Signed-off-by: Krzysztof Adamski Reviewed-by: Alexander Sverdlin Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 1 + drivers/i2c/busses/i2c-axxia.c | 152 +++++++++++++++++++++++++++++++-- 2 files changed, 145 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index ee7f415a625a..d3023e5379ea 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -429,6 +429,7 @@ config I2C_AXXIA tristate "Axxia I2C controller" depends on ARCH_AXXIA || COMPILE_TEST default ARCH_AXXIA + select I2C_SLAVE help Say yes if you want to support the I2C bus on Axxia platforms. diff --git a/drivers/i2c/busses/i2c-axxia.c b/drivers/i2c/busses/i2c-axxia.c index ff3142b15cab..0214daa913ff 100644 --- a/drivers/i2c/busses/i2c-axxia.c +++ b/drivers/i2c/busses/i2c-axxia.c @@ -77,6 +77,40 @@ MST_STATUS_IP) #define MST_TX_BYTES_XFRD 0x50 #define MST_RX_BYTES_XFRD 0x54 +#define SLV_ADDR_DEC_CTL 0x58 +#define SLV_ADDR_DEC_GCE BIT(0) /* ACK to General Call Address from own master (loopback) */ +#define SLV_ADDR_DEC_OGCE BIT(1) /* ACK to General Call Address from external masters */ +#define SLV_ADDR_DEC_SA1E BIT(2) /* ACK to addr_1 enabled */ +#define SLV_ADDR_DEC_SA1M BIT(3) /* 10-bit addressing for addr_1 enabled */ +#define SLV_ADDR_DEC_SA2E BIT(4) /* ACK to addr_2 enabled */ +#define SLV_ADDR_DEC_SA2M BIT(5) /* 10-bit addressing for addr_2 enabled */ +#define SLV_ADDR_1 0x5c +#define SLV_ADDR_2 0x60 +#define SLV_RX_CTL 0x64 +#define SLV_RX_ACSA1 BIT(0) /* Generate ACK for writes to addr_1 */ +#define SLV_RX_ACSA2 BIT(1) /* Generate ACK for writes to addr_2 */ +#define SLV_RX_ACGCA BIT(2) /* ACK data phase transfers to General Call Address */ +#define SLV_DATA 0x68 +#define SLV_RX_FIFO 0x6c +#define SLV_FIFO_DV1 BIT(0) /* Data Valid for addr_1 */ +#define SLV_FIFO_DV2 BIT(1) /* Data Valid for addr_2 */ +#define SLV_FIFO_AS BIT(2) /* (N)ACK Sent */ +#define SLV_FIFO_TNAK BIT(3) /* Timeout NACK */ +#define SLV_FIFO_STRC BIT(4) /* First byte after start condition received */ +#define SLV_FIFO_RSC BIT(5) /* Repeated Start Condition */ +#define SLV_FIFO_STPC BIT(6) /* Stop Condition */ +#define SLV_FIFO_DV (SLV_FIFO_DV1 | SLV_FIFO_DV2) +#define SLV_INT_ENABLE 0x70 +#define SLV_INT_STATUS 0x74 +#define SLV_STATUS_RFH BIT(0) /* FIFO service */ +#define SLV_STATUS_WTC BIT(1) /* Write transfer complete */ +#define SLV_STATUS_SRS1 BIT(2) /* Slave read from addr 1 */ +#define SLV_STATUS_SRRS1 BIT(3) /* Repeated start from addr 1 */ +#define SLV_STATUS_SRND1 BIT(4) /* Read request not following start condition */ +#define SLV_STATUS_SRC1 BIT(5) /* Read canceled */ +#define SLV_STATUS_SRAT1 BIT(6) /* Slave Read timed out */ +#define SLV_STATUS_SRDRE1 BIT(7) /* Data written after timed out */ +#define SLV_READ_DUMMY 0x78 #define SCL_HIGH_PERIOD 0x80 #define SCL_LOW_PERIOD 0x84 #define SPIKE_FLTR_LEN 0x88 @@ -111,6 +145,8 @@ struct axxia_i2c_dev { struct clk *i2c_clk; u32 bus_clk_rate; bool last; + struct i2c_client *slave; + int irq; }; static void i2c_int_disable(struct axxia_i2c_dev *idev, u32 mask) @@ -276,13 +312,65 @@ static int axxia_i2c_fill_tx_fifo(struct axxia_i2c_dev *idev) return ret; } +static void axxia_i2c_slv_fifo_event(struct axxia_i2c_dev *idev) +{ + u32 fifo_status = readl(idev->base + SLV_RX_FIFO); + u8 val; + + dev_dbg(idev->dev, "slave irq fifo_status=0x%x\n", fifo_status); + + if (fifo_status & SLV_FIFO_DV1) { + if (fifo_status & SLV_FIFO_STRC) + i2c_slave_event(idev->slave, + I2C_SLAVE_WRITE_REQUESTED, &val); + + val = readl(idev->base + SLV_DATA); + i2c_slave_event(idev->slave, I2C_SLAVE_WRITE_RECEIVED, &val); + } + if (fifo_status & SLV_FIFO_STPC) { + readl(idev->base + SLV_DATA); /* dummy read */ + i2c_slave_event(idev->slave, I2C_SLAVE_STOP, &val); + } + if (fifo_status & SLV_FIFO_RSC) + readl(idev->base + SLV_DATA); /* dummy read */ +} + +static irqreturn_t axxia_i2c_slv_isr(struct axxia_i2c_dev *idev) +{ + u32 status = readl(idev->base + SLV_INT_STATUS); + u8 val; + + dev_dbg(idev->dev, "slave irq status=0x%x\n", status); + + if (status & SLV_STATUS_RFH) + axxia_i2c_slv_fifo_event(idev); + if (status & SLV_STATUS_SRS1) { + i2c_slave_event(idev->slave, I2C_SLAVE_READ_REQUESTED, &val); + writel(val, idev->base + SLV_DATA); + } + if (status & SLV_STATUS_SRND1) { + i2c_slave_event(idev->slave, I2C_SLAVE_READ_PROCESSED, &val); + writel(val, idev->base + SLV_DATA); + } + if (status & SLV_STATUS_SRC1) + i2c_slave_event(idev->slave, I2C_SLAVE_STOP, &val); + + writel(INT_SLV, idev->base + INTERRUPT_STATUS); + return IRQ_HANDLED; +} + static irqreturn_t axxia_i2c_isr(int irq, void *_dev) { struct axxia_i2c_dev *idev = _dev; + irqreturn_t ret = IRQ_NONE; u32 status; - if (!(readl(idev->base + INTERRUPT_STATUS) & INT_MST)) - return IRQ_NONE; + status = readl(idev->base + INTERRUPT_STATUS); + + if (status & INT_SLV) + ret = axxia_i2c_slv_isr(idev); + if (!(status & INT_MST)) + return ret; /* Read interrupt status bits */ status = readl(idev->base + MST_INT_STATUS); @@ -583,9 +671,58 @@ static u32 axxia_i2c_func(struct i2c_adapter *adap) return caps; } +static int axxia_i2c_reg_slave(struct i2c_client *slave) +{ + struct axxia_i2c_dev *idev = i2c_get_adapdata(slave->adapter); + u32 slv_int_mask = SLV_STATUS_RFH; + u32 dec_ctl; + + if (idev->slave) + return -EBUSY; + + idev->slave = slave; + + /* Enable slave mode as well */ + writel(GLOBAL_MST_EN | GLOBAL_SLV_EN, idev->base + GLOBAL_CONTROL); + writel(INT_MST | INT_SLV, idev->base + INTERRUPT_ENABLE); + + /* Set slave address */ + dec_ctl = SLV_ADDR_DEC_SA1E; + if (slave->flags & I2C_CLIENT_TEN) + dec_ctl |= SLV_ADDR_DEC_SA1M; + + writel(SLV_RX_ACSA1, idev->base + SLV_RX_CTL); + writel(dec_ctl, idev->base + SLV_ADDR_DEC_CTL); + writel(slave->addr, idev->base + SLV_ADDR_1); + + /* Enable interrupts */ + slv_int_mask |= SLV_STATUS_SRS1 | SLV_STATUS_SRRS1 | SLV_STATUS_SRND1; + slv_int_mask |= SLV_STATUS_SRC1; + writel(slv_int_mask, idev->base + SLV_INT_ENABLE); + + return 0; +} + +static int axxia_i2c_unreg_slave(struct i2c_client *slave) +{ + struct axxia_i2c_dev *idev = i2c_get_adapdata(slave->adapter); + + /* Disable slave mode */ + writel(GLOBAL_MST_EN, idev->base + GLOBAL_CONTROL); + writel(INT_MST, idev->base + INTERRUPT_ENABLE); + + synchronize_irq(idev->irq); + + idev->slave = NULL; + + return 0; +} + static const struct i2c_algorithm axxia_i2c_algo = { .master_xfer = axxia_i2c_xfer, .functionality = axxia_i2c_func, + .reg_slave = axxia_i2c_reg_slave, + .unreg_slave = axxia_i2c_unreg_slave, }; static const struct i2c_adapter_quirks axxia_i2c_quirks = { @@ -599,7 +736,6 @@ static int axxia_i2c_probe(struct platform_device *pdev) struct axxia_i2c_dev *idev = NULL; struct resource *res; void __iomem *base; - int irq; int ret = 0; idev = devm_kzalloc(&pdev->dev, sizeof(*idev), GFP_KERNEL); @@ -611,10 +747,10 @@ static int axxia_i2c_probe(struct platform_device *pdev) if (IS_ERR(base)) return PTR_ERR(base); - irq = platform_get_irq(pdev, 0); - if (irq < 0) { + idev->irq = platform_get_irq(pdev, 0); + if (idev->irq < 0) { dev_err(&pdev->dev, "missing interrupt resource\n"); - return irq; + return idev->irq; } idev->i2c_clk = devm_clk_get(&pdev->dev, "i2c"); @@ -643,10 +779,10 @@ static int axxia_i2c_probe(struct platform_device *pdev) goto error_disable_clk; } - ret = devm_request_irq(&pdev->dev, irq, axxia_i2c_isr, 0, + ret = devm_request_irq(&pdev->dev, idev->irq, axxia_i2c_isr, 0, pdev->name, idev); if (ret) { - dev_err(&pdev->dev, "failed to claim IRQ%d\n", irq); + dev_err(&pdev->dev, "failed to claim IRQ%d\n", idev->irq); goto error_disable_clk; } From 21aa3983d619055cc3ecd0f0ceea4981dc8ee751 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 15 Aug 2019 17:29:43 +0300 Subject: [PATCH 0270/2880] i2c: designware-pci: Switch over to MSI interrupts Some devices support MSI interrupts. Let's at least try to use them in platforms that provide MSI capability. Signed-off-by: Felipe Balbi Signed-off-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-pcidrv.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index 7d2e6959679c..249ee3ee2a09 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -225,6 +225,8 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, return r; } + pci_set_master(pdev); + r = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev)); if (r) { dev_err(&pdev->dev, "I/O memory remapping failed\n"); @@ -235,18 +237,24 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, if (!dev) return -ENOMEM; + r = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (r < 0) + return r; + dev->clk = NULL; dev->controller = controller; dev->get_clk_rate_khz = i2c_dw_get_clk_rate_khz; dev->base = pcim_iomap_table(pdev)[0]; dev->dev = &pdev->dev; - dev->irq = pdev->irq; + dev->irq = pci_irq_vector(pdev, 0); dev->flags |= controller->flags; if (controller->setup) { r = controller->setup(pdev, controller); - if (r) + if (r) { + pci_free_irq_vectors(pdev); return r; + } } dev->functionality = controller->functionality | @@ -274,8 +282,10 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, adap->nr = controller->bus_num; r = i2c_dw_probe(dev); - if (r) + if (r) { + pci_free_irq_vectors(pdev); return r; + } pm_runtime_set_autosuspend_delay(&pdev->dev, 1000); pm_runtime_use_autosuspend(&pdev->dev); @@ -294,6 +304,7 @@ static void i2c_dw_pci_remove(struct pci_dev *pdev) pm_runtime_get_noresume(&pdev->dev); i2c_del_adapter(&dev->adapter); + pci_free_irq_vectors(pdev); } /* work with hotplug and coldplug */ From 70fb95e213147a88417eaae5b93ba6e59be087c0 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Thu, 15 Aug 2019 17:29:44 +0300 Subject: [PATCH 0271/2880] i2c: designware-pci: Add support for Elkhart Lake PSE I2C Add support for Intel(R) Programmable Services Engine (Intel(R) PSE) I2C controller in Intel Elkhart Lake when interface is assigned to the host processor. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-pcidrv.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index 249ee3ee2a09..050adda7c1bd 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -33,6 +33,7 @@ enum dw_pci_ctl_id_t { baytrail, cherrytrail, haswell, + elkhartlake, }; struct dw_scl_sda_cfg { @@ -168,6 +169,14 @@ static struct dw_pci_controller dw_pci_controllers[] = { .flags = MODEL_CHERRYTRAIL, .scl_sda_cfg = &byt_config, }, + [elkhartlake] = { + .bus_num = -1, + .bus_cfg = INTEL_MID_STD_CFG | DW_IC_CON_SPEED_FAST, + .tx_fifo_depth = 32, + .rx_fifo_depth = 32, + .functionality = I2C_FUNC_10BIT_ADDR, + .clk_khz = 100000, + }, }; #ifdef CONFIG_PM @@ -340,6 +349,15 @@ static const struct pci_device_id i2_designware_pci_ids[] = { { PCI_VDEVICE(INTEL, 0x22C5), cherrytrail }, { PCI_VDEVICE(INTEL, 0x22C6), cherrytrail }, { PCI_VDEVICE(INTEL, 0x22C7), cherrytrail }, + /* Elkhart Lake (PSE I2C) */ + { PCI_VDEVICE(INTEL, 0x4bb9), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bba), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbb), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbc), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbd), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbe), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbf), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bc0), elkhartlake }, { 0,} }; MODULE_DEVICE_TABLE(pci, i2_designware_pci_ids); From f9bf7a899412b5f8c96b084343bde4168dc12efa Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Mon, 19 Aug 2019 13:16:01 +0530 Subject: [PATCH 0272/2880] i2c: taos-evm: Make structure tsl2550_info constant Static structure tsl2550_info, of type i2c_board_info, is referenced only twice: the first time in arguments to dev_info() (which does not modify it) and the second time as the last argument to function i2c_new_device() (where the corresponding parameter is declared as const). As tsl2550_info is therefore never modified, make it const to protect it from unintended modifications. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-taos-evm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-taos-evm.c b/drivers/i2c/busses/i2c-taos-evm.c index c82e78f57386..056df6b2538a 100644 --- a/drivers/i2c/busses/i2c-taos-evm.c +++ b/drivers/i2c/busses/i2c-taos-evm.c @@ -39,7 +39,7 @@ struct taos_data { }; /* TAOS TSL2550 EVM */ -static struct i2c_board_info tsl2550_info = { +static const struct i2c_board_info tsl2550_info = { I2C_BOARD_INFO("tsl2550", 0x39), }; From 71dc297ca9ab6311e7d39fc53811fd871d51854f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 19 Aug 2019 13:24:23 +0300 Subject: [PATCH 0273/2880] i2c: designware: assert reset when error happen at ->probe() The commit c62ebb3d5f0d ("i2c: designware: Add support for an interface clock") introduced an optional clock while missed correct error handling. assert reset line back if error happen at ->probe(). Fixes: c62ebb3d5f0d ("i2c: designware: Add support for an interface clock") Signed-off-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index ddfb81872906..4624ef8fbae8 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -346,8 +346,10 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) /* Optional interface clock */ dev->pclk = devm_clk_get_optional(&pdev->dev, "pclk"); - if (IS_ERR(dev->pclk)) - return PTR_ERR(dev->pclk); + if (IS_ERR(dev->pclk)) { + ret = PTR_ERR(dev->pclk); + goto exit_reset; + } dev->clk = devm_clk_get(&pdev->dev, NULL); if (!i2c_dw_prepare_clk(dev, true)) { From a6af48ec0712a0c98d8abe6b47c655b26026fceb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 19 Aug 2019 13:31:30 +0300 Subject: [PATCH 0274/2880] i2c: designware: Fix optional reset error handling The commit bb475230b8e5 ("reset: make optional functions really optional") brought a missed part of the support for an optional reset handlers. Since that we don't need to have special error handling in the driver. Signed-off-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 4624ef8fbae8..16dd338877d0 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -279,12 +279,10 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dev); dev->rst = devm_reset_control_get_optional_exclusive(&pdev->dev, NULL); - if (IS_ERR(dev->rst)) { - if (PTR_ERR(dev->rst) == -EPROBE_DEFER) - return -EPROBE_DEFER; - } else { - reset_control_deassert(dev->rst); - } + if (IS_ERR(dev->rst)) + return PTR_ERR(dev->rst); + + reset_control_deassert(dev->rst); t = &dev->timings; if (pdata) @@ -402,8 +400,7 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) exit_probe: dw_i2c_plat_pm_cleanup(dev); exit_reset: - if (!IS_ERR_OR_NULL(dev->rst)) - reset_control_assert(dev->rst); + reset_control_assert(dev->rst); return ret; } @@ -421,8 +418,7 @@ static int dw_i2c_plat_remove(struct platform_device *pdev) pm_runtime_put_sync(&pdev->dev); dw_i2c_plat_pm_cleanup(dev); - if (!IS_ERR_OR_NULL(dev->rst)) - reset_control_assert(dev->rst); + reset_control_assert(dev->rst); return 0; } From ba919403566dba52bf074851896ba0ca7f72c1e2 Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Thu, 22 Aug 2019 15:21:32 +0200 Subject: [PATCH 0275/2880] i2c: ocores: use request_any_context_irq() to register IRQ handler The i2c-ocores device is an HDL component that get instantiated in FPGA. The software stack used to drive an FPGA can be very different, and the i2c-ocore ip-core must work in different context. With respect to this patch the IRQ controller behind this device, and its driver, can have different implementations (nested threads). For this reason, it is safer to use `request_any_context_irq()` to avoid errors at probe time. Signed-off-by: Federico Vaga Reviewed-by: Andrew Lunn Reviewed-by: Peter Korsgaard Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ocores.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index 4117f1abc7c6..ca8b3ecfa93d 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -703,8 +703,9 @@ static int ocores_i2c_probe(struct platform_device *pdev) } if (ocores_algorithm.master_xfer != ocores_xfer_polling) { - ret = devm_request_irq(&pdev->dev, irq, ocores_isr, 0, - pdev->name, i2c); + ret = devm_request_any_context_irq(&pdev->dev, irq, + ocores_isr, 0, + pdev->name, i2c); if (ret) { dev_err(&pdev->dev, "Cannot claim IRQ\n"); goto err_clk; From 528d53a1592b0e27c423f7cafc1df85f77fc1163 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 2 Aug 2019 14:54:38 +0200 Subject: [PATCH 0276/2880] i2c: piix4: Fix probing of reserved ports on AMD Family 16h Model 30h Prevent bus timeouts and resets on Family 16h Model 30h by not probing reserved Ports 3 and 4. According to the AMD BIOS and Kernel Developer's Guides (BKDG), Port 3 and Port 4 are reserved on the following devices: - Family 15h Model 60h-6Fh - Family 15h Model 70h-7Fh - Family 16h Model 30h-3Fh Based on earlier work by Andrew Cooks. Reported-by: Andrew Cooks Signed-off-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-piix4.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index c46c4bddc7ca..d9f7e771f6ad 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -72,7 +72,8 @@ #define PIIX4_BLOCK_DATA 0x14 /* Multi-port constants */ -#define PIIX4_MAX_ADAPTERS 4 +#define PIIX4_MAX_ADAPTERS 4 +#define HUDSON2_MAIN_PORTS 2 /* HUDSON2, KERNCZ reserves ports 3, 4 */ /* SB800 constants */ #define SB800_PIIX4_SMB_IDX 0xcd6 @@ -808,6 +809,7 @@ MODULE_DEVICE_TABLE (pci, piix4_ids); static struct i2c_adapter *piix4_main_adapters[PIIX4_MAX_ADAPTERS]; static struct i2c_adapter *piix4_aux_adapter; +static int piix4_adapter_count; static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba, bool sb800_main, u8 port, bool notify_imc, @@ -867,7 +869,15 @@ static int piix4_add_adapters_sb800(struct pci_dev *dev, unsigned short smba, int port; int retval; - for (port = 0; port < PIIX4_MAX_ADAPTERS; port++) { + if (dev->device == PCI_DEVICE_ID_AMD_KERNCZ_SMBUS || + (dev->device == PCI_DEVICE_ID_AMD_HUDSON2_SMBUS && + dev->revision >= 0x1F)) { + piix4_adapter_count = HUDSON2_MAIN_PORTS; + } else { + piix4_adapter_count = PIIX4_MAX_ADAPTERS; + } + + for (port = 0; port < piix4_adapter_count; port++) { retval = piix4_add_adapter(dev, smba, true, port, notify_imc, piix4_main_port_names_sb800[port], &piix4_main_adapters[port]); @@ -989,7 +999,7 @@ static void piix4_adap_remove(struct i2c_adapter *adap) static void piix4_remove(struct pci_dev *dev) { - int port = PIIX4_MAX_ADAPTERS; + int port = piix4_adapter_count; while (--port >= 0) { if (piix4_main_adapters[port]) { From 0183eb8bb59d45f26ec4fc73aaa416067fe6c0be Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 2 Aug 2019 14:55:26 +0200 Subject: [PATCH 0277/2880] i2c: piix4: Add ACPI support Enable the i2c-piix4 SMBus controller driver to enumerate I2C slave devices using ACPI. It builds on the related I2C mux device work in commit 8eb5c87a92c0 ("i2c: add ACPI support for I2C mux ports") In the i2c-piix4 driver the adapters are enumerated as: Main SMBus adapter Port 0, Port 2, ..., aux port (i.e., ASF adapter) However, in the AMD BKDG documentation[1], the implied order of ports is: Main SMBus adapter Port 0, ASF adapter, Port 2, Port 3, ... This ordering difference is unfortunate. We assume that ACPI developers will use the AMD documentation ordering, so we have to pass an extra parameter to piix4_add_adapter(). [1] 52740 BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h Models 30h-3Fh Processors Based on earlier work by Andrew Cooks. Reported-by: Andrew Cooks Signed-off-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-piix4.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index d9f7e771f6ad..890b8d029b28 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -813,7 +813,8 @@ static int piix4_adapter_count; static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba, bool sb800_main, u8 port, bool notify_imc, - const char *name, struct i2c_adapter **padap) + u8 hw_port_nr, const char *name, + struct i2c_adapter **padap) { struct i2c_adapter *adap; struct i2c_piix4_adapdata *adapdata; @@ -845,6 +846,12 @@ static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba, /* set up the sysfs linkage to our parent device */ adap->dev.parent = &dev->dev; + if (has_acpi_companion(&dev->dev)) { + acpi_preset_companion(&adap->dev, + ACPI_COMPANION(&dev->dev), + hw_port_nr); + } + snprintf(adap->name, sizeof(adap->name), "SMBus PIIX4 adapter%s at %04x", name, smba); @@ -878,7 +885,10 @@ static int piix4_add_adapters_sb800(struct pci_dev *dev, unsigned short smba, } for (port = 0; port < piix4_adapter_count; port++) { + u8 hw_port_nr = port == 0 ? 0 : port + 1; + retval = piix4_add_adapter(dev, smba, true, port, notify_imc, + hw_port_nr, piix4_main_port_names_sb800[port], &piix4_main_adapters[port]); if (retval < 0) @@ -949,8 +959,8 @@ static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id) return retval; /* Try to register main SMBus adapter, give up if we can't */ - retval = piix4_add_adapter(dev, retval, false, 0, false, "", - &piix4_main_adapters[0]); + retval = piix4_add_adapter(dev, retval, false, 0, false, 0, + "", &piix4_main_adapters[0]); if (retval < 0) return retval; } @@ -976,7 +986,7 @@ static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id) if (retval > 0) { /* Try to add the aux adapter if it exists, * piix4_add_adapter will clean up if this fails */ - piix4_add_adapter(dev, retval, false, 0, false, + piix4_add_adapter(dev, retval, false, 0, false, 1, is_sb800 ? piix4_aux_port_name_sb800 : "", &piix4_aux_adapter); } From d3b9f659fac6e5f74f857364c5ed08b94e7f94ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Aug 2019 16:03:47 +0200 Subject: [PATCH 0278/2880] microblaze/nommu: use the generic uncached segment support Stop providing our own arch alloc/free hooks for nommu platforms and just expose the segment offset and use the generic dma-direct allocator. Signed-off-by: Christoph Hellwig Signed-off-by: Michal Simek --- arch/microblaze/Kconfig | 2 + arch/microblaze/mm/consistent.c | 93 +++++++++++++++------------------ 2 files changed, 43 insertions(+), 52 deletions(-) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index d411de05b628..a0d749c309f3 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -5,9 +5,11 @@ config MICROBLAZE select ARCH_NO_SWAP select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_DMA_COHERENT_TO_PFN if MMU + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_UNCACHED_SEGMENT if !MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_COHERENT_DMA_MMAP if !MMU select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index bc7042209c57..1a859e8b58c2 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -42,21 +42,48 @@ #include #include -#ifndef CONFIG_MMU -/* I have to use dcache values because I can't relate on ram size */ -# define UNCACHED_SHADOW_MASK (cpuinfo.dcache_high - cpuinfo.dcache_base + 1) -#endif +void arch_dma_prep_coherent(struct page *page, size_t size) +{ + phys_addr_t paddr = page_to_phys(page); + flush_dcache_range(paddr, paddr + size); +} + +#ifndef CONFIG_MMU /* - * Consistent memory allocators. Used for DMA devices that want to - * share uncached memory with the processor core. - * My crufty no-MMU approach is simple. In the HW platform we can optionally - * mirror the DDR up above the processor cacheable region. So, memory accessed - * in this mirror region will not be cached. It's alloced from the same - * pool as normal memory, but the handle we return is shifted up into the - * uncached region. This will no doubt cause big problems if memory allocated - * here is not also freed properly. -- JW + * Consistent memory allocators. Used for DMA devices that want to share + * uncached memory with the processor core. My crufty no-MMU approach is + * simple. In the HW platform we can optionally mirror the DDR up above the + * processor cacheable region. So, memory accessed in this mirror region will + * not be cached. It's alloced from the same pool as normal memory, but the + * handle we return is shifted up into the uncached region. This will no doubt + * cause big problems if memory allocated here is not also freed properly. -- JW + * + * I have to use dcache values because I can't relate on ram size: */ +#ifdef CONFIG_XILINX_UNCACHED_SHADOW +#define UNCACHED_SHADOW_MASK (cpuinfo.dcache_high - cpuinfo.dcache_base + 1) +#else +#define UNCACHED_SHADOW_MASK 0 +#endif /* CONFIG_XILINX_UNCACHED_SHADOW */ + +void *uncached_kernel_address(void *ptr) +{ + unsigned long addr = (unsigned long)ptr; + + addr |= UNCACHED_SHADOW_MASK; + if (addr > cpuinfo.dcache_base && addr < cpuinfo.dcache_high) + pr_warn("ERROR: Your cache coherent area is CACHED!!!\n"); + return (void *)addr; +} + +void *cached_kernel_address(void *ptr) +{ + unsigned long addr = (unsigned long)ptr; + + return (void *)(addr & ~UNCACHED_SHADOW_MASK); +} +#else /* CONFIG_MMU */ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { @@ -64,12 +91,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, void *ret; unsigned int i, err = 0; struct page *page, *end; - -#ifdef CONFIG_MMU phys_addr_t pa; struct vm_struct *area; unsigned long va; -#endif if (in_interrupt()) BUG(); @@ -86,26 +110,8 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, * we need to ensure that there are no cachelines in use, * or worse dirty in this area. */ - flush_dcache_range(virt_to_phys((void *)vaddr), - virt_to_phys((void *)vaddr) + size); + arch_dma_prep_coherent(virt_to_page((unsigned long)vaddr), size); -#ifndef CONFIG_MMU - ret = (void *)vaddr; - /* - * Here's the magic! Note if the uncached shadow is not implemented, - * it's up to the calling code to also test that condition and make - * other arranegments, such as manually flushing the cache and so on. - */ -# ifdef CONFIG_XILINX_UNCACHED_SHADOW - ret = (void *)((unsigned) ret | UNCACHED_SHADOW_MASK); -# endif - if ((unsigned int)ret > cpuinfo.dcache_base && - (unsigned int)ret < cpuinfo.dcache_high) - pr_warn("ERROR: Your cache coherent area is CACHED!!!\n"); - - /* dma_handle is same as physical (shadowed) address */ - *dma_handle = (dma_addr_t)ret; -#else /* Allocate some common virtual space to map the new pages. */ area = get_vm_area(size, VM_ALLOC); if (!area) { @@ -117,7 +123,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* This gives us the real physical address of the first page. */ *dma_handle = pa = __virt_to_phys(vaddr); -#endif /* * free wasted pages. We skip the first page since we know @@ -131,10 +136,8 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, split_page(page, order); for (i = 0; i < size && err == 0; i += PAGE_SIZE) { -#ifdef CONFIG_MMU /* MS: This is the whole magic - use cache inhibit pages */ err = map_page(va + i, pa + i, _PAGE_KERNEL | _PAGE_NO_CACHE); -#endif SetPageReserved(page); page++; @@ -154,7 +157,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, return ret; } -#ifdef CONFIG_MMU static pte_t *consistent_virt_to_pte(void *vaddr) { unsigned long addr = (unsigned long)vaddr; @@ -172,7 +174,6 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, return pte_pfn(*ptep); } -#endif /* * free page(s) as defined by the above mapping. @@ -187,18 +188,6 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, size = PAGE_ALIGN(size); -#ifndef CONFIG_MMU - /* Clear SHADOW_MASK bit in address, and free as per usual */ -# ifdef CONFIG_XILINX_UNCACHED_SHADOW - vaddr = (void *)((unsigned)vaddr & ~UNCACHED_SHADOW_MASK); -# endif - page = virt_to_page(vaddr); - - do { - __free_reserved_page(page); - page++; - } while (size -= PAGE_SIZE); -#else do { pte_t *ptep = consistent_virt_to_pte(vaddr); unsigned long pfn; @@ -216,5 +205,5 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, /* flush tlb */ flush_tlb_all(); -#endif } +#endif /* CONFIG_MMU */ From 04e3543e228fcbb000de922ec5d366f398bdd9ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Aug 2019 16:03:48 +0200 Subject: [PATCH 0279/2880] microblaze: use the generic dma coherent remap allocator This switches to using common code for the DMA allocations, including potential use of the CMA allocator if configured. Switching to the generic code enables DMA allocations from atomic context, which is required by the DMA API documentation, and also adds various other minor features drivers start relying upon. It also makes sure we have on tested code base for all architectures that require uncached pte bits for coherent DMA allocations. Signed-off-by: Christoph Hellwig Signed-off-by: Michal Simek --- arch/microblaze/Kconfig | 1 + arch/microblaze/mm/consistent.c | 152 +------------------------------- 2 files changed, 5 insertions(+), 148 deletions(-) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index a0d749c309f3..e477896fbae6 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -17,6 +17,7 @@ config MICROBLAZE select TIMER_OF select CLONE_BACKWARDS3 select COMMON_CLK + select DMA_DIRECT_REMAP if MMU select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index 1a859e8b58c2..0e0f733eb846 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -4,43 +4,16 @@ * Copyright (C) 2010 Michal Simek * Copyright (C) 2010 PetaLogix * Copyright (C) 2005 John Williams - * - * Based on PowerPC version derived from arch/arm/mm/consistent.c - * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) - * Copyright (C) 2000 Russell King */ -#include -#include -#include #include -#include #include #include -#include -#include #include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include #include - -#include -#include -#include -#include -#include -#include -#include #include -#include +#include void arch_dma_prep_coherent(struct page *page, size_t size) { @@ -84,126 +57,9 @@ void *cached_kernel_address(void *ptr) return (void *)(addr & ~UNCACHED_SHADOW_MASK); } #else /* CONFIG_MMU */ -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) +static int __init atomic_pool_init(void) { - unsigned long order, vaddr; - void *ret; - unsigned int i, err = 0; - struct page *page, *end; - phys_addr_t pa; - struct vm_struct *area; - unsigned long va; - - if (in_interrupt()) - BUG(); - - /* Only allocate page size areas. */ - size = PAGE_ALIGN(size); - order = get_order(size); - - vaddr = __get_free_pages(gfp | __GFP_ZERO, order); - if (!vaddr) - return NULL; - - /* - * we need to ensure that there are no cachelines in use, - * or worse dirty in this area. - */ - arch_dma_prep_coherent(virt_to_page((unsigned long)vaddr), size); - - /* Allocate some common virtual space to map the new pages. */ - area = get_vm_area(size, VM_ALLOC); - if (!area) { - free_pages(vaddr, order); - return NULL; - } - va = (unsigned long) area->addr; - ret = (void *)va; - - /* This gives us the real physical address of the first page. */ - *dma_handle = pa = __virt_to_phys(vaddr); - - /* - * free wasted pages. We skip the first page since we know - * that it will have count = 1 and won't require freeing. - * We also mark the pages in use as reserved so that - * remap_page_range works. - */ - page = virt_to_page(vaddr); - end = page + (1 << order); - - split_page(page, order); - - for (i = 0; i < size && err == 0; i += PAGE_SIZE) { - /* MS: This is the whole magic - use cache inhibit pages */ - err = map_page(va + i, pa + i, _PAGE_KERNEL | _PAGE_NO_CACHE); - - SetPageReserved(page); - page++; - } - - /* Free the otherwise unused pages. */ - while (page < end) { - __free_page(page); - page++; - } - - if (err) { - free_pages(vaddr, order); - return NULL; - } - - return ret; -} - -static pte_t *consistent_virt_to_pte(void *vaddr) -{ - unsigned long addr = (unsigned long)vaddr; - - return pte_offset_kernel(pmd_offset(pgd_offset_k(addr), addr), addr); -} - -long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, - dma_addr_t dma_addr) -{ - pte_t *ptep = consistent_virt_to_pte(vaddr); - - if (pte_none(*ptep) || !pte_present(*ptep)) - return 0; - - return pte_pfn(*ptep); -} - -/* - * free page(s) as defined by the above mapping. - */ -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, unsigned long attrs) -{ - struct page *page; - - if (in_interrupt()) - BUG(); - - size = PAGE_ALIGN(size); - - do { - pte_t *ptep = consistent_virt_to_pte(vaddr); - unsigned long pfn; - - if (!pte_none(*ptep) && pte_present(*ptep)) { - pfn = pte_pfn(*ptep); - pte_clear(&init_mm, (unsigned int)vaddr, ptep); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - __free_reserved_page(page); - } - } - vaddr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - /* flush tlb */ - flush_tlb_all(); + return dma_atomic_pool_init(GFP_KERNEL, pgprot_noncached(PAGE_KERNEL)); } +postcore_initcall(atomic_pool_init); #endif /* CONFIG_MMU */ From 539005ffc6260fcb3bb4171138be5f66a41185a9 Mon Sep 17 00:00:00 2001 From: Lori Hikichi Date: Thu, 8 Aug 2019 09:07:53 +0530 Subject: [PATCH 0280/2880] i2c: iproc: Add full name of devicetree node to adapter name Add the full name of the devicetree node to the adapter name. Without this change, all adapters have the same name making it difficult to distinguish between multiple instances. The most obvious way to see this is to use the utility i2c_detect. e.g. "i2c-detect -l" Before i2c-1 i2c Broadcom iProc I2C adapter I2C adapter i2c-0 i2c Broadcom iProc I2C adapter I2C adapter After i2c-1 i2c Broadcom iProc (i2c@e0000) I2C adapter i2c-0 i2c Broadcom iProc (i2c@b0000) I2C adapter Now it is easy to figure out which adapter maps to a which DT node. Signed-off-by: Lori Hikichi Signed-off-by: Rayagonda Kokatanur Reviewed-by: Ray Jui Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm-iproc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c index 2c7f145a036e..ee5578173fb1 100644 --- a/drivers/i2c/busses/i2c-bcm-iproc.c +++ b/drivers/i2c/busses/i2c-bcm-iproc.c @@ -917,7 +917,9 @@ static int bcm_iproc_i2c_probe(struct platform_device *pdev) adap = &iproc_i2c->adapter; i2c_set_adapdata(adap, iproc_i2c); - strlcpy(adap->name, "Broadcom iProc I2C adapter", sizeof(adap->name)); + snprintf(adap->name, sizeof(adap->name), + "Broadcom iProc (%s)", + of_node_full_name(iproc_i2c->device->of_node)); adap->algo = &bcm_iproc_algo; adap->quirks = &bcm_iproc_i2c_quirks; adap->dev.parent = &pdev->dev; From 67a53081e655d41a77f510377364600e5e9bf89c Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Thu, 15 Aug 2019 11:25:50 +0530 Subject: [PATCH 0281/2880] i2c: iproc: Make bcm_iproc_i2c_quirks constant Static structure bcm_iproc_i2c_quirks, of type i2c_adapter_quirks, is only used when being assigned to constant field quirks of a variable having type i2c_adapter. Hence make bcm_iproc_i2c_quirks constant as well to prevent it from unintended modification. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Reviewed-by: Ray Jui Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm-iproc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c index ee5578173fb1..24269a5e3af6 100644 --- a/drivers/i2c/busses/i2c-bcm-iproc.c +++ b/drivers/i2c/busses/i2c-bcm-iproc.c @@ -803,7 +803,7 @@ static struct i2c_algorithm bcm_iproc_algo = { .unreg_slave = bcm_iproc_i2c_unreg_slave, }; -static struct i2c_adapter_quirks bcm_iproc_i2c_quirks = { +static const struct i2c_adapter_quirks bcm_iproc_i2c_quirks = { .max_read_len = M_RX_MAX_READ_LEN, }; From d1bbf38aaf882240cccca414e5a48db42e105da7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 30 Jul 2019 08:04:00 -0500 Subject: [PATCH 0282/2880] PCI: Fix typos and whitespace errors Fix typos in drivers/pci. Comment and whitespace changes only. Link: https://lore.kernel.org/r/20190819115306.27338-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring Acked-by: Thomas Petazzoni # armada8k --- Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt | 2 +- Documentation/devicetree/bindings/pci/pci-armada8k.txt | 2 +- drivers/pci/Kconfig | 2 +- drivers/pci/vc.c | 1 - include/linux/pci.h | 4 ++-- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt index a7f5f5afa0e6..de4b2baf91e8 100644 --- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt +++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt @@ -50,7 +50,7 @@ Additional required properties for imx7d-pcie and imx8mq-pcie: - power-domains: Must be set to a phandle pointing to PCIE_PHY power domain - resets: Must contain phandles to PCIe-related reset lines exposed by SRC IP block -- reset-names: Must contain the following entires: +- reset-names: Must contain the following entries: - "pciephy" - "apps" - "turnoff" diff --git a/Documentation/devicetree/bindings/pci/pci-armada8k.txt b/Documentation/devicetree/bindings/pci/pci-armada8k.txt index 9e3fc15e1af8..1aaa09254001 100644 --- a/Documentation/devicetree/bindings/pci/pci-armada8k.txt +++ b/Documentation/devicetree/bindings/pci/pci-armada8k.txt @@ -11,7 +11,7 @@ Required properties: - reg-names: - "ctrl" for the control register region - "config" for the config space region -- interrupts: Interrupt specifier for the PCIe controler +- interrupts: Interrupt specifier for the PCIe controller - clocks: reference to the PCIe controller clocks - clock-names: mandatory if there is a second clock, in this case the name must be "core" for the first clock and "reg" for the second diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2ab92409210a..46f4912a370d 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -170,7 +170,7 @@ config PCI_P2PDMA Many PCIe root complexes do not support P2P transactions and it's hard to tell which support it at all, so at this time, - P2P DMA transations must be between devices behind the same root + P2P DMA transactions must be between devices behind the same root port. If unsure, say N. diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c index 5acd9c02683a..b39e854b80ec 100644 --- a/drivers/pci/vc.c +++ b/drivers/pci/vc.c @@ -409,7 +409,6 @@ void pci_restore_vc_state(struct pci_dev *dev) * For each type of VC capability, VC/VC9/MFVC, find the capability, size * it, and allocate a buffer for save/restore. */ - void pci_allocate_vc_save_buffers(struct pci_dev *dev) { int i; diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..c133579c01fa 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -969,7 +969,7 @@ resource_size_t pcibios_align_resource(void *, const struct resource *, resource_size_t, resource_size_t); -/* Weak but can be overriden by arch */ +/* Weak but can be overridden by arch */ void pci_fixup_cardbus(struct pci_bus *); /* Generic PCI functions used internally */ @@ -1801,7 +1801,7 @@ static inline int pci_ats_page_aligned(struct pci_dev *dev) { return 0; } #include -/* These two functions provide almost identical functionality. Depennding +/* These two functions provide almost identical functionality. Depending * on the architecture, one will be implemented as a wrapper around the * other (in drivers/pci/mmap.c). * From b071c1fd7a8ad041759d9a9e87d1b4333417e36c Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Wed, 7 Aug 2019 15:20:49 +0200 Subject: [PATCH 0283/2880] PCI: OF: Correct of_irq_parse_pci() documentation 530210c7814e ("of/irq: Replace of_irq with of_phandle_args") changed the of_irq_parse_pci() parameter type but didn't change the corresponding documentation. Update the function doc to match. Link: https://lore.kernel.org/r/20190807132049.10304-1-lkundrak@v3.sk Signed-off-by: Lubomir Rintel Signed-off-by: Bjorn Helgaas --- drivers/pci/of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/of.c b/drivers/pci/of.c index bc7b27a28795..36891e7deee3 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL_GPL(devm_of_pci_get_host_bridge_resources); /** * of_irq_parse_pci - Resolve the interrupt for a PCI device * @pdev: the device whose interrupt is to be resolved - * @out_irq: structure of_irq filled by this function + * @out_irq: structure of_phandle_args filled by this function * * This function resolves the PCI interrupt for a given PCI device. If a * device-node exists for a given pci_dev, it will use normal OF tree From 55507aea58824578610eb0cb5c250a0c997987c9 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Mon, 26 Aug 2019 00:10:39 +0200 Subject: [PATCH 0284/2880] PCI: Remove unnecessary returns Remove unnecessary "return" statements at the end of void functions. No functional change intended. Link: https://lore.kernel.org/r/20190825221039.6977-1-kw@linux.com Link: https://lore.kernel.org/r/20190826095143.21353-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-mediatek.c | 2 -- drivers/pci/hotplug/cpci_hotplug_core.c | 1 - drivers/pci/hotplug/cpqphp_core.c | 1 - drivers/pci/hotplug/cpqphp_ctrl.c | 4 ---- drivers/pci/hotplug/cpqphp_nvram.h | 5 +---- drivers/pci/hotplug/rpadlpar_core.c | 1 - drivers/pci/hotplug/rpaphp_core.c | 1 - 7 files changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index 80601e1b939e..b92ce6ef718e 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -630,8 +630,6 @@ static void mtk_pcie_intr_handler(struct irq_desc *desc) } chained_irq_exit(irqchip, desc); - - return; } static int mtk_pcie_setup_irq(struct mtk_pcie_port *port, diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c index 603eadf3d965..d0559d2faf50 100644 --- a/drivers/pci/hotplug/cpci_hotplug_core.c +++ b/drivers/pci/hotplug/cpci_hotplug_core.c @@ -563,7 +563,6 @@ cleanup_slots(void) } cleanup_null: up_write(&list_rwsem); - return; } int diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c index 16bbb183695a..b8aacb41a83c 100644 --- a/drivers/pci/hotplug/cpqphp_core.c +++ b/drivers/pci/hotplug/cpqphp_core.c @@ -173,7 +173,6 @@ static void pci_print_IRQ_route(void) dbg("%d %d %d %d\n", tbus, tdevice >> 3, tdevice & 0x7, tslot); } - return; } diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c index b7f4e1f099d9..68de958a9be8 100644 --- a/drivers/pci/hotplug/cpqphp_ctrl.c +++ b/drivers/pci/hotplug/cpqphp_ctrl.c @@ -1872,8 +1872,6 @@ static void interrupt_event_handler(struct controller *ctrl) } } /* End of FOR loop */ } - - return; } @@ -1943,8 +1941,6 @@ void cpqhp_pushbutton_thread(struct timer_list *t) p_slot->state = STATIC_STATE; } - - return; } diff --git a/drivers/pci/hotplug/cpqphp_nvram.h b/drivers/pci/hotplug/cpqphp_nvram.h index 918ff8dbfe62..70e879b6a23f 100644 --- a/drivers/pci/hotplug/cpqphp_nvram.h +++ b/drivers/pci/hotplug/cpqphp_nvram.h @@ -16,10 +16,7 @@ #ifndef CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM -static inline void compaq_nvram_init(void __iomem *rom_start) -{ - return; -} +static inline void compaq_nvram_init(void __iomem *rom_start) { } static inline int compaq_nvram_load(void __iomem *rom_start, struct controller *ctrl) { diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c index 182f9e3443ee..977946e4e613 100644 --- a/drivers/pci/hotplug/rpadlpar_core.c +++ b/drivers/pci/hotplug/rpadlpar_core.c @@ -473,7 +473,6 @@ int __init rpadlpar_io_init(void) void rpadlpar_io_exit(void) { dlpar_sysfs_exit(); - return; } module_init(rpadlpar_io_init); diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index bcd5d357ca23..d34a3baf9b3a 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -412,7 +412,6 @@ static void __exit cleanup_slots(void) pci_hp_deregister(&slot->hotplug_slot); dealloc_slot_struct(slot); } - return; } static int __init rpaphp_init(void) From 1fb027d7596464d3fad3ed59f70f43807ef926c6 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 8 Jul 2019 17:03:31 +0000 Subject: [PATCH 0285/2880] fuse: require /dev/fuse reads to have enough buffer capacity (take 2) [ This retries commit d4b13963f217 ("fuse: require /dev/fuse reads to have enough buffer capacity"), which was reverted. In this version we require only `sizeof(fuse_in_header) + sizeof(fuse_write_in)` instead of 4K for FUSE request header room, because, contrary to libfuse and kernel client behaviour, GlusterFS actually provides only so much room for request header. ] A FUSE filesystem server queues /dev/fuse sys_read calls to get filesystem requests to handle. It does not know in advance what would be that request as it can be anything that client issues - LOOKUP, READ, WRITE, ... Many requests are short and retrieve data from the filesystem. However WRITE and NOTIFY_REPLY write data into filesystem. Before getting into operation phase, FUSE filesystem server and kernel client negotiate what should be the maximum write size the client will ever issue. After negotiation the contract in between server/client is that the filesystem server then should queue /dev/fuse sys_read calls with enough buffer capacity to receive any client request - WRITE in particular, while FUSE client should not, in particular, send WRITE requests with > negotiated max_write payload. FUSE client in kernel and libfuse historically reserve 4K for request header. However an existing filesystem server - GlusterFS - was found which reserves only 80 bytes for header room (= `sizeof(fuse_in_header) + sizeof(fuse_write_in)`). Since `sizeof(fuse_in_header) + sizeof(fuse_write_in)` == `sizeof(fuse_in_header) + sizeof(fuse_read_in)` == `sizeof(fuse_in_header) + sizeof(fuse_notify_retrieve_in)` is the absolute minimum any sane filesystem should be using for header room, the contract is that filesystem server should queue sys_reads with `sizeof(fuse_in_header) + sizeof(fuse_write_in)` + max_write buffer. If the filesystem server does not follow this contract, what can happen is that fuse_dev_do_read will see that request size is > buffer size, and then it will return EIO to client who issued the request but won't indicate in any way that there is a problem to filesystem server. This can be hard to diagnose because for some requests, e.g. for NOTIFY_REPLY which mimics WRITE, there is no client thread that is waiting for request completion and that EIO goes nowhere, while on filesystem server side things look like the kernel is not replying back after successful NOTIFY_RETRIEVE request made by the server. We can make the problem easy to diagnose if we indicate via error return to filesystem server when it is violating the contract. This should not practically cause problems because if a filesystem server is using shorter buffer, writes to it were already very likely to cause EIO, and if the filesystem is read-only it should be too following FUSE_MIN_READ_BUFFER minimum buffer size. Please see [1] for context where the problem of stuck filesystem was hit for real (because kernel client was incorrectly sending more than max_write data with NOTIFY_REPLY; see also previous patch), how the situation was traced and for more involving patch that did not make it into the tree. [1] https://marc.info/?l=linux-fsdevel&m=155057023600853&w=2 Signed-off-by: Kirill Smelkov Tested-by: Sander Eikelenboom Cc: Han-Wen Nienhuys Cc: Jakob Unterwurzacher Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ea8237513dfa..fdb85895737b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1317,6 +1317,24 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, unsigned reqsize; unsigned int hash; + /* + * Require sane minimum read buffer - that has capacity for fixed part + * of any request header + negotiated max_write room for data. + * + * Historically libfuse reserves 4K for fixed header room, but e.g. + * GlusterFS reserves only 80 bytes + * + * = `sizeof(fuse_in_header) + sizeof(fuse_write_in)` + * + * which is the absolute minimum any sane filesystem should be using + * for header room. + */ + if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER, + sizeof(struct fuse_in_header) + + sizeof(struct fuse_write_in) + + fc->max_write)) + return -EINVAL; + restart: spin_lock(&fiq->waitq.lock); err = -EAGAIN; From 17b2cbe294922ec2b78e29b93ed602b56cd75a4e Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 22 Jul 2019 10:17:17 +0300 Subject: [PATCH 0286/2880] fuse: cleanup fuse_wait_on_page_writeback fuse_wait_on_page_writeback() always returns zero and nobody cares. Let's make it void. Signed-off-by: Maxim Patlasov Signed-off-by: Vasily Averin Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ae2828beb00..e076c2cf65b0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -383,12 +383,11 @@ static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) * Since fuse doesn't rely on the VM writeback tracking, this has to * use some other means. */ -static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) { struct fuse_inode *fi = get_fuse_inode(inode); wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); - return 0; } /* From 56d250ef9650edce600c96e2f918b9b9bafda85e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 29 Aug 2019 11:01:18 +0200 Subject: [PATCH 0287/2880] cuse: fix broken release The inode parameter in cuse_release() is likely *not* a fuse inode. It's a small wonder it didn't blow up until now. Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index bab7a0db81dd..d011a1ad1d4f 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -142,11 +142,10 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_sync_release(fi, ff, file->f_flags); + fuse_sync_release(NULL, ff, file->f_flags); fuse_conn_put(fc); return 0; From b65dc4f6b339ff57321fd95f2f7b6197a3c24ba4 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Tue, 13 Aug 2019 18:31:33 +0800 Subject: [PATCH 0288/2880] mfd: ezx-pcap: Replace mutex_lock with spin_lock As mutex_lock might sleep. Function pcap_adc_irq is an interrupt handler. The use of mutex_lock in pcap_adc_irq may cause sleep in IRQ context. Replace mutex_lock with spin_lock to avoid this. Signed-off-by: Fuqian Huang Signed-off-by: Lee Jones --- drivers/mfd/ezx-pcap.c | 53 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c index f505e3e1274b..70fa18b04ad2 100644 --- a/drivers/mfd/ezx-pcap.c +++ b/drivers/mfd/ezx-pcap.c @@ -35,7 +35,7 @@ struct pcap_chip { /* IO */ u32 buf; - struct mutex io_mutex; + spinlock_t io_lock; /* IRQ */ unsigned int irq_base; @@ -48,7 +48,7 @@ struct pcap_chip { struct pcap_adc_request *adc_queue[PCAP_ADC_MAXQ]; u8 adc_head; u8 adc_tail; - struct mutex adc_mutex; + spinlock_t adc_lock; }; /* IO */ @@ -76,14 +76,15 @@ static int ezx_pcap_putget(struct pcap_chip *pcap, u32 *data) int ezx_pcap_write(struct pcap_chip *pcap, u8 reg_num, u32 value) { + unsigned long flags; int ret; - mutex_lock(&pcap->io_mutex); + spin_lock_irqsave(&pcap->io_lock, flags); value &= PCAP_REGISTER_VALUE_MASK; value |= PCAP_REGISTER_WRITE_OP_BIT | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); ret = ezx_pcap_putget(pcap, &value); - mutex_unlock(&pcap->io_mutex); + spin_unlock_irqrestore(&pcap->io_lock, flags); return ret; } @@ -91,14 +92,15 @@ EXPORT_SYMBOL_GPL(ezx_pcap_write); int ezx_pcap_read(struct pcap_chip *pcap, u8 reg_num, u32 *value) { + unsigned long flags; int ret; - mutex_lock(&pcap->io_mutex); + spin_lock_irqsave(&pcap->io_lock, flags); *value = PCAP_REGISTER_READ_OP_BIT | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); ret = ezx_pcap_putget(pcap, value); - mutex_unlock(&pcap->io_mutex); + spin_unlock_irqrestore(&pcap->io_lock, flags); return ret; } @@ -106,11 +108,12 @@ EXPORT_SYMBOL_GPL(ezx_pcap_read); int ezx_pcap_set_bits(struct pcap_chip *pcap, u8 reg_num, u32 mask, u32 val) { + unsigned long flags; int ret; u32 tmp = PCAP_REGISTER_READ_OP_BIT | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); - mutex_lock(&pcap->io_mutex); + spin_lock_irqsave(&pcap->io_lock, flags); ret = ezx_pcap_putget(pcap, &tmp); if (ret) goto out_unlock; @@ -121,7 +124,7 @@ int ezx_pcap_set_bits(struct pcap_chip *pcap, u8 reg_num, u32 mask, u32 val) ret = ezx_pcap_putget(pcap, &tmp); out_unlock: - mutex_unlock(&pcap->io_mutex); + spin_unlock_irqrestore(&pcap->io_lock, flags); return ret; } @@ -212,14 +215,15 @@ static void pcap_irq_handler(struct irq_desc *desc) /* ADC */ void pcap_set_ts_bits(struct pcap_chip *pcap, u32 bits) { + unsigned long flags; u32 tmp; - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, flags); ezx_pcap_read(pcap, PCAP_REG_ADC, &tmp); tmp &= ~(PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR); tmp |= bits & (PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR); ezx_pcap_write(pcap, PCAP_REG_ADC, tmp); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); } EXPORT_SYMBOL_GPL(pcap_set_ts_bits); @@ -234,15 +238,16 @@ static void pcap_disable_adc(struct pcap_chip *pcap) static void pcap_adc_trigger(struct pcap_chip *pcap) { + unsigned long flags; u32 tmp; u8 head; - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, flags); head = pcap->adc_head; if (!pcap->adc_queue[head]) { /* queue is empty, save power */ pcap_disable_adc(pcap); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); return; } /* start conversion on requested bank, save TS_M bits */ @@ -254,7 +259,7 @@ static void pcap_adc_trigger(struct pcap_chip *pcap) tmp |= PCAP_ADC_AD_SEL1; ezx_pcap_write(pcap, PCAP_REG_ADC, tmp); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); ezx_pcap_write(pcap, PCAP_REG_ADR, PCAP_ADR_ASC); } @@ -265,11 +270,11 @@ static irqreturn_t pcap_adc_irq(int irq, void *_pcap) u16 res[2]; u32 tmp; - mutex_lock(&pcap->adc_mutex); + spin_lock(&pcap->adc_lock); req = pcap->adc_queue[pcap->adc_head]; if (WARN(!req, "adc irq without pending request\n")) { - mutex_unlock(&pcap->adc_mutex); + spin_unlock(&pcap->adc_lock); return IRQ_HANDLED; } @@ -285,7 +290,7 @@ static irqreturn_t pcap_adc_irq(int irq, void *_pcap) pcap->adc_queue[pcap->adc_head] = NULL; pcap->adc_head = (pcap->adc_head + 1) & (PCAP_ADC_MAXQ - 1); - mutex_unlock(&pcap->adc_mutex); + spin_unlock(&pcap->adc_lock); /* pass the results and release memory */ req->callback(req->data, res); @@ -301,6 +306,7 @@ int pcap_adc_async(struct pcap_chip *pcap, u8 bank, u32 flags, u8 ch[], void *callback, void *data) { struct pcap_adc_request *req; + unsigned long irq_flags; /* This will be freed after we have a result */ req = kmalloc(sizeof(struct pcap_adc_request), GFP_KERNEL); @@ -314,15 +320,15 @@ int pcap_adc_async(struct pcap_chip *pcap, u8 bank, u32 flags, u8 ch[], req->callback = callback; req->data = data; - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, irq_flags); if (pcap->adc_queue[pcap->adc_tail]) { - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, irq_flags); kfree(req); return -EBUSY; } pcap->adc_queue[pcap->adc_tail] = req; pcap->adc_tail = (pcap->adc_tail + 1) & (PCAP_ADC_MAXQ - 1); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, irq_flags); /* start conversion */ pcap_adc_trigger(pcap); @@ -389,16 +395,17 @@ static int pcap_add_subdev(struct pcap_chip *pcap, static int ezx_pcap_remove(struct spi_device *spi) { struct pcap_chip *pcap = spi_get_drvdata(spi); + unsigned long flags; int i; /* remove all registered subdevs */ device_for_each_child(&spi->dev, NULL, pcap_remove_subdev); /* cleanup ADC */ - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, flags); for (i = 0; i < PCAP_ADC_MAXQ; i++) kfree(pcap->adc_queue[i]); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); /* cleanup irqchip */ for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++) @@ -426,8 +433,8 @@ static int ezx_pcap_probe(struct spi_device *spi) goto ret; } - mutex_init(&pcap->io_mutex); - mutex_init(&pcap->adc_mutex); + spin_lock_init(&pcap->io_lock); + spin_lock_init(&pcap->adc_lock); INIT_WORK(&pcap->isr_work, pcap_isr_work); INIT_WORK(&pcap->msr_work, pcap_msr_work); spi_set_drvdata(spi, pcap); From b9a801dfa59163dc2db8147a98af406eb79e51de Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Aug 2019 22:03:35 +0300 Subject: [PATCH 0289/2880] mfd: Add support for Merrifield Basin Cove PMIC Add an MFD driver for Intel Merrifield Basin Cove PMIC. Firmware on the platforms which are using Basin Cove PMIC is "smarter" than on the rest supported by vanilla kernel. It handles first level of interrupt itself, while others do it on OS level. The driver is done in the same way as the rest of Intel PMIC MFD drivers in the kernel to support the initial design. The design allows to use one driver among few PMICs without knowing implementation details of the each hardware version or generation. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 11 ++ drivers/mfd/Makefile | 1 + drivers/mfd/intel_soc_pmic_mrfld.c | 157 +++++++++++++++++++++++ include/linux/mfd/intel_soc_pmic_mrfld.h | 81 ++++++++++++ 4 files changed, 250 insertions(+) create mode 100644 drivers/mfd/intel_soc_pmic_mrfld.c create mode 100644 include/linux/mfd/intel_soc_pmic_mrfld.h diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 4a07afe50b35..a6854d41d1ca 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -597,6 +597,17 @@ config INTEL_SOC_PMIC_CHTDC_TI Select this option for supporting Dollar Cove (TI version) PMIC device that is found on some Intel Cherry Trail systems. +config INTEL_SOC_PMIC_MRFLD + tristate "Support for Intel Merrifield Basin Cove PMIC" + depends on GPIOLIB + depends on ACPI + depends on INTEL_SCU_IPC + select MFD_CORE + select REGMAP_IRQ + help + Select this option for supporting Basin Cove PMIC device + that is found on Intel Merrifield systems. + config MFD_INTEL_LPSS tristate select COMMON_CLK diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 7b6a6aa4fe42..3ae6fa6552d3 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -242,6 +242,7 @@ obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC) += intel_soc_pmic_chtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI) += intel_soc_pmic_chtdc_ti.o mt6397-objs := mt6397-core.o mt6397-irq.o obj-$(CONFIG_MFD_MT6397) += mt6397.o +obj-$(CONFIG_INTEL_SOC_PMIC_MRFLD) += intel_soc_pmic_mrfld.o obj-$(CONFIG_MFD_ALTERA_A10SR) += altera-a10sr.o obj-$(CONFIG_MFD_ALTERA_SYSMGR) += altera-sysmgr.o diff --git a/drivers/mfd/intel_soc_pmic_mrfld.c b/drivers/mfd/intel_soc_pmic_mrfld.c new file mode 100644 index 000000000000..26a1551c5faf --- /dev/null +++ b/drivers/mfd/intel_soc_pmic_mrfld.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device access for Basin Cove PMIC + * + * Copyright (c) 2019, Intel Corporation. + * Author: Andy Shevchenko + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Level 2 IRQs + * + * Firmware on the systems with Basin Cove PMIC services Level 1 IRQs + * without an assistance. Thus, each of the Level 1 IRQ is represented + * as a separate RTE in IOAPIC. + */ +static struct resource irq_level2_resources[] = { + DEFINE_RES_IRQ(0), /* power button */ + DEFINE_RES_IRQ(0), /* TMU */ + DEFINE_RES_IRQ(0), /* thermal */ + DEFINE_RES_IRQ(0), /* BCU */ + DEFINE_RES_IRQ(0), /* ADC */ + DEFINE_RES_IRQ(0), /* charger */ + DEFINE_RES_IRQ(0), /* GPIO */ +}; + +static const struct mfd_cell bcove_dev[] = { + { + .name = "mrfld_bcove_pwrbtn", + .num_resources = 1, + .resources = &irq_level2_resources[0], + }, { + .name = "mrfld_bcove_tmu", + .num_resources = 1, + .resources = &irq_level2_resources[1], + }, { + .name = "mrfld_bcove_thermal", + .num_resources = 1, + .resources = &irq_level2_resources[2], + }, { + .name = "mrfld_bcove_bcu", + .num_resources = 1, + .resources = &irq_level2_resources[3], + }, { + .name = "mrfld_bcove_adc", + .num_resources = 1, + .resources = &irq_level2_resources[4], + }, { + .name = "mrfld_bcove_charger", + .num_resources = 1, + .resources = &irq_level2_resources[5], + }, { + .name = "mrfld_bcove_pwrsrc", + .num_resources = 1, + .resources = &irq_level2_resources[5], + }, { + .name = "mrfld_bcove_gpio", + .num_resources = 1, + .resources = &irq_level2_resources[6], + }, + { .name = "mrfld_bcove_region", }, +}; + +static int bcove_ipc_byte_reg_read(void *context, unsigned int reg, + unsigned int *val) +{ + u8 ipc_out; + int ret; + + ret = intel_scu_ipc_ioread8(reg, &ipc_out); + if (ret) + return ret; + + *val = ipc_out; + return 0; +} + +static int bcove_ipc_byte_reg_write(void *context, unsigned int reg, + unsigned int val) +{ + u8 ipc_in = val; + int ret; + + ret = intel_scu_ipc_iowrite8(reg, ipc_in); + if (ret) + return ret; + + return 0; +} + +static const struct regmap_config bcove_regmap_config = { + .reg_bits = 16, + .val_bits = 8, + .max_register = 0xff, + .reg_write = bcove_ipc_byte_reg_write, + .reg_read = bcove_ipc_byte_reg_read, +}; + +static int bcove_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct intel_soc_pmic *pmic; + unsigned int i; + int ret; + + pmic = devm_kzalloc(dev, sizeof(*pmic), GFP_KERNEL); + if (!pmic) + return -ENOMEM; + + platform_set_drvdata(pdev, pmic); + pmic->dev = &pdev->dev; + + pmic->regmap = devm_regmap_init(dev, NULL, pmic, &bcove_regmap_config); + if (IS_ERR(pmic->regmap)) + return PTR_ERR(pmic->regmap); + + for (i = 0; i < ARRAY_SIZE(irq_level2_resources); i++) { + ret = platform_get_irq(pdev, i); + if (ret < 0) + return ret; + + irq_level2_resources[i].start = ret; + irq_level2_resources[i].end = ret; + } + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, + bcove_dev, ARRAY_SIZE(bcove_dev), + NULL, 0, NULL); +} + +static const struct acpi_device_id bcove_acpi_ids[] = { + { "INTC100E" }, + {} +}; +MODULE_DEVICE_TABLE(acpi, bcove_acpi_ids); + +static struct platform_driver bcove_driver = { + .driver = { + .name = "intel_soc_pmic_mrfld", + .acpi_match_table = bcove_acpi_ids, + }, + .probe = bcove_probe, +}; +module_platform_driver(bcove_driver); + +MODULE_DESCRIPTION("IPC driver for Intel SoC Basin Cove PMIC"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/intel_soc_pmic_mrfld.h b/include/linux/mfd/intel_soc_pmic_mrfld.h new file mode 100644 index 000000000000..4daecd682275 --- /dev/null +++ b/include/linux/mfd/intel_soc_pmic_mrfld.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Header file for Intel Merrifield Basin Cove PMIC + * + * Copyright (C) 2019 Intel Corporation. All rights reserved. + */ + +#ifndef __INTEL_SOC_PMIC_MRFLD_H__ +#define __INTEL_SOC_PMIC_MRFLD_H__ + +#include + +#define BCOVE_ID 0x00 + +#define BCOVE_ID_MINREV0 GENMASK(2, 0) +#define BCOVE_ID_MAJREV0 GENMASK(5, 3) +#define BCOVE_ID_VENDID0 GENMASK(7, 6) + +#define BCOVE_MINOR(x) (unsigned int)(((x) & BCOVE_ID_MINREV0) >> 0) +#define BCOVE_MAJOR(x) (unsigned int)(((x) & BCOVE_ID_MAJREV0) >> 3) +#define BCOVE_VENDOR(x) (unsigned int)(((x) & BCOVE_ID_VENDID0) >> 6) + +#define BCOVE_IRQLVL1 0x01 + +#define BCOVE_PBIRQ 0x02 +#define BCOVE_TMUIRQ 0x03 +#define BCOVE_THRMIRQ 0x04 +#define BCOVE_BCUIRQ 0x05 +#define BCOVE_ADCIRQ 0x06 +#define BCOVE_CHGRIRQ0 0x07 +#define BCOVE_CHGRIRQ1 0x08 +#define BCOVE_GPIOIRQ 0x09 +#define BCOVE_CRITIRQ 0x0B + +#define BCOVE_MIRQLVL1 0x0C + +#define BCOVE_MPBIRQ 0x0D +#define BCOVE_MTMUIRQ 0x0E +#define BCOVE_MTHRMIRQ 0x0F +#define BCOVE_MBCUIRQ 0x10 +#define BCOVE_MADCIRQ 0x11 +#define BCOVE_MCHGRIRQ0 0x12 +#define BCOVE_MCHGRIRQ1 0x13 +#define BCOVE_MGPIOIRQ 0x14 +#define BCOVE_MCRITIRQ 0x16 + +#define BCOVE_SCHGRIRQ0 0x4E +#define BCOVE_SCHGRIRQ1 0x4F + +/* Level 1 IRQs */ +#define BCOVE_LVL1_PWRBTN BIT(0) /* power button */ +#define BCOVE_LVL1_TMU BIT(1) /* time management unit */ +#define BCOVE_LVL1_THRM BIT(2) /* thermal */ +#define BCOVE_LVL1_BCU BIT(3) /* burst control unit */ +#define BCOVE_LVL1_ADC BIT(4) /* ADC */ +#define BCOVE_LVL1_CHGR BIT(5) /* charger */ +#define BCOVE_LVL1_GPIO BIT(6) /* GPIO */ +#define BCOVE_LVL1_CRIT BIT(7) /* critical event */ + +/* Level 2 IRQs: power button */ +#define BCOVE_PBIRQ_PBTN BIT(0) +#define BCOVE_PBIRQ_UBTN BIT(1) + +/* Level 2 IRQs: ADC */ +#define BCOVE_ADCIRQ_BATTEMP BIT(2) +#define BCOVE_ADCIRQ_SYSTEMP BIT(3) +#define BCOVE_ADCIRQ_BATTID BIT(4) +#define BCOVE_ADCIRQ_VIBATT BIT(5) +#define BCOVE_ADCIRQ_CCTICK BIT(7) + +/* Level 2 IRQs: charger */ +#define BCOVE_CHGRIRQ_BAT0ALRT BIT(4) +#define BCOVE_CHGRIRQ_BAT1ALRT BIT(5) +#define BCOVE_CHGRIRQ_BATCRIT BIT(6) + +#define BCOVE_CHGRIRQ_VBUSDET BIT(0) +#define BCOVE_CHGRIRQ_DCDET BIT(1) +#define BCOVE_CHGRIRQ_BATTDET BIT(2) +#define BCOVE_CHGRIRQ_USBIDDET BIT(3) + +#endif /* __INTEL_SOC_PMIC_MRFLD_H__ */ From cbd1c5c4d443c4a83c9c7eecc0561b9aeafddf50 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 16 Aug 2019 20:33:42 +0300 Subject: [PATCH 0290/2880] mfd: intel-lpss: Consistently use GENMASK() Since we already are using BIT() macro, use GENMASK() as well for sake of consistency. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index 277f48f1cc1c..3e16a1765142 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -47,10 +47,10 @@ #define LPSS_PRIV_IDLELTR 0x14 #define LPSS_PRIV_LTR_REQ BIT(15) -#define LPSS_PRIV_LTR_SCALE_MASK 0xc00 -#define LPSS_PRIV_LTR_SCALE_1US 0x800 -#define LPSS_PRIV_LTR_SCALE_32US 0xc00 -#define LPSS_PRIV_LTR_VALUE_MASK 0x3ff +#define LPSS_PRIV_LTR_SCALE_MASK GENMASK(11, 10) +#define LPSS_PRIV_LTR_SCALE_1US (2 << 10) +#define LPSS_PRIV_LTR_SCALE_32US (3 << 10) +#define LPSS_PRIV_LTR_VALUE_MASK GENMASK(9, 0) #define LPSS_PRIV_SSP_REG 0x20 #define LPSS_PRIV_SSP_REG_DIS_DMA_FIN BIT(0) @@ -59,8 +59,8 @@ #define LPSS_PRIV_CAPS 0xfc #define LPSS_PRIV_CAPS_NO_IDMA BIT(8) +#define LPSS_PRIV_CAPS_TYPE_MASK GENMASK(7, 4) #define LPSS_PRIV_CAPS_TYPE_SHIFT 4 -#define LPSS_PRIV_CAPS_TYPE_MASK (0xf << LPSS_PRIV_CAPS_TYPE_SHIFT) /* This matches the type field in CAPS register */ enum intel_lpss_dev_type { From c5b90cb26e83ad89cb18e5ec97a992f02d8f750d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 16 Aug 2019 20:56:02 +0300 Subject: [PATCH 0291/2880] mfd: intel-lpss: Add Intel Skylake ACPI IDs Some of the laptops, like ASUS U306UA, may expose LPSS devices via ACPI. Add their IDs to the list. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-acpi.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/mfd/intel-lpss-acpi.c b/drivers/mfd/intel-lpss-acpi.c index 61ffb8b393e4..c8fe334b5fe8 100644 --- a/drivers/mfd/intel-lpss-acpi.c +++ b/drivers/mfd/intel-lpss-acpi.c @@ -18,6 +18,10 @@ #include "intel-lpss.h" +static const struct intel_lpss_platform_info spt_info = { + .clk_rate = 120000000, +}; + static struct property_entry spt_i2c_properties[] = { PROPERTY_ENTRY_U32("i2c-sda-hold-time-ns", 230), { }, @@ -28,6 +32,19 @@ static const struct intel_lpss_platform_info spt_i2c_info = { .properties = spt_i2c_properties, }; +static struct property_entry uart_properties[] = { + PROPERTY_ENTRY_U32("reg-io-width", 4), + PROPERTY_ENTRY_U32("reg-shift", 2), + PROPERTY_ENTRY_BOOL("snps,uart-16550-compatible"), + { }, +}; + +static const struct intel_lpss_platform_info spt_uart_info = { + .clk_rate = 120000000, + .clk_con_id = "baudclk", + .properties = uart_properties, +}; + static const struct intel_lpss_platform_info bxt_info = { .clk_rate = 100000000, }; @@ -58,8 +75,17 @@ static const struct intel_lpss_platform_info apl_i2c_info = { static const struct acpi_device_id intel_lpss_acpi_ids[] = { /* SPT */ + { "INT3440", (kernel_ulong_t)&spt_info }, + { "INT3441", (kernel_ulong_t)&spt_info }, + { "INT3442", (kernel_ulong_t)&spt_i2c_info }, + { "INT3443", (kernel_ulong_t)&spt_i2c_info }, + { "INT3444", (kernel_ulong_t)&spt_i2c_info }, + { "INT3445", (kernel_ulong_t)&spt_i2c_info }, { "INT3446", (kernel_ulong_t)&spt_i2c_info }, { "INT3447", (kernel_ulong_t)&spt_i2c_info }, + { "INT3448", (kernel_ulong_t)&spt_uart_info }, + { "INT3449", (kernel_ulong_t)&spt_uart_info }, + { "INT344A", (kernel_ulong_t)&spt_uart_info }, /* BXT */ { "80860AAC", (kernel_ulong_t)&bxt_i2c_info }, { "80860ABC", (kernel_ulong_t)&bxt_info }, From f68c0a873ef28637a201ec37e1bafdf040813454 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 20 Aug 2019 12:37:15 +0200 Subject: [PATCH 0292/2880] mfd: sm501: Include the GPIO driver header This driver creates a gpio chip so it needs to include the appropriate header explicitly rather than implicitly. Signed-off-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/sm501.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c index d5e34a5eb250..154270f8d8d7 100644 --- a/drivers/mfd/sm501.c +++ b/drivers/mfd/sm501.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include From 1094422253db01ac3a12bde010c75e5135021d2d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 20 Aug 2019 17:34:43 +0200 Subject: [PATCH 0293/2880] mfd: htc-i2cpld: Drop check because i2c_unregister_device() is NULL safe No need to check the argument of i2c_unregister_device() because the function itself does it. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/htc-i2cpld.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mfd/htc-i2cpld.c b/drivers/mfd/htc-i2cpld.c index 370519af5d0b..8ad6768bd7a2 100644 --- a/drivers/mfd/htc-i2cpld.c +++ b/drivers/mfd/htc-i2cpld.c @@ -385,8 +385,7 @@ static void htcpld_unregister_chip_i2c( htcpld = platform_get_drvdata(pdev); chip = &htcpld->chip[chip_index]; - if (chip->client) - i2c_unregister_device(chip->client); + i2c_unregister_device(chip->client); } static int htcpld_register_chip_gpio( From 569fac74627cc332a2097a7a4bfdc654b8e7f273 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 21 Aug 2019 11:37:12 +0300 Subject: [PATCH 0294/2880] mfd: intel-lpss: Use MODULE_SOFTDEP() instead of implicit request There is no need to handle optional module request in the driver when user space tools has that feature for ages. Replace custom code by MODULE_SOFTDEP() macro to let user space know that we would like to have the DMA driver loaded first, if any. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index 3e16a1765142..bfe4ff337581 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -128,17 +128,6 @@ static const struct mfd_cell intel_lpss_spi_cell = { static DEFINE_IDA(intel_lpss_devid_ida); static struct dentry *intel_lpss_debugfs; -static int intel_lpss_request_dma_module(const char *name) -{ - static bool intel_lpss_dma_requested; - - if (intel_lpss_dma_requested) - return 0; - - intel_lpss_dma_requested = true; - return request_module("%s", name); -} - static void intel_lpss_cache_ltr(struct intel_lpss *lpss) { lpss->active_ltr = readl(lpss->priv + LPSS_PRIV_ACTIVELTR); @@ -429,16 +418,6 @@ int intel_lpss_probe(struct device *dev, dev_warn(dev, "Failed to create debugfs entries\n"); if (intel_lpss_has_idma(lpss)) { - /* - * Ensure the DMA driver is loaded before the host - * controller device appears, so that the host controller - * driver can request its DMA channels as early as - * possible. - * - * If the DMA module is not there that's OK as well. - */ - intel_lpss_request_dma_module(LPSS_IDMA64_DRIVER_NAME); - ret = mfd_add_devices(dev, lpss->devid, &intel_lpss_idma64_cell, 1, info->mem, info->irq, NULL); if (ret) @@ -554,3 +533,11 @@ MODULE_AUTHOR("Heikki Krogerus "); MODULE_AUTHOR("Jarkko Nikula "); MODULE_DESCRIPTION("Intel LPSS core driver"); MODULE_LICENSE("GPL v2"); +/* + * Ensure the DMA driver is loaded before the host controller device appears, + * so that the host controller driver can request its DMA channels as early + * as possible. + * + * If the DMA module is not there that's OK as well. + */ +MODULE_SOFTDEP("pre: platform:" LPSS_IDMA64_DRIVER_NAME); From fea3ac55e112ee52feba03c48c182ab6d0ad8e92 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 29 Aug 2019 13:25:01 +0200 Subject: [PATCH 0295/2880] mfd: db8500-prcmu: Support the higher DB8520 ARMSS The DB8520 used in a lot of Samsung phones has a slightly higher maximum ARMSS frequency than the DB8500. In order to not confuse the OPP framework and cpufreq, make sure the PRCMU driver returns the correct frequency. Cc: Stephan Gerhold Signed-off-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/db8500-prcmu.c | 40 +++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index 0f459c246634..0e019cc5da42 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -1695,21 +1695,41 @@ static long round_clock_rate(u8 clock, unsigned long rate) return rounded_rate; } -static const unsigned long armss_freqs[] = { +static const unsigned long db8500_armss_freqs[] = { 200000000, 400000000, 800000000, 998400000 }; +/* The DB8520 has slightly higher ARMSS max frequency */ +static const unsigned long db8520_armss_freqs[] = { + 200000000, + 400000000, + 800000000, + 1152000000 +}; + + + static long round_armss_rate(unsigned long rate) { unsigned long freq = 0; + const unsigned long *freqs; + int nfreqs; int i; + if (fw_info.version.project == PRCMU_FW_PROJECT_U8520) { + freqs = db8520_armss_freqs; + nfreqs = ARRAY_SIZE(db8520_armss_freqs); + } else { + freqs = db8500_armss_freqs; + nfreqs = ARRAY_SIZE(db8500_armss_freqs); + } + /* Find the corresponding arm opp from the cpufreq table. */ - for (i = 0; i < ARRAY_SIZE(armss_freqs); i++) { - freq = armss_freqs[i]; + for (i = 0; i < nfreqs; i++) { + freq = freqs[i]; if (rate <= freq) break; } @@ -1854,11 +1874,21 @@ static int set_armss_rate(unsigned long rate) { unsigned long freq; u8 opps[] = { ARM_EXTCLK, ARM_50_OPP, ARM_100_OPP, ARM_MAX_OPP }; + const unsigned long *freqs; + int nfreqs; int i; + if (fw_info.version.project == PRCMU_FW_PROJECT_U8520) { + freqs = db8520_armss_freqs; + nfreqs = ARRAY_SIZE(db8520_armss_freqs); + } else { + freqs = db8500_armss_freqs; + nfreqs = ARRAY_SIZE(db8500_armss_freqs); + } + /* Find the corresponding arm opp from the cpufreq table. */ - for (i = 0; i < ARRAY_SIZE(armss_freqs); i++) { - freq = armss_freqs[i]; + for (i = 0; i < nfreqs; i++) { + freq = freqs[i]; if (rate == freq) break; } From b788d111e6760602a097cd774e09d0f427de707c Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 18 Aug 2019 15:55:59 +0200 Subject: [PATCH 0296/2880] dt-bindings: mfd: mediatek: mt6397: Change to relative paths Paths in dt-bindings should be relative as suggested by Lee Jones. Suggested-By: Lee Jones Signed-off-by: Frank Wunderlich Reviewed-by: Rob Herring Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/mt6397.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/mfd/mt6397.txt b/Documentation/devicetree/bindings/mfd/mt6397.txt index 0ebd08af777d..ab3163a60929 100644 --- a/Documentation/devicetree/bindings/mfd/mt6397.txt +++ b/Documentation/devicetree/bindings/mfd/mt6397.txt @@ -12,7 +12,7 @@ MT6397/MT6323 is a multifunction device with the following sub modules: It is interfaced to host controller using SPI interface by a proprietary hardware called PMIC wrapper or pwrap. MT6397/MT6323 MFD is a child device of pwrap. See the following for pwarp node definitions: -Documentation/devicetree/bindings/soc/mediatek/pwrap.txt +../soc/mediatek/pwrap.txt This document describes the binding for MFD device and its sub module. @@ -27,9 +27,9 @@ Optional subnodes: - regulators Required properties: - compatible: "mediatek,mt6397-regulator" - see Documentation/devicetree/bindings/regulator/mt6397-regulator.txt + see ../regulator/mt6397-regulator.txt - compatible: "mediatek,mt6323-regulator" - see Documentation/devicetree/bindings/regulator/mt6323-regulator.txt + see ../regulator/mt6323-regulator.txt - codec Required properties: - compatible: "mediatek,mt6397-codec" @@ -39,12 +39,12 @@ Optional subnodes: - led Required properties: - compatible: "mediatek,mt6323-led" - see Documentation/devicetree/bindings/leds/leds-mt6323.txt + see ../leds/leds-mt6323.txt - keys Required properties: - compatible: "mediatek,mt6397-keys" or "mediatek,mt6323-keys" - see Documentation/devicetree/bindings/input/mtk-pmic-keys.txt + see ../input/mtk-pmic-keys.txt Example: pwrap: pwrap@1000f000 { From 7051919d17512f51e21d91b6564882b0476f2a2c Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:00 +0200 Subject: [PATCH 0297/2880] dt-bindings: mfd: mediatek: Update RTC to include MT6323 Add MT6323 to RTC bindings. Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Reviewed-by: Rob Herring Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/mt6397.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mfd/mt6397.txt b/Documentation/devicetree/bindings/mfd/mt6397.txt index ab3163a60929..5fccf987865b 100644 --- a/Documentation/devicetree/bindings/mfd/mt6397.txt +++ b/Documentation/devicetree/bindings/mfd/mt6397.txt @@ -22,8 +22,10 @@ compatible: "mediatek,mt6397" or "mediatek,mt6323" Optional subnodes: - rtc - Required properties: + Required properties: Should be one of follows + - compatible: "mediatek,mt6323-rtc" - compatible: "mediatek,mt6397-rtc" + For details, see ../rtc/rtc-mt6397.txt - regulators Required properties: - compatible: "mediatek,mt6397-regulator" From 8ab1267ff73dc372d63dffc7736058d456b8c75b Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:01 +0200 Subject: [PATCH 0298/2880] dt-bindings: mfd: mediatek: Add MT6323 Power Controller - Add Power Controller section to existing binding document. - Add MT6323 PWRC bindings document with example. Suggested-by: Frank Wunderlich Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/mt6397.txt | 6 ++++++ .../bindings/power/reset/mt6323-poweroff.txt | 20 +++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt diff --git a/Documentation/devicetree/bindings/mfd/mt6397.txt b/Documentation/devicetree/bindings/mfd/mt6397.txt index 5fccf987865b..a9b105ac00a8 100644 --- a/Documentation/devicetree/bindings/mfd/mt6397.txt +++ b/Documentation/devicetree/bindings/mfd/mt6397.txt @@ -8,6 +8,7 @@ MT6397/MT6323 is a multifunction device with the following sub modules: - Clock - LED - Keys +- Power controller It is interfaced to host controller using SPI interface by a proprietary hardware called PMIC wrapper or pwrap. MT6397/MT6323 MFD is a child device of pwrap. @@ -48,6 +49,11 @@ Optional subnodes: - compatible: "mediatek,mt6397-keys" or "mediatek,mt6323-keys" see ../input/mtk-pmic-keys.txt +- power-controller + Required properties: + - compatible: "mediatek,mt6323-pwrc" + For details, see ../power/reset/mt6323-poweroff.txt + Example: pwrap: pwrap@1000f000 { compatible = "mediatek,mt8135-pwrap"; diff --git a/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt b/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt new file mode 100644 index 000000000000..933f0c48e887 --- /dev/null +++ b/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt @@ -0,0 +1,20 @@ +Device Tree Bindings for Power Controller on MediaTek PMIC + +The power controller which could be found on PMIC is responsible for externally +powering off or on the remote MediaTek SoC through the circuit BBPU. + +Required properties: +- compatible: Should be one of follows + "mediatek,mt6323-pwrc": for MT6323 PMIC + +Example: + + pmic { + compatible = "mediatek,mt6323"; + + ... + + power-controller { + compatible = "mediatek,mt6323-pwrc"; + }; + } From 09c1dec470029f812e86aa92908abfd9132c2fab Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:04 +0200 Subject: [PATCH 0299/2880] mfd: mt6397: Add mutex include Add missing mutex.h. Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Signed-off-by: Lee Jones --- include/linux/mfd/mt6397/core.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h index 9320c2a1e3e6..fc88d315bdde 100644 --- a/include/linux/mfd/mt6397/core.h +++ b/include/linux/mfd/mt6397/core.h @@ -7,6 +7,8 @@ #ifndef __MFD_MT6397_CORE_H__ #define __MFD_MT6397_CORE_H__ +#include + enum chip_id { MT6323_CHIP_ID = 0x23, MT6391_CHIP_ID = 0x91, From 7c3f7cd5a0c96ab40baaaf08968249fcb96c4057 Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:06 +0200 Subject: [PATCH 0300/2880] mfd: mt6323: Replace boilerplate resource code with DEFINE_RES_* macros Simplifies and reduces LoC. Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Signed-off-by: Lee Jones --- drivers/mfd/mt6397-core.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index 93c888153b77..7c81a20865f0 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -19,16 +20,8 @@ #define MT6397_RTC_SIZE 0x3e static const struct resource mt6397_rtc_resources[] = { - { - .start = MT6397_RTC_BASE, - .end = MT6397_RTC_BASE + MT6397_RTC_SIZE, - .flags = IORESOURCE_MEM, - }, - { - .start = MT6397_IRQ_RTC, - .end = MT6397_IRQ_RTC, - .flags = IORESOURCE_IRQ, - }, + DEFINE_RES_MEM(MT6397_RTC_BASE, MT6397_RTC_SIZE), + DEFINE_RES_IRQ(MT6397_IRQ_RTC), }; static const struct resource mt6323_keys_resources[] = { From 8391c6cb2414d9a75bbe247a838d28cb0cee77ee Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:08 +0200 Subject: [PATCH 0301/2880] mfd: mt6323: Add MT6323 RTC and PWRC Add entry for RTC and Power Controller to MT6323. Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Signed-off-by: Lee Jones --- drivers/mfd/mt6397-core.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index 7c81a20865f0..310dae26ddff 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -16,9 +16,20 @@ #include #include +#define MT6323_RTC_BASE 0x8000 +#define MT6323_RTC_SIZE 0x40 + #define MT6397_RTC_BASE 0xe000 #define MT6397_RTC_SIZE 0x3e +#define MT6323_PWRC_BASE 0x8000 +#define MT6323_PWRC_SIZE 0x40 + +static const struct resource mt6323_rtc_resources[] = { + DEFINE_RES_MEM(MT6323_RTC_BASE, MT6323_RTC_SIZE), + DEFINE_RES_IRQ(MT6323_IRQ_STATUS_RTC), +}; + static const struct resource mt6397_rtc_resources[] = { DEFINE_RES_MEM(MT6397_RTC_BASE, MT6397_RTC_SIZE), DEFINE_RES_IRQ(MT6397_IRQ_RTC), @@ -34,8 +45,17 @@ static const struct resource mt6397_keys_resources[] = { DEFINE_RES_IRQ(MT6397_IRQ_HOMEKEY), }; +static const struct resource mt6323_pwrc_resources[] = { + DEFINE_RES_MEM(MT6323_PWRC_BASE, MT6323_PWRC_SIZE), +}; + static const struct mfd_cell mt6323_devs[] = { { + .name = "mt6323-rtc", + .num_resources = ARRAY_SIZE(mt6323_rtc_resources), + .resources = mt6323_rtc_resources, + .of_compatible = "mediatek,mt6323-rtc", + }, { .name = "mt6323-regulator", .of_compatible = "mediatek,mt6323-regulator" }, { @@ -46,6 +66,11 @@ static const struct mfd_cell mt6323_devs[] = { .num_resources = ARRAY_SIZE(mt6323_keys_resources), .resources = mt6323_keys_resources, .of_compatible = "mediatek,mt6323-keys" + }, { + .name = "mt6323-pwrc", + .num_resources = ARRAY_SIZE(mt6323_pwrc_resources), + .resources = mt6323_pwrc_resources, + .of_compatible = "mediatek,mt6323-pwrc" }, }; From c709bf455d60662377a47686180c0ac5c65f4d07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Aug 2019 18:12:36 +0200 Subject: [PATCH 0302/2880] microblaze: remove ioremap_fullcache No callers of this function. Signed-off-by: Christoph Hellwig Signed-off-by: Michal Simek --- arch/microblaze/include/asm/io.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h index c7968139486f..86c95b2a1ce1 100644 --- a/arch/microblaze/include/asm/io.h +++ b/arch/microblaze/include/asm/io.h @@ -40,7 +40,6 @@ extern void iounmap(volatile void __iomem *addr); extern void __iomem *ioremap(phys_addr_t address, unsigned long size); #define ioremap_nocache(addr, size) ioremap((addr), (size)) -#define ioremap_fullcache(addr, size) ioremap((addr), (size)) #define ioremap_wc(addr, size) ioremap((addr), (size)) #define ioremap_wt(addr, size) ioremap((addr), (size)) From bcd69da98e36afccb4cb5fbdbd2b7bb78c07184d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 13 Aug 2019 13:58:53 +0200 Subject: [PATCH 0303/2880] video: backlight: Drop default m for {LCD,BACKLIGHT_CLASS_DEVICE} When running "make oldconfig" on a .config where CONFIG_BACKLIGHT_LCD_SUPPORT is not set, two new config options ("Lowlevel LCD controls" and "Lowlevel Backlight controls") appear, both defaulting to "m". Drop the "default m", as options should default to disabled, and because several driver config options already select LCD_CLASS_DEVICE or BACKLIGHT_CLASS_DEVICE when needed. Fixes: 8c5dc8d9f19c7992 ("video: backlight: Remove useless BACKLIGHT_LCD_SUPPORT kernel symbol") Signed-off-by: Geert Uytterhoeven Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 8b081d61773e..40676be2e46a 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -10,7 +10,6 @@ menu "Backlight & LCD device support" # config LCD_CLASS_DEVICE tristate "Lowlevel LCD controls" - default m help This framework adds support for low-level control of LCD. Some framebuffer devices connect to platform-specific LCD modules @@ -143,7 +142,6 @@ endif # LCD_CLASS_DEVICE # config BACKLIGHT_CLASS_DEVICE tristate "Lowlevel Backlight controls" - default m help This framework adds support for low-level control of the LCD backlight. This includes support for brightness and power. From 28a1d72a221ecdf8f63205d40cb654c81a2b2da7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 20 Aug 2019 17:34:39 +0200 Subject: [PATCH 0304/2880] video: backlight: tosa_lcd: drop check because i2c_unregister_device() is NULL safe No need to check the argument of i2c_unregister_device() because the function itself does it. Signed-off-by: Wolfram Sang Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/tosa_lcd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c index 65cb7578776f..29af8e27b6e5 100644 --- a/drivers/video/backlight/tosa_lcd.c +++ b/drivers/video/backlight/tosa_lcd.c @@ -222,8 +222,7 @@ static int tosa_lcd_remove(struct spi_device *spi) { struct tosa_lcd_data *data = spi_get_drvdata(spi); - if (data->i2c) - i2c_unregister_device(data->i2c); + i2c_unregister_device(data->i2c); tosa_lcd_tg_off(data); From ec665b756e6f79c60078b00dbdabea3aa8a4b787 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 31 Jul 2019 11:40:18 +0300 Subject: [PATCH 0305/2880] backlight: gpio-backlight: Correct initial power state handling The default-on property - or the def_value via legacy pdata) should be handled as: if it is 1, the backlight must be enabled (kept enabled) if it is 0, the backlight must be disabled (kept disabled) This only works for the case when default-on is set. If it is not set then the brightness of the backlight is set to 0. Now if the backlight is enabled by external driver (graphics) the backlight will stay disabled since the brightness is configured as 0. The backlight will not turn on. In order to minimize screen flickering during device boot: The initial brightness should be set to 1. If booted in non DT mode or no phandle link to the backlight node: follow the def_value/default-on to select UNBLANK or POWERDOWN If in DT boot we have phandle link then leave the GPIO in a state which the bootloader left it and let the user of the backlight to configure it further. Signed-off-by: Peter Ujfalusi Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/gpio_backlight.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/video/backlight/gpio_backlight.c b/drivers/video/backlight/gpio_backlight.c index e84f3087e29f..18e053e4716c 100644 --- a/drivers/video/backlight/gpio_backlight.c +++ b/drivers/video/backlight/gpio_backlight.c @@ -59,13 +59,11 @@ static int gpio_backlight_probe_dt(struct platform_device *pdev, struct gpio_backlight *gbl) { struct device *dev = &pdev->dev; - enum gpiod_flags flags; int ret; gbl->def_value = device_property_read_bool(dev, "default-on"); - flags = gbl->def_value ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW; - gbl->gpiod = devm_gpiod_get(dev, NULL, flags); + gbl->gpiod = devm_gpiod_get(dev, NULL, GPIOD_ASIS); if (IS_ERR(gbl->gpiod)) { ret = PTR_ERR(gbl->gpiod); @@ -79,6 +77,22 @@ static int gpio_backlight_probe_dt(struct platform_device *pdev, return 0; } +static int gpio_backlight_initial_power_state(struct gpio_backlight *gbl) +{ + struct device_node *node = gbl->dev->of_node; + + /* Not booted with device tree or no phandle link to the node */ + if (!node || !node->phandle) + return gbl->def_value ? FB_BLANK_UNBLANK : FB_BLANK_POWERDOWN; + + /* if the enable GPIO is disabled, do not enable the backlight */ + if (gpiod_get_value_cansleep(gbl->gpiod) == 0) + return FB_BLANK_POWERDOWN; + + return FB_BLANK_UNBLANK; +} + + static int gpio_backlight_probe(struct platform_device *pdev) { struct gpio_backlight_platform_data *pdata = @@ -136,7 +150,9 @@ static int gpio_backlight_probe(struct platform_device *pdev) return PTR_ERR(bl); } - bl->props.brightness = gbl->def_value; + bl->props.power = gpio_backlight_initial_power_state(gbl); + bl->props.brightness = 1; + backlight_update_status(bl); platform_set_drvdata(pdev, bl); From 6451e123dec34e295cb0affcd48f671275bdb671 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 9 Jul 2019 12:00:04 -0700 Subject: [PATCH 0306/2880] MAINTAINERS: Add entry for stable backlight sysfs ABI documentation Add an entry for the stable backlight sysfs ABI to the MAINTAINERS file. Signed-off-by: Matthias Kaehlcke Acked-by: Daniel Thompson Signed-off-by: Lee Jones --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..5d7bb3024e13 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2906,6 +2906,7 @@ F: drivers/video/backlight/ F: include/linux/backlight.h F: include/linux/pwm_backlight.h F: Documentation/devicetree/bindings/leds/backlight +F: Documentation/ABI/stable/sysfs-class-backlight BATMAN ADVANCED M: Marek Lindner From d55c028f8b25bdaaba9ae08026052b5b44d031b0 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 9 Jul 2019 12:00:05 -0700 Subject: [PATCH 0307/2880] backlight: Expose brightness curve type through sysfs Backlight brightness curves can have different shapes. The two main types are linear and non-linear curves. The human eye doesn't perceive linearly increasing/decreasing brightness as linear (see also 88ba95bedb79 "backlight: pwm_bl: Compute brightness of LED linearly to human eye"), hence many backlights use non-linear (often logarithmic) brightness curves. The type of curve currently is opaque to userspace, so userspace often uses more or less reliable heuristics (like the number of brightness levels) to decide whether to treat a backlight device as linear or non-linear. Export the type of the brightness curve via the new sysfs attribute 'scale'. The value of the attribute can be 'linear', 'non-linear' or 'unknown'. For devices that don't provide information about the scale of their brightness curve the value of the 'scale' attribute is 'unknown'. Signed-off-by: Matthias Kaehlcke Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- .../ABI/testing/sysfs-class-backlight | 26 +++++++++++++++++++ MAINTAINERS | 1 + drivers/video/backlight/backlight.c | 19 ++++++++++++++ include/linux/backlight.h | 8 ++++++ 4 files changed, 54 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-class-backlight diff --git a/Documentation/ABI/testing/sysfs-class-backlight b/Documentation/ABI/testing/sysfs-class-backlight new file mode 100644 index 000000000000..3ab175a3f5cb --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-backlight @@ -0,0 +1,26 @@ +What: /sys/class/backlight//scale +Date: July 2019 +KernelVersion: 5.4 +Contact: Daniel Thompson +Description: + Description of the scale of the brightness curve. + + The human eye senses brightness approximately logarithmically, + hence linear changes in brightness are perceived as being + non-linear. To achieve a linear perception of brightness changes + controls like sliders need to apply a logarithmic mapping for + backlights with a linear brightness curve. + + Possible values of the attribute are: + + unknown + The scale of the brightness curve is unknown. + + linear + The brightness changes linearly with each step. Brightness + controls should apply a logarithmic mapping for a linear + perception. + + non-linear + The brightness changes non-linearly with each step. Brightness + controls should use a linear mapping for a linear perception. diff --git a/MAINTAINERS b/MAINTAINERS index 5d7bb3024e13..fe75bb96a317 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2907,6 +2907,7 @@ F: include/linux/backlight.h F: include/linux/pwm_backlight.h F: Documentation/devicetree/bindings/leds/backlight F: Documentation/ABI/stable/sysfs-class-backlight +F: Documentation/ABI/testing/sysfs-class-backlight BATMAN ADVANCED M: Marek Lindner diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 5dc07106a59e..cac3e35d7630 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -32,6 +32,12 @@ static const char *const backlight_types[] = { [BACKLIGHT_FIRMWARE] = "firmware", }; +static const char *const backlight_scale_types[] = { + [BACKLIGHT_SCALE_UNKNOWN] = "unknown", + [BACKLIGHT_SCALE_LINEAR] = "linear", + [BACKLIGHT_SCALE_NON_LINEAR] = "non-linear", +}; + #if defined(CONFIG_FB) || (defined(CONFIG_FB_MODULE) && \ defined(CONFIG_BACKLIGHT_CLASS_DEVICE_MODULE)) /* This callback gets called when something important happens inside a @@ -246,6 +252,18 @@ static ssize_t actual_brightness_show(struct device *dev, } static DEVICE_ATTR_RO(actual_brightness); +static ssize_t scale_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct backlight_device *bd = to_backlight_device(dev); + + if (WARN_ON(bd->props.scale > BACKLIGHT_SCALE_NON_LINEAR)) + return sprintf(buf, "unknown\n"); + + return sprintf(buf, "%s\n", backlight_scale_types[bd->props.scale]); +} +static DEVICE_ATTR_RO(scale); + static struct class *backlight_class; #ifdef CONFIG_PM_SLEEP @@ -292,6 +310,7 @@ static struct attribute *bl_device_attrs[] = { &dev_attr_brightness.attr, &dev_attr_actual_brightness.attr, &dev_attr_max_brightness.attr, + &dev_attr_scale.attr, &dev_attr_type.attr, NULL, }; diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 0b5897446dca..c7d6b2e8c3b5 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -46,6 +46,12 @@ enum backlight_notification { BACKLIGHT_UNREGISTERED, }; +enum backlight_scale { + BACKLIGHT_SCALE_UNKNOWN = 0, + BACKLIGHT_SCALE_LINEAR, + BACKLIGHT_SCALE_NON_LINEAR, +}; + struct backlight_device; struct fb_info; @@ -80,6 +86,8 @@ struct backlight_properties { enum backlight_type type; /* Flags used to signal drivers of state changes */ unsigned int state; + /* Type of the brightness scale (linear, non-linear, ...) */ + enum backlight_scale scale; #define BL_CORE_SUSPENDED (1 << 0) /* backlight is suspended */ #define BL_CORE_FBBLANK (1 << 1) /* backlight is under an fb blank event */ From 511a204638d7d750f859c332635d09f38273b4f0 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 9 Jul 2019 12:00:06 -0700 Subject: [PATCH 0308/2880] backlight: pwm_bl: Set scale type for CIE 1931 curves For backlight curves calculated with the CIE 1931 algorithm set the brightness scale type to non-linear. This makes the scale type available to userspace via the 'scale' sysfs attribute. Signed-off-by: Matthias Kaehlcke Tested-by: Enric Balletbo i Serra Acked-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/pwm_bl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 2201b8c78641..d6d2b6407d7e 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -536,6 +536,8 @@ static int pwm_backlight_probe(struct platform_device *pdev) goto err_alloc; } + memset(&props, 0, sizeof(struct backlight_properties)); + if (data->levels) { /* * For the DT case, only when brightness levels is defined @@ -574,6 +576,8 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->levels = data->levels; } + + props.scale = BACKLIGHT_SCALE_NON_LINEAR; } else { /* * That only happens for the non-DT case, where platform data @@ -584,7 +588,6 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->lth_brightness = data->lth_brightness * (state.period / pb->scale); - memset(&props, 0, sizeof(struct backlight_properties)); props.type = BACKLIGHT_RAW; props.max_brightness = data->max_brightness; bl = backlight_device_register(dev_name(&pdev->dev), &pdev->dev, pb, From c0b64faf0fe6ca2574a00faed1ae833130db4e08 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 9 Jul 2019 12:00:07 -0700 Subject: [PATCH 0309/2880] backlight: pwm_bl: Set scale type for brightness curves specified in the DT Check if a brightness curve specified in the device tree is linear or not and set the corresponding property accordingly. This makes the scale type available to userspace via the 'scale' sysfs attribute. To determine if a curve is linear it is compared to a interpolated linear curve between min and max brightness. The curve is considered linear if no value deviates more than +/-5% of ${brightness_range} from their interpolated value. Signed-off-by: Matthias Kaehlcke Acked-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/pwm_bl.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index d6d2b6407d7e..746eebc411df 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -387,6 +387,31 @@ int pwm_backlight_brightness_default(struct device *dev, } #endif +static bool pwm_backlight_is_linear(struct platform_pwm_backlight_data *data) +{ + unsigned int nlevels = data->max_brightness + 1; + unsigned int min_val = data->levels[0]; + unsigned int max_val = data->levels[nlevels - 1]; + /* + * Multiplying by 128 means that even in pathological cases such + * as (max_val - min_val) == nlevels the error at max_val is less + * than 1%. + */ + unsigned int slope = (128 * (max_val - min_val)) / nlevels; + unsigned int margin = (max_val - min_val) / 20; /* 5% */ + int i; + + for (i = 1; i < nlevels; i++) { + unsigned int linear_value = min_val + ((i * slope) / 128); + unsigned int delta = abs(linear_value - data->levels[i]); + + if (delta > margin) + return false; + } + + return true; +} + static int pwm_backlight_initial_power_state(const struct pwm_bl_data *pb) { struct device_node *node = pb->dev->of_node; @@ -550,6 +575,11 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->levels = data->levels; } + + if (pwm_backlight_is_linear(data)) + props.scale = BACKLIGHT_SCALE_LINEAR; + else + props.scale = BACKLIGHT_SCALE_NON_LINEAR; } else if (!data->max_brightness) { /* * If no brightness levels are provided and max_brightness is From 345f0254e5b2f4090e4a00ebc996e07e9bdcd070 Mon Sep 17 00:00:00 2001 From: Maya Nakamura Date: Fri, 12 Jul 2019 08:27:38 +0000 Subject: [PATCH 0310/2880] HID: hv: Remove dependencies on PAGE_SIZE for ring buffer Define the ring buffer size as a constant expression because it should not depend on the guest page size. Signed-off-by: Maya Nakamura Reviewed-by: Michael Kelley Acked-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-hyperv.c b/drivers/hid/hid-hyperv.c index 7795831d37c2..cc5b09b87ab0 100644 --- a/drivers/hid/hid-hyperv.c +++ b/drivers/hid/hid-hyperv.c @@ -104,8 +104,8 @@ struct synthhid_input_report { #pragma pack(pop) -#define INPUTVSC_SEND_RING_BUFFER_SIZE (10*PAGE_SIZE) -#define INPUTVSC_RECV_RING_BUFFER_SIZE (10*PAGE_SIZE) +#define INPUTVSC_SEND_RING_BUFFER_SIZE (40 * 1024) +#define INPUTVSC_RECV_RING_BUFFER_SIZE (40 * 1024) enum pipe_prot_msg_type { From c87a37ebd40b889178664c2c09cc187334146292 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 20 Aug 2019 18:03:25 +0800 Subject: [PATCH 0311/2880] 9p: avoid attaching writeback_fid on mmap with type PRIVATE Currently on mmap cache policy, we always attach writeback_fid whether mmap type is SHARED or PRIVATE. However, in the use case of kata-container which combines 9p(Guest OS) with overlayfs(Host OS), this behavior will trigger overlayfs' copy-up when excute command inside container. Link: http://lkml.kernel.org/r/20190820100325.10313-1-cgxu519@zoho.com.cn Signed-off-by: Chengguang Xu Signed-off-by: Dominique Martinet --- fs/9p/vfs_file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 4cc966a31cb3..fe7f0bd2048e 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -513,6 +513,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); if (!v9inode->writeback_fid && + (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) { /* * clone a fid and add it to writeback_fid @@ -614,6 +615,8 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) (vma->vm_end - vma->vm_start - 1), }; + if (!(vma->vm_flags & VM_SHARED)) + return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); From 0ce772fe79b68f83df40f07f28207b292785c677 Mon Sep 17 00:00:00 2001 From: Lu Shuaibing Date: Thu, 13 Jun 2019 15:08:54 +0800 Subject: [PATCH 0312/2880] 9p: Transport error uninitialized The p9_tag_alloc() does not initialize the transport error t_err field. The struct p9_req_t *req is allocated and stored in a struct p9_client variable. The field t_err is never initialized before p9_conn_cancel() checks its value. KUMSAN(KernelUninitializedMemorySantizer, a new error detection tool) reports this bug. ================================================================== BUG: KUMSAN: use of uninitialized memory in p9_conn_cancel+0x2d9/0x3b0 Read of size 4 at addr ffff88805f9b600c by task kworker/1:2/1216 CPU: 1 PID: 1216 Comm: kworker/1:2 Not tainted 5.2.0-rc4+ #28 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Workqueue: events p9_write_work Call Trace: dump_stack+0x75/0xae __kumsan_report+0x17c/0x3e6 kumsan_report+0xe/0x20 p9_conn_cancel+0x2d9/0x3b0 p9_write_work+0x183/0x4a0 process_one_work+0x4d1/0x8c0 worker_thread+0x6e/0x780 kthread+0x1ca/0x1f0 ret_from_fork+0x35/0x40 Allocated by task 1979: save_stack+0x19/0x80 __kumsan_kmalloc.constprop.3+0xbc/0x120 kmem_cache_alloc+0xa7/0x170 p9_client_prepare_req.part.9+0x3b/0x380 p9_client_rpc+0x15e/0x880 p9_client_create+0x3d0/0xac0 v9fs_session_init+0x192/0xc80 v9fs_mount+0x67/0x470 legacy_get_tree+0x70/0xd0 vfs_get_tree+0x4a/0x1c0 do_mount+0xba9/0xf90 ksys_mount+0xa8/0x120 __x64_sys_mount+0x62/0x70 do_syscall_64+0x6d/0x1e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 0: (stack is not available) The buggy address belongs to the object at ffff88805f9b6008 which belongs to the cache p9_req_t of size 144 The buggy address is located 4 bytes inside of 144-byte region [ffff88805f9b6008, ffff88805f9b6098) The buggy address belongs to the page: page:ffffea00017e6d80 refcount:1 mapcount:0 mapping:ffff888068b63740 index:0xffff88805f9b7d90 compound_mapcount: 0 flags: 0x100000000010200(slab|head) raw: 0100000000010200 ffff888068b66450 ffff888068b66450 ffff888068b63740 raw: ffff88805f9b7d90 0000000000100001 00000001ffffffff 0000000000000000 page dumped because: kumsan: bad access detected ================================================================== Link: http://lkml.kernel.org/r/20190613070854.10434-1-shuaibinglu@126.com Signed-off-by: Lu Shuaibing [dominique.martinet@cea.fr: grouped the added init with the others] Signed-off-by: Dominique Martinet --- net/9p/client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/9p/client.c b/net/9p/client.c index 9622f3e469f6..1d48afc7033c 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -281,6 +281,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) p9pdu_reset(&req->tc); p9pdu_reset(&req->rc); + req->t_err = 0; req->status = REQ_STATUS_ALLOC; init_waitqueue_head(&req->wq); INIT_LIST_HEAD(&req->req_list); From 962a991c5de18452d6c429d99f3039387cf5cbb0 Mon Sep 17 00:00:00 2001 From: Bharath Vedartham Date: Thu, 23 May 2019 01:15:19 +0530 Subject: [PATCH 0313/2880] 9p/cache.c: Fix memory leak in v9fs_cache_session_get_cookie v9fs_cache_session_get_cookie assigns a random cachetag to v9ses->cachetag, if the cachetag is not assigned previously. v9fs_random_cachetag allocates memory to v9ses->cachetag with kmalloc and uses scnprintf to fill it up with a cachetag. But if scnprintf fails, v9ses->cachetag is not freed in the current code causing a memory leak. Fix this by freeing v9ses->cachetag it v9fs_random_cachetag fails. This was reported by syzbot, the link to the report is below: https://syzkaller.appspot.com/bug?id=f012bdf297a7a4c860c38a88b44fbee43fd9bbf3 Link: http://lkml.kernel.org/r/20190522194519.GA5313@bharath12345-Inspiron-5559 Reported-by: syzbot+3a030a73b6c1e9833815@syzkaller.appspotmail.com Signed-off-by: Bharath Vedartham Signed-off-by: Dominique Martinet --- fs/9p/cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 995e332eee5c..eb2151fb6049 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -51,6 +51,8 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) if (!v9ses->cachetag) { if (v9fs_random_cachetag(v9ses) < 0) { v9ses->fscache = NULL; + kfree(v9ses->cachetag); + v9ses->cachetag = NULL; return; } } From aafee43b72863f1f70aeaf1332d049916e8df239 Mon Sep 17 00:00:00 2001 From: Bharath Vedartham Date: Thu, 23 May 2019 22:26:20 +0530 Subject: [PATCH 0314/2880] 9p/vfs_super.c: Remove unused parameter data in v9fs_fill_super v9fs_fill_super has a param 'void *data' which is unused in the function. This patch removes the 'void *data' param in v9fs_fill_super and changes the parameters in all function calls of v9fs_fill_super. Link: http://lkml.kernel.org/r/20190523165619.GA4209@bharath12345-Inspiron-5559 Signed-off-by: Bharath Vedartham Signed-off-by: Dominique Martinet --- fs/9p/vfs_super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 08112fbcaece..6c5ac0a5e539 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -58,7 +58,7 @@ static int v9fs_set_super(struct super_block *s, void *data) static int v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, - int flags, void *data) + int flags) { int ret; @@ -128,7 +128,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto clunk_fid; } - retval = v9fs_fill_super(sb, v9ses, flags, data); + retval = v9fs_fill_super(sb, v9ses, flags); if (retval) goto release_sb; From da23b6faa8bf0f1c50a0700440e9ff3f52b3bd9a Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Sat, 31 Aug 2019 17:24:01 +0300 Subject: [PATCH 0315/2880] watchdog: iTCO: Add support for Cannon Lake PCH iTCO In Intel Cannon Lake PCH the NO_REBOOT bit was moved from the private register space to be part of the TCO1_CNT register. For this reason introduce another version (6) that uses this register to set and clear NO_REBOOT bit. Signed-off-by: Mika Westerberg Acked-by: Guenter Roeck Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/watchdog/iTCO_wdt.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index c559f706ae7e..156360e37714 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -48,6 +48,7 @@ /* Includes */ #include /* For ACPI support */ +#include /* For BIT() */ #include /* For module specific items */ #include /* For new moduleparam's */ #include /* For standard types (like size_t) */ @@ -215,6 +216,23 @@ static int update_no_reboot_bit_mem(void *priv, bool set) return 0; } +static int update_no_reboot_bit_cnt(void *priv, bool set) +{ + struct iTCO_wdt_private *p = priv; + u16 val, newval; + + val = inw(TCO1_CNT(p)); + if (set) + val |= BIT(0); + else + val &= ~BIT(0); + outw(val, TCO1_CNT(p)); + newval = inw(TCO1_CNT(p)); + + /* make sure the update is successful */ + return val != newval ? -EIO : 0; +} + static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, struct itco_wdt_platform_data *pdata) { @@ -224,7 +242,9 @@ static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, return; } - if (p->iTCO_version >= 2) + if (p->iTCO_version >= 6) + p->update_no_reboot_bit = update_no_reboot_bit_cnt; + else if (p->iTCO_version >= 2) p->update_no_reboot_bit = update_no_reboot_bit_mem; else if (p->iTCO_version == 1) p->update_no_reboot_bit = update_no_reboot_bit_pci; @@ -452,7 +472,8 @@ static int iTCO_wdt_probe(struct platform_device *pdev) * Get the Memory-Mapped GCS or PMC register, we need it for the * NO_REBOOT flag (TCO v2 and v3). */ - if (p->iTCO_version >= 2 && !pdata->update_no_reboot_bit) { + if (p->iTCO_version >= 2 && p->iTCO_version < 6 && + !pdata->update_no_reboot_bit) { p->gcs_pmc_res = platform_get_resource(pdev, IORESOURCE_MEM, ICH_RES_MEM_GCS_PMC); @@ -502,6 +523,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) /* Clear out the (probably old) status */ switch (p->iTCO_version) { + case 6: case 5: case 4: outw(0x0008, TCO1_STS(p)); /* Clear the Time Out Status bit */ From b84398d6d7f900805662b1619223fd644d862d7c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Sat, 31 Aug 2019 17:24:02 +0300 Subject: [PATCH 0316/2880] i2c: i801: Use iTCO version 6 in Cannon Lake PCH and beyond Intel Cannon Lake PCH moved the NO_REBOOT bit to reside as part of the TCO registers instead so update the i2c-i801 driver so that for Cannon Lake and beyond register platform device for iTCO using version 6. The affected PCHs are Cannon Lake, Cedar Fork, Comet Lake, Elkhart Lake and Ice Lake. Signed-off-by: Mika Westerberg Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-i801.c | 144 +++++++++++++++++++++------------- 1 file changed, 89 insertions(+), 55 deletions(-) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index a6469978e735..38afd16c857a 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -292,7 +292,8 @@ struct i801_priv { #define FEATURE_HOST_NOTIFY BIT(5) /* Not really a feature, but it's convenient to handle it as such */ #define FEATURE_IDF BIT(15) -#define FEATURE_TCO BIT(16) +#define FEATURE_TCO_SPT BIT(16) +#define FEATURE_TCO_CNL BIT(17) static const char *i801_feature_names[] = { "SMBus PEC", @@ -1491,57 +1492,23 @@ static inline unsigned int i801_get_adapter_class(struct i801_priv *priv) } #endif -static const struct itco_wdt_platform_data tco_platform_data = { +static const struct itco_wdt_platform_data spt_tco_platform_data = { .name = "Intel PCH", .version = 4, }; static DEFINE_SPINLOCK(p2sb_spinlock); -static void i801_add_tco(struct i801_priv *priv) +static struct platform_device * +i801_add_tco_spt(struct i801_priv *priv, struct pci_dev *pci_dev, + struct resource *tco_res) { - struct pci_dev *pci_dev = priv->pci_dev; - struct resource tco_res[3], *res; - struct platform_device *pdev; + struct resource *res; unsigned int devfn; - u32 tco_base, tco_ctl; - u32 base_addr, ctrl_val; u64 base64_addr; + u32 base_addr; u8 hidden; - if (!(priv->features & FEATURE_TCO)) - return; - - pci_read_config_dword(pci_dev, TCOBASE, &tco_base); - pci_read_config_dword(pci_dev, TCOCTL, &tco_ctl); - if (!(tco_ctl & TCOCTL_EN)) - return; - - memset(tco_res, 0, sizeof(tco_res)); - - res = &tco_res[ICH_RES_IO_TCO]; - res->start = tco_base & ~1; - res->end = res->start + 32 - 1; - res->flags = IORESOURCE_IO; - - /* - * Power Management registers. - */ - devfn = PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 2); - pci_bus_read_config_dword(pci_dev->bus, devfn, ACPIBASE, &base_addr); - - res = &tco_res[ICH_RES_IO_SMI]; - res->start = (base_addr & ~1) + ACPIBASE_SMI_OFF; - res->end = res->start + 3; - res->flags = IORESOURCE_IO; - - /* - * Enable the ACPI I/O space. - */ - pci_bus_read_config_dword(pci_dev->bus, devfn, ACPICTRL, &ctrl_val); - ctrl_val |= ACPICTRL_EN; - pci_bus_write_config_dword(pci_dev->bus, devfn, ACPICTRL, ctrl_val); - /* * We must access the NO_REBOOT bit over the Primary to Sideband * bridge (P2SB). The BIOS prevents the P2SB device from being @@ -1577,15 +1544,76 @@ static void i801_add_tco(struct i801_priv *priv) res->end = res->start + 3; res->flags = IORESOURCE_MEM; - pdev = platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1, - tco_res, 3, &tco_platform_data, - sizeof(tco_platform_data)); - if (IS_ERR(pdev)) { - dev_warn(&pci_dev->dev, "failed to create iTCO device\n"); - return; - } + return platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1, + tco_res, 3, &spt_tco_platform_data, + sizeof(spt_tco_platform_data)); +} - priv->tco_pdev = pdev; +static const struct itco_wdt_platform_data cnl_tco_platform_data = { + .name = "Intel PCH", + .version = 6, +}; + +static struct platform_device * +i801_add_tco_cnl(struct i801_priv *priv, struct pci_dev *pci_dev, + struct resource *tco_res) +{ + return platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1, + tco_res, 2, &cnl_tco_platform_data, + sizeof(cnl_tco_platform_data)); +} + +static void i801_add_tco(struct i801_priv *priv) +{ + u32 base_addr, tco_base, tco_ctl, ctrl_val; + struct pci_dev *pci_dev = priv->pci_dev; + struct resource tco_res[3], *res; + unsigned int devfn; + + /* If we have ACPI based watchdog use that instead */ + if (acpi_has_watchdog()) + return; + + if (!(priv->features & (FEATURE_TCO_SPT | FEATURE_TCO_CNL))) + return; + + pci_read_config_dword(pci_dev, TCOBASE, &tco_base); + pci_read_config_dword(pci_dev, TCOCTL, &tco_ctl); + if (!(tco_ctl & TCOCTL_EN)) + return; + + memset(tco_res, 0, sizeof(tco_res)); + + res = &tco_res[ICH_RES_IO_TCO]; + res->start = tco_base & ~1; + res->end = res->start + 32 - 1; + res->flags = IORESOURCE_IO; + + /* + * Power Management registers. + */ + devfn = PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 2); + pci_bus_read_config_dword(pci_dev->bus, devfn, ACPIBASE, &base_addr); + + res = &tco_res[ICH_RES_IO_SMI]; + res->start = (base_addr & ~1) + ACPIBASE_SMI_OFF; + res->end = res->start + 3; + res->flags = IORESOURCE_IO; + + /* + * Enable the ACPI I/O space. + */ + pci_bus_read_config_dword(pci_dev->bus, devfn, ACPICTRL, &ctrl_val); + ctrl_val |= ACPICTRL_EN; + pci_bus_write_config_dword(pci_dev->bus, devfn, ACPICTRL, ctrl_val); + + if (priv->features & FEATURE_TCO_CNL) + priv->tco_pdev = i801_add_tco_cnl(priv, pci_dev, tco_res); + else + priv->tco_pdev = i801_add_tco_spt(priv, pci_dev, tco_res); + + if (IS_ERR(priv->tco_pdev)) + dev_warn(&pci_dev->dev, "failed to create iTCO device\n"); } #ifdef CONFIG_ACPI @@ -1695,13 +1723,21 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) switch (dev->device) { case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_SMBUS: case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_SMBUS: - case PCI_DEVICE_ID_INTEL_CANNONLAKE_H_SMBUS: - case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS: - case PCI_DEVICE_ID_INTEL_CDF_SMBUS: case PCI_DEVICE_ID_INTEL_DNV_SMBUS: case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS: + priv->features |= FEATURE_I2C_BLOCK_READ; + priv->features |= FEATURE_IRQ; + priv->features |= FEATURE_SMBUS_PEC; + priv->features |= FEATURE_BLOCK_BUFFER; + priv->features |= FEATURE_TCO_SPT; + priv->features |= FEATURE_HOST_NOTIFY; + break; + + case PCI_DEVICE_ID_INTEL_CANNONLAKE_H_SMBUS: + case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS: + case PCI_DEVICE_ID_INTEL_CDF_SMBUS: case PCI_DEVICE_ID_INTEL_ICELAKE_LP_SMBUS: case PCI_DEVICE_ID_INTEL_COMETLAKE_SMBUS: case PCI_DEVICE_ID_INTEL_ELKHART_LAKE_SMBUS: @@ -1711,9 +1747,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) priv->features |= FEATURE_IRQ; priv->features |= FEATURE_SMBUS_PEC; priv->features |= FEATURE_BLOCK_BUFFER; - /* If we have ACPI based watchdog use that instead */ - if (!acpi_has_watchdog()) - priv->features |= FEATURE_TCO; + priv->features |= FEATURE_TCO_CNL; priv->features |= FEATURE_HOST_NOTIFY; break; From f8c274e4a70e7cdc43db56e2391c485c580b5a43 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 13 Aug 2019 13:55:55 +0200 Subject: [PATCH 0317/2880] i2c: hix5hd2: Remove IRQF_ONESHOT The drivers sets IRQF_ONESHOT and passes only a primary handler. The IRQ is masked while the primary is handler is invoked independently of IRQF_ONESHOT. With IRQF_ONESHOT the core code will not force-thread the interrupt and this is probably not intended. I *assume* that the original author copied the IRQ registration from another driver which passed a primary and secondary handler and removed the secondary handler but keeping the ONESHOT flag. Remove IRQF_ONESHOT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-hix5hd2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-hix5hd2.c b/drivers/i2c/busses/i2c-hix5hd2.c index 4df1434b3597..8497c7a95dd4 100644 --- a/drivers/i2c/busses/i2c-hix5hd2.c +++ b/drivers/i2c/busses/i2c-hix5hd2.c @@ -445,8 +445,7 @@ static int hix5hd2_i2c_probe(struct platform_device *pdev) hix5hd2_i2c_init(priv); ret = devm_request_irq(&pdev->dev, irq, hix5hd2_i2c_irq, - IRQF_NO_SUSPEND | IRQF_ONESHOT, - dev_name(&pdev->dev), priv); + IRQF_NO_SUSPEND, dev_name(&pdev->dev), priv); if (ret != 0) { dev_err(&pdev->dev, "cannot request HS-I2C IRQ %d\n", irq); goto err_clk; From 7077ad2ee316551b4ba9602838d09b67683e20b6 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Mon, 19 Aug 2019 13:28:54 +0530 Subject: [PATCH 0318/2880] i2c: synquacer: Make synquacer_i2c_ops constant Static structure synquacer_i2c_ops, of type i2c_adapter, is only used when it is copied into a field of another structure. It is not itself modified. Hence make it const to protect it from unintended modification. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Acked-by: Ard Biesheuvel Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-synquacer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-synquacer.c b/drivers/i2c/busses/i2c-synquacer.c index f724c8e6b360..39762f0611b1 100644 --- a/drivers/i2c/busses/i2c-synquacer.c +++ b/drivers/i2c/busses/i2c-synquacer.c @@ -526,7 +526,7 @@ static const struct i2c_algorithm synquacer_i2c_algo = { .functionality = synquacer_i2c_functionality, }; -static struct i2c_adapter synquacer_i2c_ops = { +static const struct i2c_adapter synquacer_i2c_ops = { .owner = THIS_MODULE, .name = "synquacer_i2c-adapter", .algo = &synquacer_i2c_algo, From 0a321b97368aa0567f238207eeb19cc8cbb6a7e2 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Sat, 6 Jul 2019 18:49:11 +0530 Subject: [PATCH 0319/2880] i2c: fsi: Add of_put_node() before break Each iteration of for_each_available_childe_of_node puts the previous node, but in the case of a break from the middle of the loop, there is no put, thus causing a memory leak. Add an of_node_put before the break. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Reviewed-by: Eddie James Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-fsi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-fsi.c b/drivers/i2c/busses/i2c-fsi.c index da5eb3960def..e0c256922d4f 100644 --- a/drivers/i2c/busses/i2c-fsi.c +++ b/drivers/i2c/busses/i2c-fsi.c @@ -707,8 +707,10 @@ static int fsi_i2c_probe(struct device *dev) continue; port = kzalloc(sizeof(*port), GFP_KERNEL); - if (!port) + if (!port) { + of_node_put(np); break; + } port->master = i2c; port->port = port_no; From 0e3ff0ac5f71bdb6be2a698de0ed0c7e6e738269 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:14 +0200 Subject: [PATCH 0320/2880] PCI: rockchip: Propagate errors for optional regulators regulator_get_optional() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate data structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "regulator not specified in DT". What we really want is to ignore the optional regulators only if they have not been specified in DT. regulator_get_optional() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Tested-by: Heiko Stuebner Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Reviewed-by: Heiko Stuebner Acked-by: Shawn Lin Cc: Shawn Lin Cc: Heiko Stuebner Cc: linux-rockchip@lists.infradead.org --- drivers/pci/controller/pcie-rockchip-host.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/pcie-rockchip-host.c b/drivers/pci/controller/pcie-rockchip-host.c index 8d20f1793a61..ef8e677ce9d1 100644 --- a/drivers/pci/controller/pcie-rockchip-host.c +++ b/drivers/pci/controller/pcie-rockchip-host.c @@ -608,29 +608,29 @@ static int rockchip_pcie_parse_host_dt(struct rockchip_pcie *rockchip) rockchip->vpcie12v = devm_regulator_get_optional(dev, "vpcie12v"); if (IS_ERR(rockchip->vpcie12v)) { - if (PTR_ERR(rockchip->vpcie12v) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie12v) != -ENODEV) + return PTR_ERR(rockchip->vpcie12v); dev_info(dev, "no vpcie12v regulator found\n"); } rockchip->vpcie3v3 = devm_regulator_get_optional(dev, "vpcie3v3"); if (IS_ERR(rockchip->vpcie3v3)) { - if (PTR_ERR(rockchip->vpcie3v3) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie3v3) != -ENODEV) + return PTR_ERR(rockchip->vpcie3v3); dev_info(dev, "no vpcie3v3 regulator found\n"); } rockchip->vpcie1v8 = devm_regulator_get_optional(dev, "vpcie1v8"); if (IS_ERR(rockchip->vpcie1v8)) { - if (PTR_ERR(rockchip->vpcie1v8) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie1v8) != -ENODEV) + return PTR_ERR(rockchip->vpcie1v8); dev_info(dev, "no vpcie1v8 regulator found\n"); } rockchip->vpcie0v9 = devm_regulator_get_optional(dev, "vpcie0v9"); if (IS_ERR(rockchip->vpcie0v9)) { - if (PTR_ERR(rockchip->vpcie0v9) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie0v9) != -ENODEV) + return PTR_ERR(rockchip->vpcie0v9); dev_info(dev, "no vpcie0v9 regulator found\n"); } From ddd6960087d4b45759434146d681a94bbb1c54ad Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:15 +0200 Subject: [PATCH 0321/2880] PCI: exynos: Propagate errors for optional PHYs devm_of_phy_get() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate devres structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "PHY not specified in DT". What we really want is to ignore the optional PHYs only if they have not been specified in DT. devm_of_phy_get() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Jingoo Han Cc: Kukjin Kim Cc: Krzysztof Kozlowski --- drivers/pci/controller/dwc/pci-exynos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pci-exynos.c b/drivers/pci/controller/dwc/pci-exynos.c index cee5f2f590e2..14a6ba4067fb 100644 --- a/drivers/pci/controller/dwc/pci-exynos.c +++ b/drivers/pci/controller/dwc/pci-exynos.c @@ -465,7 +465,7 @@ static int __init exynos_pcie_probe(struct platform_device *pdev) ep->phy = devm_of_phy_get(dev, np, NULL); if (IS_ERR(ep->phy)) { - if (PTR_ERR(ep->phy) == -EPROBE_DEFER) + if (PTR_ERR(ep->phy) != -ENODEV) return PTR_ERR(ep->phy); ep->phy = NULL; From 2170a09fb4b0f66e06e5bcdcbc98c9ccbf353650 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:16 +0200 Subject: [PATCH 0322/2880] PCI: imx6: Propagate errors for optional regulators regulator_get_optional() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate data structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "regulator not specified in DT". What we really want is to ignore the optional regulators only if they have not been specified in DT. regulator_get_optional() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Richard Zhu Cc: Lucas Stach Cc: Shawn Guo Cc: Sascha Hauer Cc: Fabio Estevam Cc: kernel@pengutronix.de Cc: linux-imx@nxp.com --- drivers/pci/controller/dwc/pci-imx6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 9b5cb5b70389..aabf22eaa6b9 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -1173,8 +1173,8 @@ static int imx6_pcie_probe(struct platform_device *pdev) imx6_pcie->vpcie = devm_regulator_get_optional(&pdev->dev, "vpcie"); if (IS_ERR(imx6_pcie->vpcie)) { - if (PTR_ERR(imx6_pcie->vpcie) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(imx6_pcie->vpcie) != -ENODEV) + return PTR_ERR(imx6_pcie->vpcie); imx6_pcie->vpcie = NULL; } From e7a877b2fa79c57c3eac7d28803279a356844907 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:17 +0200 Subject: [PATCH 0323/2880] PCI: armada8x: Propagate errors for optional PHYs devm_of_phy_get_by_index() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate devres structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "PHY not specified in DT". What we really want is to ignore the optional PHYs only if they have not been specified in DT. devm_of_phy_get_by_index() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Thomas Petazzoni --- drivers/pci/controller/dwc/pcie-armada8k.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-armada8k.c b/drivers/pci/controller/dwc/pcie-armada8k.c index 3d55dc78d999..49596547e8c2 100644 --- a/drivers/pci/controller/dwc/pcie-armada8k.c +++ b/drivers/pci/controller/dwc/pcie-armada8k.c @@ -118,11 +118,10 @@ static int armada8k_pcie_setup_phys(struct armada8k_pcie *pcie) for (i = 0; i < ARMADA8K_PCIE_MAX_LANES; i++) { pcie->phy[i] = devm_of_phy_get_by_index(dev, node, i); - if (IS_ERR(pcie->phy[i]) && - (PTR_ERR(pcie->phy[i]) == -EPROBE_DEFER)) - return PTR_ERR(pcie->phy[i]); - if (IS_ERR(pcie->phy[i])) { + if (PTR_ERR(pcie->phy[i]) != -ENODEV) + return PTR_ERR(pcie->phy[i]); + pcie->phy[i] = NULL; continue; } From 8f9e1641ba445437095411d9fda2324121110d5d Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:18 +0200 Subject: [PATCH 0324/2880] PCI: histb: Propagate errors for optional regulators regulator_get_optional() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate data structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "regulator not specified in DT". What we really want is to ignore the optional regulators only if they have not been specified in DT. regulator_get_optional() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Shawn Guo --- drivers/pci/controller/dwc/pcie-histb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-histb.c b/drivers/pci/controller/dwc/pcie-histb.c index 954bc2b74bbc..811b5c6d62ea 100644 --- a/drivers/pci/controller/dwc/pcie-histb.c +++ b/drivers/pci/controller/dwc/pcie-histb.c @@ -340,8 +340,8 @@ static int histb_pcie_probe(struct platform_device *pdev) hipcie->vpcie = devm_regulator_get_optional(dev, "vpcie"); if (IS_ERR(hipcie->vpcie)) { - if (PTR_ERR(hipcie->vpcie) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(hipcie->vpcie) != -ENODEV) + return PTR_ERR(hipcie->vpcie); hipcie->vpcie = NULL; } From c1e33666c94fda365a321c471f2a82996fdf3d83 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:19 +0200 Subject: [PATCH 0325/2880] PCI: iproc: Propagate errors for optional PHYs devm_phy_get() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate devres structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "PHY not specified in DT". What we really want is to ignore the optional PHYs only if they have not been specified in DT. devm_phy_optional_get() is a function that exactly does what's required here, so use that instead. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Ray Jui Cc: Scott Branden Cc: bcm-kernel-feedback-list@broadcom.com --- drivers/pci/controller/pcie-iproc-platform.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pcie-iproc-platform.c b/drivers/pci/controller/pcie-iproc-platform.c index 5a3550b6bb29..9ee6200a66f4 100644 --- a/drivers/pci/controller/pcie-iproc-platform.c +++ b/drivers/pci/controller/pcie-iproc-platform.c @@ -93,12 +93,9 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev) pcie->need_ib_cfg = of_property_read_bool(np, "dma-ranges"); /* PHY use is optional */ - pcie->phy = devm_phy_get(dev, "pcie-phy"); - if (IS_ERR(pcie->phy)) { - if (PTR_ERR(pcie->phy) == -EPROBE_DEFER) - return -EPROBE_DEFER; - pcie->phy = NULL; - } + pcie->phy = devm_phy_optional_get(dev, "pcie-phy"); + if (IS_ERR(pcie->phy)) + return PTR_ERR(pcie->phy); ret = devm_of_pci_get_host_bridge_resources(dev, 0, 0xff, &resources, &iobase); From 3675f052b43ba51b99b85b073c7070e083f3e6fb Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Jul 2019 20:44:44 +0200 Subject: [PATCH 0326/2880] Smack: Don't ignore other bprm->unsafe flags if LSM_UNSAFE_PTRACE is set There is a logic bug in the current smack_bprm_set_creds(): If LSM_UNSAFE_PTRACE is set, but the ptrace state is deemed to be acceptable (e.g. because the ptracer detached in the meantime), the other ->unsafe flags aren't checked. As far as I can tell, this means that something like the following could work (but I haven't tested it): - task A: create task B with fork() - task B: set NO_NEW_PRIVS - task B: install a seccomp filter that makes open() return 0 under some conditions - task B: replace fd 0 with a malicious library - task A: attach to task B with PTRACE_ATTACH - task B: execve() a file with an SMACK64EXEC extended attribute - task A: while task B is still in the middle of execve(), exit (which destroys the ptrace relationship) Make sure that if any flags other than LSM_UNSAFE_PTRACE are set in bprm->unsafe, we reject the execve(). Cc: stable@vger.kernel.org Fixes: 5663884caab1 ("Smack: unify all ptrace accesses in the smack") Signed-off-by: Jann Horn Signed-off-by: Casey Schaufler --- security/smack/smack_lsm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4c5e5a438f8b..d8b59480a01c 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -937,7 +937,8 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) if (rc != 0) return rc; - } else if (bprm->unsafe) + } + if (bprm->unsafe & ~LSM_UNSAFE_PTRACE) return -EPERM; bsp->smk_task = isp->smk_task; From a1a07f22346144d1e2108f9faa2a41fe67579e85 Mon Sep 17 00:00:00 2001 From: luanshi Date: Fri, 5 Jul 2019 10:35:20 +0800 Subject: [PATCH 0327/2880] smack: fix some kernel-doc notations Fix/add kernel-doc notation and fix typos in security/smack/. Signed-off-by: Liguang Zhang Signed-off-by: Casey Schaufler --- security/smack/smack_lsm.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index d8b59480a01c..336e855e7ab2 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -307,7 +307,7 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip, /** * init_inode_smack - initialize an inode security blob - * @isp: the blob to initialize + * @inode: inode to extract the info from * @skp: a pointer to the Smack label entry to use in the blob * */ @@ -509,7 +509,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) /** * smack_syslog - Smack approval on syslog - * @type: message type + * @typefrom_file: unused * * Returns 0 on success, error code otherwise. */ @@ -765,7 +765,7 @@ static int smack_sb_eat_lsm_opts(char *options, void **mnt_opts) /** * smack_set_mnt_opts - set Smack specific mount options * @sb: the file system superblock - * @opts: Smack mount options + * @mnt_opts: Smack mount options * @kern_flags: mount option from kernel space or user space * @set_kern_flags: where to store converted mount opts * @@ -959,7 +959,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) * smack_inode_alloc_security - allocate an inode blob * @inode: the inode in need of a blob * - * Returns 0 if it gets a blob, -ENOMEM otherwise + * Returns 0 */ static int smack_inode_alloc_security(struct inode *inode) { @@ -1165,7 +1165,7 @@ static int smack_inode_rename(struct inode *old_inode, * * This is the important Smack hook. * - * Returns 0 if access is permitted, -EACCES otherwise + * Returns 0 if access is permitted, an error code otherwise */ static int smack_inode_permission(struct inode *inode, int mask) { @@ -1223,8 +1223,7 @@ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr) /** * smack_inode_getattr - Smack check for getting attributes - * @mnt: vfsmount of the object - * @dentry: the object + * @path: path to extract the info from * * Returns 0 if access is permitted, an error code otherwise */ @@ -1871,14 +1870,13 @@ static int smack_file_receive(struct file *file) /** * smack_file_open - Smack dentry open processing * @file: the object - * @cred: task credential * * Set the security blob in the file structure. * Allow the open only if the task has read access. There are * many read operations (e.g. fstat) that you can do with an * fd even if you have the file open write-only. * - * Returns 0 + * Returns 0 if current has access, error code otherwise */ static int smack_file_open(struct file *file) { @@ -1901,7 +1899,7 @@ static int smack_file_open(struct file *file) /** * smack_cred_alloc_blank - "allocate" blank task-level security credentials - * @new: the new credentials + * @cred: the new credentials * @gfp: the atomicity of any memory allocations * * Prepare a blank set of credentials for modification. This must allocate all @@ -1984,7 +1982,7 @@ static void smack_cred_transfer(struct cred *new, const struct cred *old) /** * smack_cred_getsecid - get the secid corresponding to a creds structure - * @c: the object creds + * @cred: the object creds * @secid: where to put the result * * Sets the secid to contain a u32 version of the smack label. @@ -2141,8 +2139,6 @@ static int smack_task_getioprio(struct task_struct *p) /** * smack_task_setscheduler - Smack check on setting scheduler * @p: the task object - * @policy: unused - * @lp: unused * * Return 0 if read access is permitted */ @@ -2612,8 +2608,9 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address) /** * smk_ipv6_port_check - check Smack port access - * @sock: socket + * @sk: socket * @address: address + * @act: the action being taken * * Create or update the port list entry */ @@ -2783,7 +2780,7 @@ static int smack_socket_post_create(struct socket *sock, int family, * * Cross reference the peer labels for SO_PEERSEC * - * Returns 0 on success, and error code otherwise + * Returns 0 */ static int smack_socket_socketpair(struct socket *socka, struct socket *sockb) @@ -3015,13 +3012,13 @@ static int smack_shm_shmctl(struct kern_ipc_perm *isp, int cmd) * * Returns 0 if current has the requested access, error code otherwise */ -static int smack_shm_shmat(struct kern_ipc_perm *ipc, char __user *shmaddr, +static int smack_shm_shmat(struct kern_ipc_perm *isp, char __user *shmaddr, int shmflg) { int may; may = smack_flags_to_may(shmflg); - return smk_curacc_shm(ipc, may); + return smk_curacc_shm(isp, may); } /** @@ -4763,7 +4760,7 @@ static __init void init_smack_known_list(void) /** * smack_init - initialize the smack system * - * Returns 0 + * Returns 0 on success, -ENOMEM is there's no memory */ static __init int smack_init(void) { From 3f4287e7d98a2954f20bf96c567fdffcd2b63eb9 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Tue, 23 Jul 2019 18:00:15 +0800 Subject: [PATCH 0328/2880] security: smack: Fix possible null-pointer dereferences in smack_socket_sock_rcv_skb() In smack_socket_sock_rcv_skb(), there is an if statement on line 3920 to check whether skb is NULL: if (skb && skb->secmark != 0) This check indicates skb can be NULL in some cases. But on lines 3931 and 3932, skb is used: ad.a.u.net->netif = skb->skb_iif; ipv6_skb_to_auditdata(skb, &ad.a, NULL); Thus, possible null-pointer dereferences may occur when skb is NULL. To fix these possible bugs, an if statement is added to check skb. These bugs are found by a static analysis tool STCheck written by us. Signed-off-by: Jia-Ju Bai Signed-off-by: Casey Schaufler --- security/smack/smack_lsm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 336e855e7ab2..516f7ba96f1e 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3923,6 +3923,8 @@ access_check: skp = smack_ipv6host_label(&sadd); if (skp == NULL) skp = smack_net_ambient; + if (skb == NULL) + break; #ifdef CONFIG_AUDIT smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); ad.a.u.net->family = family; From e5bfad3d7acc5702f32aafeb388362994f4d7bd0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 21 Aug 2019 22:54:41 -0700 Subject: [PATCH 0329/2880] smack: use GFP_NOFS while holding inode_smack::smk_lock inode_smack::smk_lock is taken during smack_d_instantiate(), which is called during a filesystem transaction when creating a file on ext4. Therefore to avoid a deadlock, all code that takes this lock must use GFP_NOFS, to prevent memory reclaim from waiting for the filesystem transaction to complete. Reported-by: syzbot+0eefc1e06a77d327a056@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Casey Schaufler --- security/smack/smack_access.c | 6 +++--- security/smack/smack_lsm.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index f1c93a7be9ec..38ac3da4e791 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -465,7 +465,7 @@ char *smk_parse_smack(const char *string, int len) if (i == 0 || i >= SMK_LONGLABEL) return ERR_PTR(-EINVAL); - smack = kzalloc(i + 1, GFP_KERNEL); + smack = kzalloc(i + 1, GFP_NOFS); if (smack == NULL) return ERR_PTR(-ENOMEM); @@ -500,7 +500,7 @@ int smk_netlbl_mls(int level, char *catset, struct netlbl_lsm_secattr *sap, if ((m & *cp) == 0) continue; rc = netlbl_catmap_setbit(&sap->attr.mls.cat, - cat, GFP_KERNEL); + cat, GFP_NOFS); if (rc < 0) { netlbl_catmap_free(sap->attr.mls.cat); return rc; @@ -536,7 +536,7 @@ struct smack_known *smk_import_entry(const char *string, int len) if (skp != NULL) goto freeout; - skp = kzalloc(sizeof(*skp), GFP_KERNEL); + skp = kzalloc(sizeof(*skp), GFP_NOFS); if (skp == NULL) { skp = ERR_PTR(-ENOMEM); goto freeout; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 516f7ba96f1e..abeb09c30633 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -288,7 +288,7 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip, if (!(ip->i_opflags & IOP_XATTR)) return ERR_PTR(-EOPNOTSUPP); - buffer = kzalloc(SMK_LONGLABEL, GFP_KERNEL); + buffer = kzalloc(SMK_LONGLABEL, GFP_NOFS); if (buffer == NULL) return ERR_PTR(-ENOMEM); From 82d51481544146db2e9c9fb90c8fed92a5d0f93b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ard=C3=B6?= Date: Wed, 4 Sep 2019 11:29:53 +0200 Subject: [PATCH 0330/2880] i2c-eeprom_slave: Add support for more eeprom models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 32 and a 64 kbit memory. These needs 16 bit address so added support for that as well. Signed-off-by: Björn Ardö Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-eeprom.c | 36 +++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c index be65d3842878..773afaabfb61 100644 --- a/drivers/i2c/i2c-slave-eeprom.c +++ b/drivers/i2c/i2c-slave-eeprom.c @@ -11,6 +11,7 @@ * pointer, yet implementation is deferred until the need actually arises. */ +#include #include #include #include @@ -21,12 +22,18 @@ struct eeprom_data { struct bin_attribute bin; - bool first_write; spinlock_t buffer_lock; - u8 buffer_idx; + u16 buffer_idx; + u16 address_mask; + u8 num_address_bytes; + u8 idx_write_cnt; u8 buffer[]; }; +#define I2C_SLAVE_BYTELEN GENMASK(15, 0) +#define I2C_SLAVE_FLAG_ADDR16 BIT(16) +#define I2C_SLAVE_DEVICE_MAGIC(_len, _flags) ((_flags) | (_len)) + static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, enum i2c_slave_event event, u8 *val) { @@ -34,12 +41,14 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, switch (event) { case I2C_SLAVE_WRITE_RECEIVED: - if (eeprom->first_write) { - eeprom->buffer_idx = *val; - eeprom->first_write = false; + if (eeprom->idx_write_cnt < eeprom->num_address_bytes) { + if (eeprom->idx_write_cnt == 0) + eeprom->buffer_idx = 0; + eeprom->buffer_idx = *val | (eeprom->buffer_idx << 8); + eeprom->idx_write_cnt++; } else { spin_lock(&eeprom->buffer_lock); - eeprom->buffer[eeprom->buffer_idx++] = *val; + eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val; spin_unlock(&eeprom->buffer_lock); } break; @@ -50,7 +59,7 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, /* fallthrough */ case I2C_SLAVE_READ_REQUESTED: spin_lock(&eeprom->buffer_lock); - *val = eeprom->buffer[eeprom->buffer_idx]; + *val = eeprom->buffer[eeprom->buffer_idx & eeprom->address_mask]; spin_unlock(&eeprom->buffer_lock); /* * Do not increment buffer_idx here, because we don't know if @@ -61,7 +70,7 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, case I2C_SLAVE_STOP: case I2C_SLAVE_WRITE_REQUESTED: - eeprom->first_write = true; + eeprom->idx_write_cnt = 0; break; default: @@ -105,13 +114,16 @@ static int i2c_slave_eeprom_probe(struct i2c_client *client, const struct i2c_de { struct eeprom_data *eeprom; int ret; - unsigned size = id->driver_data; + unsigned int size = FIELD_GET(I2C_SLAVE_BYTELEN, id->driver_data); + unsigned int flag_addr16 = FIELD_GET(I2C_SLAVE_FLAG_ADDR16, id->driver_data); eeprom = devm_kzalloc(&client->dev, sizeof(struct eeprom_data) + size, GFP_KERNEL); if (!eeprom) return -ENOMEM; - eeprom->first_write = true; + eeprom->idx_write_cnt = 0; + eeprom->num_address_bytes = flag_addr16 ? 2 : 1; + eeprom->address_mask = size - 1; spin_lock_init(&eeprom->buffer_lock); i2c_set_clientdata(client, eeprom); @@ -146,7 +158,9 @@ static int i2c_slave_eeprom_remove(struct i2c_client *client) } static const struct i2c_device_id i2c_slave_eeprom_id[] = { - { "slave-24c02", 2048 / 8 }, + { "slave-24c02", I2C_SLAVE_DEVICE_MAGIC(2048 / 8, 0) }, + { "slave-24c32", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16) }, + { "slave-24c64", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16) }, { } }; MODULE_DEVICE_TABLE(i2c, i2c_slave_eeprom_id); From 539b7569c56523486db95214d62a108532c7960f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 20 Aug 2019 17:34:40 +0200 Subject: [PATCH 0331/2880] i2c: cht-wc: drop check because i2c_unregister_device() is NULL safe No need to check the argument of i2c_unregister_device() because the function itself does it. Signed-off-by: Wolfram Sang Acked-by: Hans de Goede Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-cht-wc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c index f6546de66fbc..b8fde61bb5d8 100644 --- a/drivers/i2c/busses/i2c-cht-wc.c +++ b/drivers/i2c/busses/i2c-cht-wc.c @@ -409,8 +409,7 @@ static int cht_wc_i2c_adap_i2c_remove(struct platform_device *pdev) { struct cht_wc_i2c_adap *adap = platform_get_drvdata(pdev); - if (adap->client) - i2c_unregister_device(adap->client); + i2c_unregister_device(adap->client); i2c_del_adapter(&adap->adapter); irq_domain_remove(adap->irq_domain); From 2252c3172cc5ecfab5aef1057f7c57b39e485f21 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Thu, 15 Aug 2019 11:28:57 +0530 Subject: [PATCH 0332/2880] i2c: stm32f7: Make structure stm32f7_i2c_algo constant Static structure stm32f7_i2c_algo, of type i2c_algorithm, is used only when it is assigned to constant field algo of a variable having type i2c_adapter. As stm32f7_i2c_algo is therefore never modified, make it const as well to protect it from unintended modification. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Acked-by: Pierre-Yves MORDRET Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 266d1c269b83..d36cf08461f7 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -1809,7 +1809,7 @@ static u32 stm32f7_i2c_func(struct i2c_adapter *adap) I2C_FUNC_SMBUS_I2C_BLOCK; } -static struct i2c_algorithm stm32f7_i2c_algo = { +static const struct i2c_algorithm stm32f7_i2c_algo = { .master_xfer = stm32f7_i2c_xfer, .smbus_xfer = stm32f7_i2c_smbus_xfer, .functionality = stm32f7_i2c_func, From 41d529d6227c443a5827cb8b8f040402dedcf3d2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 13 Aug 2019 13:55:54 +0200 Subject: [PATCH 0333/2880] i2c: exynos5: Remove IRQF_ONESHOT The drivers sets IRQF_ONESHOT and passes only a primary handler. The IRQ is masked while the primary is handler is invoked independently of IRQF_ONESHOT. With IRQF_ONESHOT the core code will not force-thread the interrupt and this is probably not intended. I *assume* that the original author copied the IRQ registration from another driver which passed a primary and secondary handler and removed the secondary handler but keeping the ONESHOT flag. Remove IRQF_ONESHOT. Reported-by: Benjamin Rouxel Tested-by: Benjamin Rouxel Signed-off-by: Sebastian Andrzej Siewior Tested-by: Krzysztof Kozlowski Acked-by: Krzysztof Kozlowski Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-exynos5.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-exynos5.c b/drivers/i2c/busses/i2c-exynos5.c index e4e7932f7800..e7514c16b756 100644 --- a/drivers/i2c/busses/i2c-exynos5.c +++ b/drivers/i2c/busses/i2c-exynos5.c @@ -791,9 +791,7 @@ static int exynos5_i2c_probe(struct platform_device *pdev) } ret = devm_request_irq(&pdev->dev, i2c->irq, exynos5_i2c_irq, - IRQF_NO_SUSPEND | IRQF_ONESHOT, - dev_name(&pdev->dev), i2c); - + IRQF_NO_SUSPEND, dev_name(&pdev->dev), i2c); if (ret != 0) { dev_err(&pdev->dev, "cannot request HS-I2C IRQ %d\n", i2c->irq); goto err_clk; From d098913a10f8ef8e6043765d7f2fa552527d9c42 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 5 Sep 2019 07:37:22 -0700 Subject: [PATCH 0334/2880] bus: ti-sysc: Fix clock handling for no-idle quirks NFSroot can fail on dra7 when cpsw is probed using ti-sysc interconnect target module driver as reported by Keerthy. Device clocks and the interconnect target module may or may not be enabled by the bootloader on init, but we currently assume the clocks and module are on from the bootloader for "ti,no-idle" and "ti,no-idle-on-init" quirks as reported by Grygorii Strashko. Let's fix the issue by always enabling clocks init, and never disable them for "ti,no-idle" quirk. For "ti,no-idle-on-init" quirk, we must decrement the usage count later on to allow PM runtime to idle the module if requested. Fixes: 1a5cd7c23cc5 ("bus: ti-sysc: Enable all clocks directly during init to read revision") Cc: Keerthy Cc: Vignesh Raghavendra Reported-by: Keerthy Reported-by: Grygorii Strashko Reviewed-by: Grygorii Strashko Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 48 +++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 2db474ab4c6b..da88de487792 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -1630,17 +1630,19 @@ static int sysc_init_module(struct sysc *ddata) if (error) return error; - if (manage_clocks) { - sysc_clkdm_deny_idle(ddata); + sysc_clkdm_deny_idle(ddata); - error = sysc_enable_opt_clocks(ddata); - if (error) - return error; + /* + * Always enable clocks. The bootloader may or may not have enabled + * the related clocks. + */ + error = sysc_enable_opt_clocks(ddata); + if (error) + return error; - error = sysc_enable_main_clocks(ddata); - if (error) - goto err_opt_clocks; - } + error = sysc_enable_main_clocks(ddata); + if (error) + goto err_opt_clocks; if (!(ddata->cfg.quirks & SYSC_QUIRK_NO_RESET_ON_INIT)) { error = sysc_rstctrl_reset_deassert(ddata, true); @@ -1658,7 +1660,7 @@ static int sysc_init_module(struct sysc *ddata) goto err_main_clocks; } - if (!ddata->legacy_mode && manage_clocks) { + if (!ddata->legacy_mode) { error = sysc_enable_module(ddata->dev); if (error) goto err_main_clocks; @@ -1675,6 +1677,7 @@ err_main_clocks: if (manage_clocks) sysc_disable_main_clocks(ddata); err_opt_clocks: + /* No re-enable of clockdomain autoidle to prevent module autoidle */ if (manage_clocks) { sysc_disable_opt_clocks(ddata); sysc_clkdm_allow_idle(ddata); @@ -2355,6 +2358,28 @@ static void ti_sysc_idle(struct work_struct *work) ddata = container_of(work, struct sysc, idle_work.work); + /* + * One time decrement of clock usage counts if left on from init. + * Note that we disable opt clocks unconditionally in this case + * as they are enabled unconditionally during init without + * considering sysc_opt_clks_needed() at that point. + */ + if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE | + SYSC_QUIRK_NO_IDLE_ON_INIT)) { + sysc_clkdm_deny_idle(ddata); + sysc_disable_main_clocks(ddata); + sysc_disable_opt_clocks(ddata); + sysc_clkdm_allow_idle(ddata); + } + + /* Keep permanent PM runtime usage count for SYSC_QUIRK_NO_IDLE */ + if (ddata->cfg.quirks & SYSC_QUIRK_NO_IDLE) + return; + + /* + * Decrement PM runtime usage count for SYSC_QUIRK_NO_IDLE_ON_INIT + * and SYSC_QUIRK_NO_RESET_ON_INIT + */ if (pm_runtime_active(ddata->dev)) pm_runtime_put_sync(ddata->dev); } @@ -2439,7 +2464,8 @@ static int sysc_probe(struct platform_device *pdev) INIT_DELAYED_WORK(&ddata->idle_work, ti_sysc_idle); /* At least earlycon won't survive without deferred idle */ - if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE_ON_INIT | + if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE | + SYSC_QUIRK_NO_IDLE_ON_INIT | SYSC_QUIRK_NO_RESET_ON_INIT)) { schedule_delayed_work(&ddata->idle_work, 3000); } else { From 244c06c3b6d86513066426c9707ee460cd845349 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Thu, 5 Sep 2019 00:32:26 -0600 Subject: [PATCH 0335/2880] PCI/IOV: Remove group write permission from sriov_numvfs, sriov_drivers_autoprobe Previously the sriov_numvfs and sriov_drivers_autoprobe sysfs files had 0664 permissions, which allowed group write. libvirt runs as root when dealing with PCI, and it chowns files needed by qemu, so group write permission should not be needed. Change these permissions from 0664 to 0644, which is what DEVICE_ATTR_RW() does by default. [bhelgaas: commit log] Link: https://lore.kernel.org/r/20190905063226.43269-1-skunberg.kelsey@gmail.com Link: https://lore.kernel.org/r/850cf536-0b72-d78c-efaf-855dcb391087@redhat.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Acked-by: Donald Dutile --- drivers/pci/iov.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 053054362247..617dccebccf5 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -375,12 +375,11 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev, } static DEVICE_ATTR_RO(sriov_totalvfs); -static DEVICE_ATTR(sriov_numvfs, 0664, sriov_numvfs_show, sriov_numvfs_store); +static DEVICE_ATTR_RW(sriov_numvfs); static DEVICE_ATTR_RO(sriov_offset); static DEVICE_ATTR_RO(sriov_stride); static DEVICE_ATTR_RO(sriov_vf_device); -static DEVICE_ATTR(sriov_drivers_autoprobe, 0664, sriov_drivers_autoprobe_show, - sriov_drivers_autoprobe_store); +static DEVICE_ATTR_RW(sriov_drivers_autoprobe); static struct attribute *sriov_dev_attrs[] = { &dev_attr_sriov_totalvfs.attr, From 7f1c62c443a453deb6eb3515e3c05650ffe0dcf0 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Mon, 26 Aug 2019 00:46:16 +0200 Subject: [PATCH 0336/2880] PCI: Add pci_info_ratelimited() to ratelimit PCI separately Do not use printk_ratelimit() in drivers/pci/pci.c as it shares the rate limiting state with all other callers to the printk_ratelimit(). Add pci_info_ratelimited() (similar to pci_notice_ratelimited() added in the commit a88a7b3eb076 ("vfio: Use dev_printk() when possible")) and use it instead of printk_ratelimit() + pci_info(). Link: https://lore.kernel.org/r/20190825224616.8021-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 4 ++-- include/linux/pci.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 29ed5ec1ac27..08dfb16bd084 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -890,8 +890,8 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); - if (dev->current_state != state && printk_ratelimit()) - pci_info(dev, "Refused to change power state, currently in D%d\n", + if (dev->current_state != state) + pci_info_ratelimited(dev, "Refused to change power state, currently in D%d\n", dev->current_state); /* diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..f5bab312995d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2433,4 +2433,7 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #define pci_notice_ratelimited(pdev, fmt, arg...) \ dev_notice_ratelimited(&(pdev)->dev, fmt, ##arg) +#define pci_info_ratelimited(pdev, fmt, arg...) \ + dev_info_ratelimited(&(pdev)->dev, fmt, ##arg) + #endif /* LINUX_PCI_H */ From 8050f3f6645ae0f7e4c1304593f6f7eb2ee7d85c Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Mon, 26 Aug 2019 17:14:36 +0200 Subject: [PATCH 0337/2880] PCI: Use static const struct, not const static struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the static keyword to the front of declarations of pci_regs_behavior[] and pcie_cap_regs_behavior[], which resolves compiler warnings when building with "W=1": drivers/pci/pci-bridge-emul.c:41:1: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] const static struct pci_bridge_reg_behavior pci_regs_behavior[] = { ^ drivers/pci/pci-bridge-emul.c:176:1: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] const static struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { ^ Link: https://lore.kernel.org/r/20190826151436.4672-1-kw@linux.com Link: https://lore.kernel.org/r/20190828131733.5817-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas Acked-by: Thomas Petazzoni --- drivers/pci/pci-bridge-emul.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index 06083b86d4f4..5fd90105510d 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -38,7 +38,7 @@ struct pci_bridge_reg_behavior { u32 rsvd; }; -const static struct pci_bridge_reg_behavior pci_regs_behavior[] = { +static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { [PCI_VENDOR_ID / 4] = { .ro = ~0 }, [PCI_COMMAND / 4] = { .rw = (PCI_COMMAND_IO | PCI_COMMAND_MEMORY | @@ -173,7 +173,7 @@ const static struct pci_bridge_reg_behavior pci_regs_behavior[] = { }, }; -const static struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { +static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { [PCI_CAP_LIST_ID / 4] = { /* * Capability ID, Next Capability Pointer and From 70aaf61a9b8b86eb08da96344efd1c0f0925ee6e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 22 Aug 2019 10:10:11 -0600 Subject: [PATCH 0338/2880] PCI: Clean up resource_alignment parameter to not require static buffer Clean up the 'resource_alignment' parameter code to use kstrdup() in the initcall routine instead of a static buffer that wastes memory regardless of whether the feature is used. This allows us to drop 'COMMAND_LINE_SIZE' bytes (typically 256-4096 depending on architecture) of static data. This is similar to what has been done for the 'disable_acs_redir' parameter. Link: https://lore.kernel.org/r/20190822161013.5481-2-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 08dfb16bd084..fbfb64ba447d 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5932,8 +5932,7 @@ resource_size_t __weak pcibios_default_alignment(void) return 0; } -#define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE -static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0}; +static char *resource_alignment_param; static DEFINE_SPINLOCK(resource_alignment_lock); /** @@ -5954,7 +5953,7 @@ static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev, spin_lock(&resource_alignment_lock); p = resource_alignment_param; - if (!*p && !align) + if (!p || !*p) goto out; if (pci_has_flag(PCI_PROBE_ONLY)) { align = 0; @@ -6120,21 +6119,25 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev) static ssize_t pci_set_resource_alignment_param(const char *buf, size_t count) { - if (count > RESOURCE_ALIGNMENT_PARAM_SIZE - 1) - count = RESOURCE_ALIGNMENT_PARAM_SIZE - 1; spin_lock(&resource_alignment_lock); - strncpy(resource_alignment_param, buf, count); - resource_alignment_param[count] = '\0'; + + kfree(resource_alignment_param); + resource_alignment_param = kstrndup(buf, count, GFP_KERNEL); + spin_unlock(&resource_alignment_lock); - return count; + + return resource_alignment_param ? count : -ENOMEM; } static ssize_t pci_get_resource_alignment_param(char *buf, size_t size) { - size_t count; + size_t count = 0; + spin_lock(&resource_alignment_lock); - count = snprintf(buf, size, "%s", resource_alignment_param); + if (resource_alignment_param) + count = snprintf(buf, size, "%s", resource_alignment_param); spin_unlock(&resource_alignment_lock); + return count; } @@ -6275,8 +6278,7 @@ static int __init pci_setup(char *str) } else if (!strncmp(str, "cbmemsize=", 10)) { pci_cardbus_mem_size = memparse(str + 10, &str); } else if (!strncmp(str, "resource_alignment=", 19)) { - pci_set_resource_alignment_param(str + 19, - strlen(str + 19)); + resource_alignment_param = str + 19; } else if (!strncmp(str, "ecrc=", 5)) { pcie_ecrc_get_policy(str + 5); } else if (!strncmp(str, "hpiosize=", 9)) { @@ -6311,15 +6313,18 @@ static int __init pci_setup(char *str) early_param("pci", pci_setup); /* - * 'disable_acs_redir_param' is initialized in pci_setup(), above, to point - * to data in the __initdata section which will be freed after the init - * sequence is complete. We can't allocate memory in pci_setup() because some - * architectures do not have any memory allocation service available during - * an early_param() call. So we allocate memory and copy the variable here - * before the init section is freed. + * 'resource_alignment_param' and 'disable_acs_redir_param' are initialized + * in pci_setup(), above, to point to data in the __initdata section which + * will be freed after the init sequence is complete. We can't allocate memory + * in pci_setup() because some architectures do not have any memory allocation + * service available during an early_param() call. So we allocate memory and + * copy the variable here before the init section is freed. + * */ static int __init pci_realloc_setup_params(void) { + resource_alignment_param = kstrdup(resource_alignment_param, + GFP_KERNEL); disable_acs_redir_param = kstrdup(disable_acs_redir_param, GFP_KERNEL); return 0; From 2783d0638a51e8dba6319d9f4a2b445995a4fad1 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 5 Sep 2019 13:01:29 -0700 Subject: [PATCH 0339/2880] bus: ti-sysc: Fix handling of invalid clocks We can currently get "Unable to handle kernel paging request at virtual address" for invalid clocks with dts node but no driver: (__clk_get_hw) from [] (ti_sysc_find_one_clockdomain+0x18/0x34) (ti_sysc_find_one_clockdomain) from [] (ti_sysc_clkdm_init+0x34/0xdc) (ti_sysc_clkdm_init) from [] (sysc_probe+0xa50/0x10e8) (sysc_probe) from [] (platform_drv_probe+0x58/0xa8) Let's add IS_ERR checks to ti_sysc_clkdm_init() as And let's start treating clk_get() with -ENOENT as a proper error. If the clock name is specified in device tree we must succeed with clk_get() to continue. For modules with no clock names specified in device tree we will just ignore the clocks. Fixes: 2b2f7def058a ("bus: ti-sysc: Add support for missing clockdomain handling") Acked-by: Roger Quadros Tested-by: Keerthy Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/pdata-quirks.c | 4 ++-- drivers/bus/ti-sysc.c | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index 6c6f8fce854e..d942a3357090 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -491,11 +491,11 @@ static int ti_sysc_clkdm_init(struct device *dev, struct clk *fck, struct clk *ick, struct ti_sysc_cookie *cookie) { - if (fck) + if (!IS_ERR(fck)) cookie->clkdm = ti_sysc_find_one_clockdomain(fck); if (cookie->clkdm) return 0; - if (ick) + if (!IS_ERR(ick)) cookie->clkdm = ti_sysc_find_one_clockdomain(ick); if (cookie->clkdm) return 0; diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index da88de487792..24583d82b584 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -280,9 +280,6 @@ static int sysc_get_one_clock(struct sysc *ddata, const char *name) ddata->clocks[index] = devm_clk_get(ddata->dev, name); if (IS_ERR(ddata->clocks[index])) { - if (PTR_ERR(ddata->clocks[index]) == -ENOENT) - return 0; - dev_err(ddata->dev, "clock get error for %s: %li\n", name, PTR_ERR(ddata->clocks[index])); @@ -357,7 +354,7 @@ static int sysc_get_clocks(struct sysc *ddata) continue; error = sysc_get_one_clock(ddata, name); - if (error && error != -ENOENT) + if (error) return error; } From fe050f99072d6b5175c35427a6f72846790441ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ard=C3=B6?= Date: Thu, 5 Sep 2019 16:50:26 +0200 Subject: [PATCH 0340/2880] i2c: slave-eeprom: Add comment about address handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The behaviour of the EEPROM in the case where we only send an 8bit address to a 16bit address EEPROM is not defined. Added comment about that the slave-eeprom might behave differently from how an actual device does (only one model measured). Reported-by: Wolfram Sang Signed-off-by: Björn Ardö Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-eeprom.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c index 773afaabfb61..92ff9991bae8 100644 --- a/drivers/i2c/i2c-slave-eeprom.c +++ b/drivers/i2c/i2c-slave-eeprom.c @@ -11,6 +11,12 @@ * pointer, yet implementation is deferred until the need actually arises. */ +/* + * FIXME: What to do if only 8 bits of a 16 bit address are sent? + * The ST-M24C64 sends only 0xff then. Needs verification with other + * EEPROMs, though. We currently use the 8 bit as a valid address. + */ + #include #include #include From 22ac74a6194798ce131c55ff0dfbb9697650e626 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 Sep 2019 12:45:32 +0900 Subject: [PATCH 0341/2880] i2c: uniphier(-f): use devm_platform_ioremap_resource() Replace the chain of platform_get_resource() and devm_ioremap_resource() with devm_platform_ioremap_resource(). This allows to remove the local variable for (struct resource *), and have one function call less. Signed-off-by: Masahiro Yamada Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-uniphier-f.c | 4 +--- drivers/i2c/busses/i2c-uniphier.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-uniphier-f.c b/drivers/i2c/busses/i2c-uniphier-f.c index 7acca2599f04..fc5354845ffa 100644 --- a/drivers/i2c/busses/i2c-uniphier-f.c +++ b/drivers/i2c/busses/i2c-uniphier-f.c @@ -538,7 +538,6 @@ static int uniphier_fi2c_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct uniphier_fi2c_priv *priv; - struct resource *regs; u32 bus_speed; unsigned long clk_rate; int irq, ret; @@ -547,8 +546,7 @@ static int uniphier_fi2c_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); - priv->membase = devm_ioremap_resource(dev, regs); + priv->membase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->membase)) return PTR_ERR(priv->membase); diff --git a/drivers/i2c/busses/i2c-uniphier.c b/drivers/i2c/busses/i2c-uniphier.c index 0173840c32af..a6d7a3709051 100644 --- a/drivers/i2c/busses/i2c-uniphier.c +++ b/drivers/i2c/busses/i2c-uniphier.c @@ -326,7 +326,6 @@ static int uniphier_i2c_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct uniphier_i2c_priv *priv; - struct resource *regs; u32 bus_speed; unsigned long clk_rate; int irq, ret; @@ -335,8 +334,7 @@ static int uniphier_i2c_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); - priv->membase = devm_ioremap_resource(dev, regs); + priv->membase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->membase)) return PTR_ERR(priv->membase); From 9ee7e72fbbb835ba92b2cbf4349997adf065eab0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 Sep 2019 13:46:48 +0900 Subject: [PATCH 0342/2880] i2c: uniphier(-f): remove all dev_dbg() I have fixed various bugs, and these drivers are (I hope) pretty stable now. Remove all dev_dbg() for code clean-up. If I end up with debugging the drivers again, I will locally revert this commit. I no longer need the debug code in upstream. Signed-off-by: Masahiro Yamada Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-uniphier-f.c | 22 +--------------------- drivers/i2c/busses/i2c-uniphier.c | 18 +++--------------- 2 files changed, 4 insertions(+), 36 deletions(-) diff --git a/drivers/i2c/busses/i2c-uniphier-f.c b/drivers/i2c/busses/i2c-uniphier-f.c index fc5354845ffa..4241aac79e7e 100644 --- a/drivers/i2c/busses/i2c-uniphier-f.c +++ b/drivers/i2c/busses/i2c-uniphier-f.c @@ -108,7 +108,6 @@ static void uniphier_fi2c_fill_txfifo(struct uniphier_fi2c_priv *priv, if (fifo_space-- <= 0) break; - dev_dbg(&priv->adap.dev, "write data: %02x\n", *priv->buf); writel(*priv->buf++, priv->membase + UNIPHIER_FI2C_DTTX); priv->len--; } @@ -124,7 +123,6 @@ static void uniphier_fi2c_drain_rxfifo(struct uniphier_fi2c_priv *priv) break; *priv->buf++ = readl(priv->membase + UNIPHIER_FI2C_DTRX); - dev_dbg(&priv->adap.dev, "read data: %02x\n", priv->buf[-1]); priv->len--; } } @@ -142,8 +140,6 @@ static void uniphier_fi2c_clear_irqs(struct uniphier_fi2c_priv *priv, static void uniphier_fi2c_stop(struct uniphier_fi2c_priv *priv) { - dev_dbg(&priv->adap.dev, "stop condition\n"); - priv->enabled_irqs |= UNIPHIER_FI2C_INT_STOP; uniphier_fi2c_set_irqs(priv); writel(UNIPHIER_FI2C_CR_MST | UNIPHIER_FI2C_CR_STO, @@ -160,21 +156,15 @@ static irqreturn_t uniphier_fi2c_interrupt(int irq, void *dev_id) irq_status = readl(priv->membase + UNIPHIER_FI2C_INT); irq_status &= priv->enabled_irqs; - dev_dbg(&priv->adap.dev, - "interrupt: enabled_irqs=%04x, irq_status=%04x\n", - priv->enabled_irqs, irq_status); - if (irq_status & UNIPHIER_FI2C_INT_STOP) goto complete; if (unlikely(irq_status & UNIPHIER_FI2C_INT_AL)) { - dev_dbg(&priv->adap.dev, "arbitration lost\n"); priv->error = -EAGAIN; goto complete; } if (unlikely(irq_status & UNIPHIER_FI2C_INT_NA)) { - dev_dbg(&priv->adap.dev, "could not get ACK\n"); priv->error = -ENXIO; if (priv->flags & UNIPHIER_FI2C_RD) { /* @@ -215,18 +205,14 @@ static irqreturn_t uniphier_fi2c_interrupt(int irq, void *dev_id) if (unlikely(priv->flags & UNIPHIER_FI2C_MANUAL_NACK)) { if (priv->len <= UNIPHIER_FI2C_FIFO_SIZE && !(priv->flags & UNIPHIER_FI2C_BYTE_WISE)) { - dev_dbg(&priv->adap.dev, - "enable read byte count IRQ\n"); priv->enabled_irqs |= UNIPHIER_FI2C_INT_RB; uniphier_fi2c_set_irqs(priv); priv->flags |= UNIPHIER_FI2C_BYTE_WISE; } - if (priv->len <= 1) { - dev_dbg(&priv->adap.dev, "set NACK\n"); + if (priv->len <= 1) writel(UNIPHIER_FI2C_CR_MST | UNIPHIER_FI2C_CR_NACK, priv->membase + UNIPHIER_FI2C_CR); - } } goto handled; @@ -334,10 +320,6 @@ static int uniphier_fi2c_master_xfer_one(struct i2c_adapter *adap, bool is_read = msg->flags & I2C_M_RD; unsigned long time_left, flags; - dev_dbg(&adap->dev, "%s: addr=0x%02x, len=%d, repeat=%d, stop=%d\n", - is_read ? "receive" : "transmit", msg->addr, msg->len, - repeat, stop); - priv->len = msg->len; priv->buf = msg->buf; priv->enabled_irqs = UNIPHIER_FI2C_INT_FAULTS; @@ -359,7 +341,6 @@ static int uniphier_fi2c_master_xfer_one(struct i2c_adapter *adap, else uniphier_fi2c_tx_init(priv, msg->addr, repeat); - dev_dbg(&adap->dev, "start condition\n"); /* * For a repeated START condition, writing a slave address to the FIFO * kicks the controller. So, the UNIPHIER_FI2C_CR register should be @@ -383,7 +364,6 @@ static int uniphier_fi2c_master_xfer_one(struct i2c_adapter *adap, uniphier_fi2c_recover(priv); return -ETIMEDOUT; } - dev_dbg(&adap->dev, "complete\n"); if (unlikely(priv->flags & UNIPHIER_FI2C_DEFER_STOP_COMP)) { u32 status; diff --git a/drivers/i2c/busses/i2c-uniphier.c b/drivers/i2c/busses/i2c-uniphier.c index a6d7a3709051..0270090c0360 100644 --- a/drivers/i2c/busses/i2c-uniphier.c +++ b/drivers/i2c/busses/i2c-uniphier.c @@ -71,7 +71,6 @@ static int uniphier_i2c_xfer_byte(struct i2c_adapter *adap, u32 txdata, reinit_completion(&priv->comp); txdata |= UNIPHIER_I2C_DTRM_IRQEN; - dev_dbg(&adap->dev, "write data: 0x%04x\n", txdata); writel(txdata, priv->membase + UNIPHIER_I2C_DTRM); time_left = wait_for_completion_timeout(&priv->comp, adap->timeout); @@ -81,8 +80,6 @@ static int uniphier_i2c_xfer_byte(struct i2c_adapter *adap, u32 txdata, } rxdata = readl(priv->membase + UNIPHIER_I2C_DREC); - dev_dbg(&adap->dev, "read data: 0x%04x\n", rxdata); - if (rxdatap) *rxdatap = rxdata; @@ -98,14 +95,11 @@ static int uniphier_i2c_send_byte(struct i2c_adapter *adap, u32 txdata) if (ret) return ret; - if (unlikely(rxdata & UNIPHIER_I2C_DREC_LAB)) { - dev_dbg(&adap->dev, "arbitration lost\n"); + if (unlikely(rxdata & UNIPHIER_I2C_DREC_LAB)) return -EAGAIN; - } - if (unlikely(rxdata & UNIPHIER_I2C_DREC_LRB)) { - dev_dbg(&adap->dev, "could not get ACK\n"); + + if (unlikely(rxdata & UNIPHIER_I2C_DREC_LRB)) return -ENXIO; - } return 0; } @@ -115,7 +109,6 @@ static int uniphier_i2c_tx(struct i2c_adapter *adap, u16 addr, u16 len, { int ret; - dev_dbg(&adap->dev, "start condition\n"); ret = uniphier_i2c_send_byte(adap, addr << 1 | UNIPHIER_I2C_DTRM_STA | UNIPHIER_I2C_DTRM_NACK); @@ -137,7 +130,6 @@ static int uniphier_i2c_rx(struct i2c_adapter *adap, u16 addr, u16 len, { int ret; - dev_dbg(&adap->dev, "start condition\n"); ret = uniphier_i2c_send_byte(adap, addr << 1 | UNIPHIER_I2C_DTRM_STA | UNIPHIER_I2C_DTRM_NACK | @@ -161,7 +153,6 @@ static int uniphier_i2c_rx(struct i2c_adapter *adap, u16 addr, u16 len, static int uniphier_i2c_stop(struct i2c_adapter *adap) { - dev_dbg(&adap->dev, "stop condition\n"); return uniphier_i2c_send_byte(adap, UNIPHIER_I2C_DTRM_STO | UNIPHIER_I2C_DTRM_NACK); } @@ -173,9 +164,6 @@ static int uniphier_i2c_master_xfer_one(struct i2c_adapter *adap, bool recovery = false; int ret; - dev_dbg(&adap->dev, "%s: addr=0x%02x, len=%d, stop=%d\n", - is_read ? "receive" : "transmit", msg->addr, msg->len, stop); - if (is_read) ret = uniphier_i2c_rx(adap, msg->addr, msg->len, msg->buf); else From 688033f52d7192ea8ddf617b86af0a4120247b42 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 3 Sep 2019 14:10:18 +0300 Subject: [PATCH 0343/2880] PCI: pciehp: Add pciehp_set_indicators() to set both indicators Add pciehp_set_indicators() to set power and attention indicators with a single register write. This is a minor optimization because we frequently set both indicators and this can do it with a single command. It also reduces the number of interfaces related to the indicators and makes them more discoverable because callers use the PCI_EXP_SLTCTL_ATTN_IND_* and PCI_EXP_SLTCTL_PWR_IND_* definitions directly. [bhelgaas: extend commit log, s/PCI_EXP_SLTCTL_.*_IND_NONE/INDICATOR_NOOP/ so they don't look like things defined by the spec, add function doc, mask commands to make it obvious we only send valid commands (pcie_do_write_cmd() does mask it, but requires more effort to verify)] Link: https://lore.kernel.org/r/20190903111021.1559-2-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/pciehp.h | 3 +++ drivers/pci/hotplug/pciehp_hpc.c | 36 ++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 8c51a04b8083..2ca0f35a6a23 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -167,6 +167,9 @@ int pciehp_power_on_slot(struct controller *ctrl); void pciehp_power_off_slot(struct controller *ctrl); void pciehp_get_power_status(struct controller *ctrl, u8 *status); +#define INDICATOR_NOOP -1 /* Leave indicator unchanged */ +void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn); + void pciehp_set_attention_status(struct controller *ctrl, u8 status); void pciehp_get_latch_status(struct controller *ctrl, u8 *status); int pciehp_query_power_fault(struct controller *ctrl); diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index bd990e3371e3..d7ebcd7111cb 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -443,6 +443,42 @@ void pciehp_set_attention_status(struct controller *ctrl, u8 value) pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, slot_cmd); } +/** + * pciehp_set_indicators() - set attention indicator, power indicator, or both + * @ctrl: PCIe hotplug controller + * @pwr: one of: + * PCI_EXP_SLTCTL_PWR_IND_ON + * PCI_EXP_SLTCTL_PWR_IND_BLINK + * PCI_EXP_SLTCTL_PWR_IND_OFF + * @attn: one of: + * PCI_EXP_SLTCTL_ATTN_IND_ON + * PCI_EXP_SLTCTL_ATTN_IND_BLINK + * PCI_EXP_SLTCTL_ATTN_IND_OFF + * + * Either @pwr or @attn can also be INDICATOR_NOOP to leave that indicator + * unchanged. + */ +void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn) +{ + u16 cmd = 0, mask = 0; + + if (PWR_LED(ctrl) && pwr != INDICATOR_NOOP) { + cmd |= (pwr & PCI_EXP_SLTCTL_PIC); + mask |= PCI_EXP_SLTCTL_PIC; + } + + if (ATTN_LED(ctrl) && attn != INDICATOR_NOOP) { + cmd |= (attn & PCI_EXP_SLTCTL_AIC); + mask |= PCI_EXP_SLTCTL_AIC; + } + + if (cmd) { + pcie_write_cmd_nowait(ctrl, cmd, mask); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, + pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, cmd); + } +} + void pciehp_green_led_on(struct controller *ctrl) { if (!PWR_LED(ctrl)) From 94719ba090e2c111272eb97c053c6cc47a7b8856 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 3 Sep 2019 14:10:19 +0300 Subject: [PATCH 0344/2880] PCI: pciehp: Combine adjacent indicator updates Combine adjacent updates of power and attention indicators into a single pciehp_set_indicators() call. This sends one command to the hotplug controller instead of two. Link: https://lore.kernel.org/r/20190903111021.1559-3-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Lukas Wunner --- drivers/pci/hotplug/pciehp_ctrl.c | 26 +++++++++++++++----------- drivers/pci/hotplug/pciehp_hpc.c | 4 ++-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index 631ced0ab28a..b10c71493ffa 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -30,7 +30,10 @@ static void set_slot_off(struct controller *ctrl) { - /* turn off slot, turn on Amber LED, turn off Green LED if supported*/ + /* + * Turn off slot, turn on attention indicator, turn off power + * indicator + */ if (POWER_CTRL(ctrl)) { pciehp_power_off_slot(ctrl); @@ -42,8 +45,8 @@ static void set_slot_off(struct controller *ctrl) msleep(1000); } - pciehp_green_led_off(ctrl); - pciehp_set_attention_status(ctrl, 1); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + PCI_EXP_SLTCTL_ATTN_IND_ON); } /** @@ -90,8 +93,8 @@ static int board_added(struct controller *ctrl) } } - pciehp_green_led_on(ctrl); - pciehp_set_attention_status(ctrl, 0); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON, + PCI_EXP_SLTCTL_ATTN_IND_OFF); return 0; err_exit: @@ -171,9 +174,9 @@ void pciehp_handle_button_press(struct controller *ctrl) ctrl_info(ctrl, "Slot(%s) Powering on due to button press\n", slot_name(ctrl)); } - /* blink green LED and turn off amber */ - pciehp_green_led_blink(ctrl); - pciehp_set_attention_status(ctrl, 0); + /* blink power indicator and turn off attention */ + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK, + PCI_EXP_SLTCTL_ATTN_IND_OFF); schedule_delayed_work(&ctrl->button_work, 5 * HZ); break; case BLINKINGOFF_STATE: @@ -187,12 +190,13 @@ void pciehp_handle_button_press(struct controller *ctrl) cancel_delayed_work(&ctrl->button_work); if (ctrl->state == BLINKINGOFF_STATE) { ctrl->state = ON_STATE; - pciehp_green_led_on(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON, + PCI_EXP_SLTCTL_ATTN_IND_OFF); } else { ctrl->state = OFF_STATE; - pciehp_green_led_off(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + PCI_EXP_SLTCTL_ATTN_IND_OFF); } - pciehp_set_attention_status(ctrl, 0); ctrl_info(ctrl, "Slot(%s): Action canceled due to button press\n", slot_name(ctrl)); break; diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index d7ebcd7111cb..a900bfd22447 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -674,8 +674,8 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) if ((events & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) { ctrl->power_fault_detected = 1; ctrl_err(ctrl, "Slot(%s): Power fault\n", slot_name(ctrl)); - pciehp_set_attention_status(ctrl, 1); - pciehp_green_led_off(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + PCI_EXP_SLTCTL_ATTN_IND_ON); } /* From 106feb2fdced0c240ca8ecb3d5db91b2b64e9a48 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 3 Sep 2019 14:10:20 +0300 Subject: [PATCH 0345/2880] PCI: pciehp: Remove pciehp_set_attention_status() Remove pciehp_set_attention_status() and use pciehp_set_indicators() instead, since the code is mostly the same. Link: https://lore.kernel.org/r/20190903111021.1559-4-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/hotplug/pciehp.h | 1 - drivers/pci/hotplug/pciehp_core.c | 9 +++++++-- drivers/pci/hotplug/pciehp_hpc.c | 25 ------------------------- include/uapi/linux/pci_regs.h | 1 + 4 files changed, 8 insertions(+), 28 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 2ca0f35a6a23..1f69f43a3f29 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -170,7 +170,6 @@ void pciehp_get_power_status(struct controller *ctrl, u8 *status); #define INDICATOR_NOOP -1 /* Leave indicator unchanged */ void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn); -void pciehp_set_attention_status(struct controller *ctrl, u8 status); void pciehp_get_latch_status(struct controller *ctrl, u8 *status); int pciehp_query_power_fault(struct controller *ctrl); void pciehp_green_led_on(struct controller *ctrl); diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 6ad0d86762cb..b3122c151b80 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -95,15 +95,20 @@ static void cleanup_slot(struct controller *ctrl) } /* - * set_attention_status - Turns the Amber LED for a slot on, off or blink + * set_attention_status - Turns the Attention Indicator on, off or blinking */ static int set_attention_status(struct hotplug_slot *hotplug_slot, u8 status) { struct controller *ctrl = to_ctrl(hotplug_slot); struct pci_dev *pdev = ctrl->pcie->port; + if (status) + status <<= PCI_EXP_SLTCTL_ATTN_IND_SHIFT; + else + status = PCI_EXP_SLTCTL_ATTN_IND_OFF; + pci_config_pm_runtime_get(pdev); - pciehp_set_attention_status(ctrl, status); + pciehp_set_indicators(ctrl, INDICATOR_NOOP, status); pci_config_pm_runtime_put(pdev); return 0; } diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index a900bfd22447..6527345ccd8e 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -418,31 +418,6 @@ int pciehp_set_raw_indicator_status(struct hotplug_slot *hotplug_slot, return 0; } -void pciehp_set_attention_status(struct controller *ctrl, u8 value) -{ - u16 slot_cmd; - - if (!ATTN_LED(ctrl)) - return; - - switch (value) { - case 0: /* turn off */ - slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_OFF; - break; - case 1: /* turn on */ - slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_ON; - break; - case 2: /* turn blink */ - slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_BLINK; - break; - default: - return; - } - pcie_write_cmd_nowait(ctrl, slot_cmd, PCI_EXP_SLTCTL_AIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, slot_cmd); -} - /** * pciehp_set_indicators() - set attention indicator, power indicator, or both * @ctrl: PCIe hotplug controller diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f28e562d7ca8..de3e58afc564 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -591,6 +591,7 @@ #define PCI_EXP_SLTCTL_CCIE 0x0010 /* Command Completed Interrupt Enable */ #define PCI_EXP_SLTCTL_HPIE 0x0020 /* Hot-Plug Interrupt Enable */ #define PCI_EXP_SLTCTL_AIC 0x00c0 /* Attention Indicator Control */ +#define PCI_EXP_SLTCTL_ATTN_IND_SHIFT 6 /* Attention Indicator shift */ #define PCI_EXP_SLTCTL_ATTN_IND_ON 0x0040 /* Attention Indicator on */ #define PCI_EXP_SLTCTL_ATTN_IND_BLINK 0x0080 /* Attention Indicator blinking */ #define PCI_EXP_SLTCTL_ATTN_IND_OFF 0x00c0 /* Attention Indicator off */ From 9194094be418f4f13fbab3a6049ea18acb137178 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 3 Sep 2019 14:10:21 +0300 Subject: [PATCH 0346/2880] PCI: pciehp: Remove pciehp_green_led_{on,off,blink}() Remove pciehp_green_led_{on,off,blink}() and use pciehp_set_indicators() instead, since the code is mostly the same. [bhelgaas: drop set_power_indicator() wrapper to reduce the number of interfaces] Link: https://lore.kernel.org/r/20190903111021.1559-5-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/hotplug/pciehp.h | 3 --- drivers/pci/hotplug/pciehp_ctrl.c | 11 ++++++---- drivers/pci/hotplug/pciehp_hpc.c | 36 ------------------------------- 3 files changed, 7 insertions(+), 43 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 1f69f43a3f29..01706448b04a 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -172,9 +172,6 @@ void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn); void pciehp_get_latch_status(struct controller *ctrl, u8 *status); int pciehp_query_power_fault(struct controller *ctrl); -void pciehp_green_led_on(struct controller *ctrl); -void pciehp_green_led_off(struct controller *ctrl); -void pciehp_green_led_blink(struct controller *ctrl); bool pciehp_card_present(struct controller *ctrl); bool pciehp_card_present_or_link_active(struct controller *ctrl); int pciehp_check_link_status(struct controller *ctrl); diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index b10c71493ffa..71141abfba70 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -68,7 +68,8 @@ static int board_added(struct controller *ctrl) return retval; } - pciehp_green_led_blink(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK, + INDICATOR_NOOP); /* Check link training status */ retval = pciehp_check_link_status(ctrl); @@ -126,8 +127,8 @@ static void remove_board(struct controller *ctrl, bool safe_removal) &ctrl->pending_events); } - /* turn off Green LED */ - pciehp_green_led_off(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + INDICATOR_NOOP); } static int pciehp_enable_slot(struct controller *ctrl); @@ -314,7 +315,9 @@ static int pciehp_enable_slot(struct controller *ctrl) pm_runtime_get_sync(&ctrl->pcie->port->dev); ret = __pciehp_enable_slot(ctrl); if (ret && ATTN_BUTTN(ctrl)) - pciehp_green_led_off(ctrl); /* may be blinking */ + /* may be blinking */ + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + INDICATOR_NOOP); pm_runtime_put(&ctrl->pcie->port->dev); mutex_lock(&ctrl->state_lock); diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 6527345ccd8e..1a522c1c4177 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -454,42 +454,6 @@ void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn) } } -void pciehp_green_led_on(struct controller *ctrl) -{ - if (!PWR_LED(ctrl)) - return; - - pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON, - PCI_EXP_SLTCTL_PIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, - PCI_EXP_SLTCTL_PWR_IND_ON); -} - -void pciehp_green_led_off(struct controller *ctrl) -{ - if (!PWR_LED(ctrl)) - return; - - pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, - PCI_EXP_SLTCTL_PIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, - PCI_EXP_SLTCTL_PWR_IND_OFF); -} - -void pciehp_green_led_blink(struct controller *ctrl) -{ - if (!PWR_LED(ctrl)) - return; - - pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK, - PCI_EXP_SLTCTL_PIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, - PCI_EXP_SLTCTL_PWR_IND_BLINK); -} - int pciehp_power_on_slot(struct controller *ctrl) { struct pci_dev *pdev = ctrl_dev(ctrl); From 4a06c2c38235d4a0bc436131591e2927288992fe Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 5 Sep 2019 15:52:24 -0500 Subject: [PATCH 0347/2880] PCI: pciehp: Refer to "Indicators" instead of "LEDs" in comments The PCIe spec doesn't mention "green LEDs" or "amber LEDs". Replace those terms with "Power Indicator" and "Attention Indicator" so the comments match the spec language. Comment changes only. Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/pciehp.h | 4 ++-- drivers/pci/hotplug/pciehp_ctrl.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 01706448b04a..654c972b8ea0 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -110,9 +110,9 @@ struct controller { * * @OFF_STATE: slot is powered off, no subordinate devices are enumerated * @BLINKINGON_STATE: slot will be powered on after the 5 second delay, - * green led is blinking + * Power Indicator is blinking * @BLINKINGOFF_STATE: slot will be powered off after the 5 second delay, - * green led is blinking + * Power Indicator is blinking * @POWERON_STATE: slot is currently powering on * @POWEROFF_STATE: slot is currently powering off * @ON_STATE: slot is powered on, subordinate devices have been enumerated diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index 71141abfba70..21af7b16d7a4 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -104,7 +104,7 @@ err_exit: } /** - * remove_board - Turns off slot and LEDs + * remove_board - Turn off slot and Power Indicator * @ctrl: PCIe hotplug controller where board is being removed * @safe_removal: whether the board is safely removed (versus surprise removed) */ From 273b177cac4b649c3c6d448e85bbc64cebfe7a0a Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 22 Aug 2019 10:10:12 -0600 Subject: [PATCH 0348/2880] PCI: Move pci_[get|set]_resource_alignment_param() into their callers Both the functions pci_get_resource_alignment_param() and pci_set_resource_alignment_param() are now only called in one place: resource_alignment_show() and resource_alignment_store() respectively. There is no value in this extra set of functions so move both into their callers respectively. [bhelgaas: fold in "GFP_KERNEL while atomic" fix from Christoph Hellwig https://lore.kernel.org/r/20190902075006.GB754@infradead.org] Link: https://lore.kernel.org/r/20190822161013.5481-3-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index fbfb64ba447d..ff3abc577a25 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6117,39 +6117,31 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev) } } -static ssize_t pci_set_resource_alignment_param(const char *buf, size_t count) -{ - spin_lock(&resource_alignment_lock); - - kfree(resource_alignment_param); - resource_alignment_param = kstrndup(buf, count, GFP_KERNEL); - - spin_unlock(&resource_alignment_lock); - - return resource_alignment_param ? count : -ENOMEM; -} - -static ssize_t pci_get_resource_alignment_param(char *buf, size_t size) +static ssize_t resource_alignment_show(struct bus_type *bus, char *buf) { size_t count = 0; spin_lock(&resource_alignment_lock); if (resource_alignment_param) - count = snprintf(buf, size, "%s", resource_alignment_param); + count = snprintf(buf, PAGE_SIZE, "%s", resource_alignment_param); spin_unlock(&resource_alignment_lock); return count; } -static ssize_t resource_alignment_show(struct bus_type *bus, char *buf) -{ - return pci_get_resource_alignment_param(buf, PAGE_SIZE); -} - static ssize_t resource_alignment_store(struct bus_type *bus, const char *buf, size_t count) { - return pci_set_resource_alignment_param(buf, count); + char *param = kstrndup(buf, count, GFP_KERNEL); + + if (!param) + return -ENOMEM; + + spin_lock(&resource_alignment_lock); + kfree(resource_alignment_param); + resource_alignment_param = param; + spin_unlock(&resource_alignment_lock); + return count; } static BUS_ATTR_RW(resource_alignment); From e499081da1a247a8a5a234211c7127fb0b91ca92 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 22 Aug 2019 10:10:13 -0600 Subject: [PATCH 0349/2880] PCI: Force trailing new line to resource_alignment_param in sysfs When 'pci=resource_alignment=' is specified on the command line, there is no trailing new line. Then, when it's read through the corresponding sysfs attribute, there will be no newline and a cat command will not show correctly in a shell. If the parameter is set through sysfs a new line will be stored and it will 'cat' correctly. To solve this, append a new line character in the show function if one does not already exist. Link: https://lore.kernel.org/r/20190822161013.5481-4-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index ff3abc577a25..5f70ff1031d2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6126,6 +6126,16 @@ static ssize_t resource_alignment_show(struct bus_type *bus, char *buf) count = snprintf(buf, PAGE_SIZE, "%s", resource_alignment_param); spin_unlock(&resource_alignment_lock); + /* + * When set by the command line, resource_alignment_param will not + * have a trailing line feed, which is ugly. So conditionally add + * it here. + */ + if (count >= 2 && buf[count - 2] != '\n' && count < PAGE_SIZE - 1) { + buf[count - 1] = '\n'; + buf[count++] = 0; + } + return count; } From 46b2c32df7a462d0e64b68c513e5c4c1b2a399a7 Mon Sep 17 00:00:00 2001 From: Abhinav Ratna Date: Tue, 20 Aug 2019 10:09:45 +0530 Subject: [PATCH 0350/2880] PCI: Add ACS quirk for iProc PAXB iProc PAXB Root Ports don't advertise an ACS capability, but they do not allow peer-to-peer transactions between Root Ports. Add an ACS quirk so each Root Port can be in a separate IOMMU group. [bhelgaas: commit log, comment, use common implementation style] Link: https://lore.kernel.org/r/1566275985-25670-1-git-send-email-srinath.mannam@broadcom.com Signed-off-by: Abhinav Ratna Signed-off-by: Srinath Mannam Signed-off-by: Bjorn Helgaas Acked-by: Scott Branden --- drivers/pci/quirks.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 208aacf39329..2edbce35e8c5 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4466,6 +4466,19 @@ static int pci_quirk_mf_endpoint_acs(struct pci_dev *dev, u16 acs_flags) return acs_flags ? 0 : 1; } +static int pci_quirk_brcm_acs(struct pci_dev *dev, u16 acs_flags) +{ + /* + * iProc PAXB Root Ports don't advertise an ACS capability, but + * they do not allow peer-to-peer transactions between Root Ports. + * Allow each Root Port to be in a separate IOMMU group by masking + * SV/RR/CR/UF bits. + */ + acs_flags &= ~(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); + + return acs_flags ? 0 : 1; +} + static const struct pci_dev_acs_enabled { u16 vendor; u16 device; @@ -4559,6 +4572,7 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_AMPERE, 0xE00A, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00B, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00C, pci_quirk_xgene_acs }, + { PCI_VENDOR_ID_BROADCOM, 0xD714, pci_quirk_brcm_acs }, { 0 } }; From e6fa0dc86734f99b037b36b8682133efc2b6e16b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 2 Sep 2019 14:09:58 +0530 Subject: [PATCH 0351/2880] swiotlb-xen: Convert to use macro Rather than using static int max_dma_bits, this can be coverted to use as macro. Signed-off-by: Souptick Joarder Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/swiotlb-xen.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index ae1df496bf38..d1eced53bdda 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -38,6 +38,7 @@ #include #include +#define MAX_DMA_BITS 32 /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this @@ -114,8 +115,6 @@ static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) return 0; } -static int max_dma_bits = 32; - static int xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { @@ -135,7 +134,7 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) p + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), dma_bits, &dma_handle); - } while (rc && dma_bits++ < max_dma_bits); + } while (rc && dma_bits++ < MAX_DMA_BITS); if (rc) return rc; From 4957eccf979b025286b39388fd1a60cde601a10a Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 28 Aug 2019 13:33:49 -0500 Subject: [PATCH 0352/2880] ARM: omap2plus_defconfig: Fix missing video When the panel-dpi driver was removed, the simple-panels driver was never enabled, so anyone who used the panel-dpi driver lost video, and those who used it inconjunction with simple-panels would have to manually enable CONFIG_DRM_PANEL_SIMPLE. This patch makes CONFIG_DRM_PANEL_SIMPLE a module in the same way the deprecated panel-dpi was. Fixes: 8bf4b1621178 ("drm/omap: Remove panel-dpi driver") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/configs/omap2plus_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index c7bf9c493646..64eb896907bf 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -363,6 +363,7 @@ CONFIG_DRM_OMAP_PANEL_TPO_TD028TTEC1=m CONFIG_DRM_OMAP_PANEL_TPO_TD043MTEA1=m CONFIG_DRM_OMAP_PANEL_NEC_NL8048HL11=m CONFIG_DRM_TILCDC=m +CONFIG_DRM_PANEL_SIMPLE=m CONFIG_FB=y CONFIG_FIRMWARE_EDID=y CONFIG_FB_MODE_HELPERS=y From f9f5518a38684e031d913f40482721ff553f5ba2 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 28 Aug 2019 13:33:50 -0500 Subject: [PATCH 0353/2880] ARM: dts: logicpd-torpedo-baseboard: Fix missing video A previous commit removed the panel-dpi driver, which made the Torpedo video stop working because it relied on the dpi driver for setting video timings. Now that the simple-panel driver is available in omap2plus, this patch migrates the Torpedo dev kits to use a similar panel and remove the manual timing requirements. Fixes: 8bf4b1621178 ("drm/omap: Remove panel-dpi driver") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- .../boot/dts/logicpd-torpedo-baseboard.dtsi | 39 ++++--------------- 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi b/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi index 642e809e757a..449cc7616da6 100644 --- a/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi +++ b/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi @@ -108,7 +108,6 @@ &dss { status = "ok"; vdds_dsi-supply = <&vpll2>; - vdda_video-supply = <&video_reg>; pinctrl-names = "default"; pinctrl-0 = <&dss_dpi_pins1>; port { @@ -124,44 +123,20 @@ display0 = &lcd0; }; - video_reg: video_reg { + lcd0: display { + /* This isn't the exact LCD, but the timings meet spec */ + /* To make it work, set CONFIG_OMAP2_DSS_MIN_FCK_PER_PCK=4 */ + compatible = "newhaven,nhd-4.3-480272ef-atxl"; + label = "15"; pinctrl-names = "default"; pinctrl-0 = <&panel_pwr_pins>; - compatible = "regulator-fixed"; - regulator-name = "fixed-supply"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - gpio = <&gpio5 27 GPIO_ACTIVE_HIGH>; /* gpio155, lcd INI */ - }; - - lcd0: display { - compatible = "panel-dpi"; - label = "15"; - status = "okay"; - /* default-on; */ - pinctrl-names = "default"; - + backlight = <&bl>; + enable-gpios = <&gpio5 27 GPIO_ACTIVE_HIGH>; port { lcd_in: endpoint { remote-endpoint = <&dpi_out>; }; }; - - panel-timing { - clock-frequency = <9000000>; - hactive = <480>; - vactive = <272>; - hfront-porch = <3>; - hback-porch = <2>; - hsync-len = <42>; - vback-porch = <3>; - vfront-porch = <4>; - vsync-len = <11>; - hsync-active = <0>; - vsync-active = <0>; - de-active = <1>; - pixelclk-active = <1>; - }; }; bl: backlight { From 24cf23276a54dd2825d3e3965c1b1b453e2a113d Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 28 Aug 2019 13:33:51 -0500 Subject: [PATCH 0354/2880] ARM: dts: am3517-evm: Fix missing video A previous commit removed the panel-dpi driver, which made the video on the AM3517-evm stop working because it relied on the dpi driver for setting video timings. Now that the simple-panel driver is available in omap2plus, this patch migrates the am3517-evm to use a similar panel and remove the manual timing requirements. Fixes: 8bf4b1621178 ("drm/omap: Remove panel-dpi driver") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am3517-evm.dts | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/arm/boot/dts/am3517-evm.dts b/arch/arm/boot/dts/am3517-evm.dts index ebfe28c2f544..a1fd3e63e86e 100644 --- a/arch/arm/boot/dts/am3517-evm.dts +++ b/arch/arm/boot/dts/am3517-evm.dts @@ -124,10 +124,11 @@ }; lcd0: display@0 { - compatible = "panel-dpi"; + /* This isn't the exact LCD, but the timings meet spec */ + /* To make it work, set CONFIG_OMAP2_DSS_MIN_FCK_PER_PCK=4 */ + compatible = "newhaven,nhd-4.3-480272ef-atxl"; label = "15"; - status = "okay"; - pinctrl-names = "default"; + backlight = <&bl>; enable-gpios = <&gpio6 16 GPIO_ACTIVE_HIGH>; /* gpio176, lcd INI */ vcc-supply = <&vdd_io_reg>; @@ -136,22 +137,6 @@ remote-endpoint = <&dpi_out>; }; }; - - panel-timing { - clock-frequency = <9000000>; - hactive = <480>; - vactive = <272>; - hfront-porch = <3>; - hback-porch = <2>; - hsync-len = <42>; - vback-porch = <3>; - vfront-porch = <4>; - vsync-len = <11>; - hsync-active = <0>; - vsync-active = <0>; - de-active = <1>; - pixelclk-active = <1>; - }; }; bl: backlight { From a932b77b4d1939ad173f18be87da409427fb705c Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Tue, 20 Aug 2019 07:17:27 -0500 Subject: [PATCH 0355/2880] ARM: dts: logicpd-som-lv: Fix i2c2 and i2c3 Pin mux When the pinmux configuration was added, it was accidentally placed into the omap3_pmx_wkup node when it should have been placed into the omap3_pmx_core. This error was accidentally propagated to stable by me when I blindly requested the pull after seeing I2C issues without actually reviewing the content of the pinout. Since the bootloader previously muxed these correctly in the past, was a hidden error. This patch moves the i2c2_pins and i2c3_pins to the correct node which should eliminate i2c bus errors and timeouts due to the fact the bootloader uses the save device tree that no longer properly assigns these pins. Fixes: 5fe3c0fa0d54 ("ARM: dts: Add pinmuxing for i2c2 and i2c3 for LogicPD SOM-LV") #4.9+ Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/logicpd-som-lv.dtsi | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/arm/boot/dts/logicpd-som-lv.dtsi b/arch/arm/boot/dts/logicpd-som-lv.dtsi index 5563ee54c960..b56524cc7fe2 100644 --- a/arch/arm/boot/dts/logicpd-som-lv.dtsi +++ b/arch/arm/boot/dts/logicpd-som-lv.dtsi @@ -228,6 +228,20 @@ >; }; + i2c2_pins: pinmux_i2c2_pins { + pinctrl-single,pins = < + OMAP3_CORE1_IOPAD(0x21be, PIN_INPUT | MUX_MODE0) /* i2c2_scl */ + OMAP3_CORE1_IOPAD(0x21c0, PIN_INPUT | MUX_MODE0) /* i2c2_sda */ + >; + }; + + i2c3_pins: pinmux_i2c3_pins { + pinctrl-single,pins = < + OMAP3_CORE1_IOPAD(0x21c2, PIN_INPUT | MUX_MODE0) /* i2c3_scl */ + OMAP3_CORE1_IOPAD(0x21c4, PIN_INPUT | MUX_MODE0) /* i2c3_sda */ + >; + }; + tsc2004_pins: pinmux_tsc2004_pins { pinctrl-single,pins = < OMAP3_CORE1_IOPAD(0x2186, PIN_INPUT | MUX_MODE4) /* mcbsp4_dr.gpio_153 */ @@ -249,18 +263,6 @@ OMAP3_WKUP_IOPAD(0x2a0c, PIN_OUTPUT | MUX_MODE4) /* sys_boot1.gpio_3 */ >; }; - i2c2_pins: pinmux_i2c2_pins { - pinctrl-single,pins = < - OMAP3_CORE1_IOPAD(0x21be, PIN_INPUT | MUX_MODE0) /* i2c2_scl */ - OMAP3_CORE1_IOPAD(0x21c0, PIN_INPUT | MUX_MODE0) /* i2c2_sda */ - >; - }; - i2c3_pins: pinmux_i2c3_pins { - pinctrl-single,pins = < - OMAP3_CORE1_IOPAD(0x21c2, PIN_INPUT | MUX_MODE0) /* i2c3_scl */ - OMAP3_CORE1_IOPAD(0x21c4, PIN_INPUT | MUX_MODE0) /* i2c3_sda */ - >; - }; }; &omap3_pmx_core2 { From dba61cda30469a6c4fed0f8d5bf2b6001ca80a51 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:15 +0000 Subject: [PATCH 0356/2880] Drivers: hv: vmbus: Break out synic enable and disable operations Break out synic enable and disable operations into separate hv_synic_disable_regs() and hv_synic_enable_regs() functions for use by a later patch to support hibernation. There is no functional change except the unnecessary check "if (sctrl.enable != 1) return -EFAULT;" which is removed, because when we're in hv_synic_cleanup(), we're absolutely sure sctrl.enable must be 1. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/hv.c | 66 ++++++++++++++++++++++----------------- drivers/hv/hyperv_vmbus.h | 2 ++ 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 6188fb7dda42..fcc52797c169 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -154,7 +154,7 @@ void hv_synic_free(void) * retrieve the initialized message and event pages. Otherwise, we create and * initialize the message and event pages. */ -int hv_synic_init(unsigned int cpu) +void hv_synic_enable_regs(unsigned int cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); @@ -196,6 +196,11 @@ int hv_synic_init(unsigned int cpu) sctrl.enable = 1; hv_set_synic_state(sctrl.as_uint64); +} + +int hv_synic_init(unsigned int cpu) +{ + hv_synic_enable_regs(cpu); hv_stimer_init(cpu); @@ -205,20 +210,45 @@ int hv_synic_init(unsigned int cpu) /* * hv_synic_cleanup - Cleanup routine for hv_synic_init(). */ -int hv_synic_cleanup(unsigned int cpu) +void hv_synic_disable_regs(unsigned int cpu) { union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_scontrol sctrl; + + hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + + shared_sint.masked = 1; + + /* Need to correctly cleanup in the case of SMP!!! */ + /* Disable the interrupt */ + hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + + hv_get_simp(simp.as_uint64); + simp.simp_enabled = 0; + simp.base_simp_gpa = 0; + + hv_set_simp(simp.as_uint64); + + hv_get_siefp(siefp.as_uint64); + siefp.siefp_enabled = 0; + siefp.base_siefp_gpa = 0; + + hv_set_siefp(siefp.as_uint64); + + /* Disable the global synic bit */ + hv_get_synic_state(sctrl.as_uint64); + sctrl.enable = 0; + hv_set_synic_state(sctrl.as_uint64); +} + +int hv_synic_cleanup(unsigned int cpu) +{ struct vmbus_channel *channel, *sc; bool channel_found = false; unsigned long flags; - hv_get_synic_state(sctrl.as_uint64); - if (sctrl.enable != 1) - return -EFAULT; - /* * Search for channels which are bound to the CPU we're about to * cleanup. In case we find one and vmbus is still connected we need to @@ -249,29 +279,7 @@ int hv_synic_cleanup(unsigned int cpu) hv_stimer_cleanup(cpu); - hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); - - shared_sint.masked = 1; - - /* Need to correctly cleanup in the case of SMP!!! */ - /* Disable the interrupt */ - hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); - - hv_get_simp(simp.as_uint64); - simp.simp_enabled = 0; - simp.base_simp_gpa = 0; - - hv_set_simp(simp.as_uint64); - - hv_get_siefp(siefp.as_uint64); - siefp.siefp_enabled = 0; - siefp.base_siefp_gpa = 0; - - hv_set_siefp(siefp.as_uint64); - - /* Disable the global synic bit */ - sctrl.enable = 0; - hv_set_synic_state(sctrl.as_uint64); + hv_synic_disable_regs(cpu); return 0; } diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 362e70e9d145..26ea161a6876 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -171,8 +171,10 @@ extern int hv_synic_alloc(void); extern void hv_synic_free(void); +extern void hv_synic_enable_regs(unsigned int cpu); extern int hv_synic_init(unsigned int cpu); +extern void hv_synic_disable_regs(unsigned int cpu); extern int hv_synic_cleanup(unsigned int cpu); /* Interface */ From 63ecc6d22ce46643165c391a9c90ba67e22e1c0f Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:16 +0000 Subject: [PATCH 0357/2880] Drivers: hv: vmbus: Suspend/resume the synic for hibernation This is needed when we resume the old kernel from the "current" kernel. Note: when hv_synic_suspend() and hv_synic_resume() run, all the non-boot CPUs have been offlined, and interrupts are disabled on CPU0. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/vmbus_drv.c | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ebd35fc35290..2ef375ce58ac 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "hyperv_vmbus.h" @@ -2086,6 +2087,47 @@ static void hv_crash_handler(struct pt_regs *regs) hyperv_cleanup(); }; +static int hv_synic_suspend(void) +{ + /* + * When we reach here, all the non-boot CPUs have been offlined, and + * the stimers on them have been unbound in hv_synic_cleanup() -> + * hv_stimer_cleanup() -> clockevents_unbind_device(). + * + * hv_synic_suspend() only runs on CPU0 with interrupts disabled. Here + * we do not unbind the stimer on CPU0 because: 1) it's unnecessary + * because the interrupts remain disabled between syscore_suspend() + * and syscore_resume(): see create_image() and resume_target_kernel(); + * 2) the stimer on CPU0 is automatically disabled later by + * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ... + * -> clockevents_shutdown() -> ... -> hv_ce_shutdown(); 3) a warning + * would be triggered if we call clockevents_unbind_device(), which + * may sleep, in an interrupts-disabled context. So, we intentionally + * don't call hv_stimer_cleanup(0) here. + */ + + hv_synic_disable_regs(0); + + return 0; +} + +static void hv_synic_resume(void) +{ + hv_synic_enable_regs(0); + + /* + * Note: we don't need to call hv_stimer_init(0), because the timer + * on CPU0 is not unbound in hv_synic_suspend(), and the timer is + * automatically re-enabled in timekeeping_resume(). + */ +} + +/* The callbacks run only on CPU0, with irqs_disabled. */ +static struct syscore_ops hv_synic_syscore_ops = { + .suspend = hv_synic_suspend, + .resume = hv_synic_resume, +}; + static int __init hv_acpi_init(void) { int ret, t; @@ -2116,6 +2158,8 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); + register_syscore_ops(&hv_synic_syscore_ops); + return 0; cleanup: @@ -2128,6 +2172,8 @@ static void __exit vmbus_exit(void) { int cpu; + unregister_syscore_ops(&hv_synic_syscore_ops); + hv_remove_kexec_handler(); hv_remove_crash_handler(); vmbus_connection.conn_state = DISCONNECTED; From ed56ef675ae6ef0e6f7d42b9c42dae61172f0960 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:16 +0000 Subject: [PATCH 0358/2880] Drivers: hv: vmbus: Add a helper function is_sub_channel() The existing method of telling if a channel is sub-channel in vmbus_process_offer() is cumbersome. This new simple helper function is preferred in future. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- include/linux/hyperv.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6256cc34c4a6..2d39248cff96 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -245,7 +245,10 @@ struct vmbus_channel_offer { } pipe; } u; /* - * The sub_channel_index is defined in win8. + * The sub_channel_index is defined in Win8: a value of zero means a + * primary channel and a value of non-zero means a sub-channel. + * + * Before Win8, the field is reserved, meaning it's always zero. */ u16 sub_channel_index; u16 reserved3; @@ -934,6 +937,11 @@ static inline bool is_hvsock_channel(const struct vmbus_channel *c) VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); } +static inline bool is_sub_channel(const struct vmbus_channel *c) +{ + return c->offermsg.offer.sub_channel_index != 0; +} + static inline void set_channel_affinity_state(struct vmbus_channel *c, enum hv_numa_policy policy) { From 271b2224d42f88870e6b060924ee374871c131fc Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:17 +0000 Subject: [PATCH 0359/2880] Drivers: hv: vmbus: Implement suspend/resume for VSC drivers for hibernation The high-level VSC drivers will implement device-specific callbacks. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/vmbus_drv.c | 46 ++++++++++++++++++++++++++++++++++++++++++ include/linux/hyperv.h | 3 +++ 2 files changed, 49 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 2ef375ce58ac..a30c70adf9a0 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -911,6 +911,43 @@ static void vmbus_shutdown(struct device *child_device) drv->shutdown(dev); } +/* + * vmbus_suspend - Suspend a vmbus device + */ +static int vmbus_suspend(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* The device may not be attached yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->suspend) + return -EOPNOTSUPP; + + return drv->suspend(dev); +} + +/* + * vmbus_resume - Resume a vmbus device + */ +static int vmbus_resume(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* The device may not be attached yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->resume) + return -EOPNOTSUPP; + + return drv->resume(dev); +} /* * vmbus_device_release - Final callback release of the vmbus child device @@ -926,6 +963,14 @@ static void vmbus_device_release(struct device *device) kfree(hv_dev); } +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS: see the comment before vmbus_bus_pm. + */ +static const struct dev_pm_ops vmbus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume) +}; + /* The one and only one */ static struct bus_type hv_bus = { .name = "vmbus", @@ -936,6 +981,7 @@ static struct bus_type hv_bus = { .uevent = vmbus_uevent, .dev_groups = vmbus_dev_groups, .drv_groups = vmbus_drv_groups, + .pm = &vmbus_pm, }; struct onmessage_work_context { diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2d39248cff96..8a60e7766037 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1157,6 +1157,9 @@ struct hv_driver { int (*remove)(struct hv_device *); void (*shutdown)(struct hv_device *); + int (*suspend)(struct hv_device *); + int (*resume)(struct hv_device *); + }; /* Base device object */ From e3ede02add7e6df315afc6b4120520f7d0c5a258 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:18 +0000 Subject: [PATCH 0360/2880] Drivers: hv: vmbus: Ignore the offers when resuming from hibernation When the VM resumes, the host re-sends the offers. We should not add the offers to the global vmbus_connection.chn_list again. This patch assumes the RELIDs of the channels don't change across hibernation. Actually this is not always true, especially in the case of NIC SR-IOV the VF vmbus device's RELID sometimes can change. A later patch will address this issue by mapping the new offers to the old channels and fixing up the old channels, if necessary. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/channel_mgmt.c | 58 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index addcef50df7a..44b92fa1b7fa 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -847,6 +847,37 @@ void vmbus_initiate_unload(bool crash) vmbus_wait_for_unload(); } +/* + * find_primary_channel_by_offer - Get the channel object given the new offer. + * This is only used in the resume path of hibernation. + */ +static struct vmbus_channel * +find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer) +{ + struct vmbus_channel *channel = NULL, *iter; + const guid_t *inst1, *inst2; + + /* Ignore sub-channel offers. */ + if (offer->offer.sub_channel_index != 0) + return NULL; + + mutex_lock(&vmbus_connection.channel_mutex); + + list_for_each_entry(iter, &vmbus_connection.chn_list, listentry) { + inst1 = &iter->offermsg.offer.if_instance; + inst2 = &offer->offer.if_instance; + + if (guid_equal(inst1, inst2)) { + channel = iter; + break; + } + } + + mutex_unlock(&vmbus_connection.channel_mutex); + + return channel; +} + /* * vmbus_onoffer - Handler for channel offers from vmbus in parent partition. * @@ -854,12 +885,37 @@ void vmbus_initiate_unload(bool crash) static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) { struct vmbus_channel_offer_channel *offer; - struct vmbus_channel *newchannel; + struct vmbus_channel *oldchannel, *newchannel; + size_t offer_sz; offer = (struct vmbus_channel_offer_channel *)hdr; trace_vmbus_onoffer(offer); + oldchannel = find_primary_channel_by_offer(offer); + + if (oldchannel != NULL) { + atomic_dec(&vmbus_connection.offer_in_progress); + + /* + * We're resuming from hibernation: we expect the host to send + * exactly the same offers that we had before the hibernation. + */ + offer_sz = sizeof(*offer); + if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) + return; + + pr_debug("Mismatched offer from the host (relid=%d)\n", + offer->child_relid); + + print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, + 16, 4, &oldchannel->offermsg, offer_sz, + false); + print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, + 16, 4, offer, offer_sz, false); + return; + } + /* Allocate the channel object and save this offer. */ newchannel = alloc_channel(); if (!newchannel) { From f53335e3289f9ac3a6a8faf6c2f819eee508bd39 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:19 +0000 Subject: [PATCH 0361/2880] Drivers: hv: vmbus: Suspend/resume the vmbus itself for hibernation Before Linux enters hibernation, it sends the CHANNELMSG_UNLOAD message to the host so all the offers are gone. After hibernation, Linux needs to re-negotiate with the host using the same vmbus protocol version (which was in use before hibernation), and ask the host to re-offer the vmbus devices. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/connection.c | 3 +- drivers/hv/hyperv_vmbus.h | 2 ++ drivers/hv/vmbus_drv.c | 59 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 09829e15d4a0..806319cd5ccf 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -59,8 +59,7 @@ static __u32 vmbus_get_next_version(__u32 current_version) } } -static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, - __u32 version) +int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) { int ret = 0; unsigned int cur_cpu; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 26ea161a6876..e657197a027a 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -274,6 +274,8 @@ struct vmbus_msginfo { extern struct vmbus_connection vmbus_connection; +int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version); + static inline void vmbus_send_interrupt(u32 relid) { sync_set_bit(relid, vmbus_connection.send_int_page); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index a30c70adf9a0..ce9974bf683f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2089,6 +2089,51 @@ acpi_walk_err: return ret_val; } +static int vmbus_bus_suspend(struct device *dev) +{ + vmbus_initiate_unload(false); + + vmbus_connection.conn_state = DISCONNECTED; + + return 0; +} + +static int vmbus_bus_resume(struct device *dev) +{ + struct vmbus_channel_msginfo *msginfo; + size_t msgsize; + int ret; + + /* + * We only use the 'vmbus_proto_version', which was in use before + * hibernation, to re-negotiate with the host. + */ + if (vmbus_proto_version == VERSION_INVAL || + vmbus_proto_version == 0) { + pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version); + return -EINVAL; + } + + msgsize = sizeof(*msginfo) + + sizeof(struct vmbus_channel_initiate_contact); + + msginfo = kzalloc(msgsize, GFP_KERNEL); + + if (msginfo == NULL) + return -ENOMEM; + + ret = vmbus_negotiate_version(msginfo, vmbus_proto_version); + + kfree(msginfo); + + if (ret != 0) + return ret; + + vmbus_request_offers(); + + return 0; +} + static const struct acpi_device_id vmbus_acpi_device_ids[] = { {"VMBUS", 0}, {"VMBus", 0}, @@ -2096,6 +2141,19 @@ static const struct acpi_device_id vmbus_acpi_device_ids[] = { }; MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids); +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS, otherwise NIC SR-IOV can not work, because the + * "pci_dev_pm_ops" uses the "noirq" callbacks: in the resume path, the + * pci "noirq" restore callback runs before "non-noirq" callbacks (see + * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() -> + * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's + * resume callback must also run via the "noirq" callbacks. + */ +static const struct dev_pm_ops vmbus_bus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume) +}; + static struct acpi_driver vmbus_acpi_driver = { .name = "vmbus", .ids = vmbus_acpi_device_ids, @@ -2103,6 +2161,7 @@ static struct acpi_driver vmbus_acpi_driver = { .add = vmbus_acpi_add, .remove = vmbus_acpi_remove, }, + .drv.pm = &vmbus_bus_pm, }; static void hv_kexec_handler(void) From 1f48dcf180e5422b1a633b24680dd0f5c3f540f5 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:20 +0000 Subject: [PATCH 0362/2880] Drivers: hv: vmbus: Clean up hv_sock channels by force upon suspend Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for hibernation. There is no better method to clean up the channels since some of the channels may still be referenced by the userspace apps when hibernation is triggered: in this case, with this patch, the "rescind" fields of the channels are set, and the apps will thoroughly destroy the channels after hibernation. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/vmbus_drv.c | 55 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ce9974bf683f..45b976ec6e79 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -1069,6 +1070,41 @@ msg_handled: vmbus_signal_eom(msg, message_type); } +/* + * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for + * hibernation, because hv_sock connections can not persist across hibernation. + */ +static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) +{ + struct onmessage_work_context *ctx; + struct vmbus_channel_rescind_offer *rescind; + + WARN_ON(!is_hvsock_channel(channel)); + + /* + * sizeof(*ctx) is small and the allocation should really not fail, + * otherwise the state of the hv_sock connections ends up in limbo. + */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); + + /* + * So far, these are not really used by Linux. Just set them to the + * reasonable values conforming to the definitions of the fields. + */ + ctx->msg.header.message_type = 1; + ctx->msg.header.payload_size = sizeof(*rescind); + + /* These values are actually used by Linux. */ + rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload; + rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER; + rescind->child_relid = channel->offermsg.child_relid; + + INIT_WORK(&ctx->work, vmbus_onmessage_work); + + queue_work_on(vmbus_connection.connect_cpu, + vmbus_connection.work_queue, + &ctx->work); +} /* * Direct callback for channels using other deferred processing @@ -2091,6 +2127,25 @@ acpi_walk_err: static int vmbus_bus_suspend(struct device *dev) { + struct vmbus_channel *channel; + + while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { + /* + * We wait here until the completion of any channel + * offers that are currently in progress. + */ + msleep(1); + } + + mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!is_hvsock_channel(channel)) + continue; + + vmbus_force_channel_rescinded(channel); + } + mutex_unlock(&vmbus_connection.channel_mutex); + vmbus_initiate_unload(false); vmbus_connection.conn_state = DISCONNECTED; From b307b38962eb0f22d1aa6dcf53cb7d3c2ed5eec7 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:21 +0000 Subject: [PATCH 0363/2880] Drivers: hv: vmbus: Suspend after cleaning up hv_sock and sub channels Before suspend, Linux must make sure all the hv_sock channels have been properly cleaned up, because a hv_sock connection can not persist across hibernation, and the user-space app must be properly notified of the state change of the connection. Before suspend, Linux also must make sure all the sub-channels have been destroyed, i.e. the related channel structs of the sub-channels must be properly removed, otherwise they would cause a conflict when the sub-channels are recreated upon resume. Add a counter to track such channels, and vmbus_bus_suspend() should wait for the counter to drop to zero. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/channel_mgmt.c | 26 +++++++++++++++++++++++ drivers/hv/connection.c | 3 +++ drivers/hv/hyperv_vmbus.h | 12 +++++++++++ drivers/hv/vmbus_drv.c | 44 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 84 insertions(+), 1 deletion(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 44b92fa1b7fa..5518d031f62a 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -545,6 +545,10 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) mutex_lock(&vmbus_connection.channel_mutex); + /* Remember the channels that should be cleaned up upon suspend. */ + if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel)) + atomic_inc(&vmbus_connection.nr_chan_close_on_suspend); + /* * Now that we have acquired the channel_mutex, * we can release the potentially racing rescind thread. @@ -944,6 +948,16 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) vmbus_process_offer(newchannel); } +static void check_ready_for_suspend_event(void) +{ + /* + * If all the sub-channels or hv_sock channels have been cleaned up, + * then it's safe to suspend. + */ + if (atomic_dec_and_test(&vmbus_connection.nr_chan_close_on_suspend)) + complete(&vmbus_connection.ready_for_suspend_event); +} + /* * vmbus_onoffer_rescind - Rescind offer handler. * @@ -954,6 +968,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) struct vmbus_channel_rescind_offer *rescind; struct vmbus_channel *channel; struct device *dev; + bool clean_up_chan_for_suspend; rescind = (struct vmbus_channel_rescind_offer *)hdr; @@ -993,6 +1008,8 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) return; } + clean_up_chan_for_suspend = is_hvsock_channel(channel) || + is_sub_channel(channel); /* * Before setting channel->rescind in vmbus_rescind_cleanup(), we * should make sure the channel callback is not running any more. @@ -1018,6 +1035,10 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) if (channel->device_obj) { if (channel->chn_rescind_callback) { channel->chn_rescind_callback(channel); + + if (clean_up_chan_for_suspend) + check_ready_for_suspend_event(); + return; } /* @@ -1050,6 +1071,11 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) } mutex_unlock(&vmbus_connection.channel_mutex); } + + /* The "channel" may have been freed. Do not access it any longer. */ + + if (clean_up_chan_for_suspend) + check_ready_for_suspend_event(); } void vmbus_hvsock_device_unregister(struct vmbus_channel *channel) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 806319cd5ccf..99851ea682eb 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -26,6 +26,9 @@ struct vmbus_connection vmbus_connection = { .conn_state = DISCONNECTED, .next_gpadl_handle = ATOMIC_INIT(0xE1E10), + + .ready_for_suspend_event= COMPLETION_INITIALIZER( + vmbus_connection.ready_for_suspend_event), }; EXPORT_SYMBOL_GPL(vmbus_connection); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index e657197a027a..974b747ca1fc 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -260,6 +260,18 @@ struct vmbus_connection { struct workqueue_struct *work_queue; struct workqueue_struct *handle_primary_chan_wq; struct workqueue_struct *handle_sub_chan_wq; + + /* + * The number of sub-channels and hv_sock channels that should be + * cleaned up upon suspend: sub-channels will be re-created upon + * resume, and hv_sock channels should not survive suspend. + */ + atomic_t nr_chan_close_on_suspend; + /* + * vmbus_bus_suspend() waits for "nr_chan_close_on_suspend" to + * drop to zero. + */ + struct completion ready_for_suspend_event; }; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 45b976ec6e79..32ec951d334f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2127,7 +2127,8 @@ acpi_walk_err: static int vmbus_bus_suspend(struct device *dev) { - struct vmbus_channel *channel; + struct vmbus_channel *channel, *sc; + unsigned long flags; while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { /* @@ -2146,6 +2147,44 @@ static int vmbus_bus_suspend(struct device *dev) } mutex_unlock(&vmbus_connection.channel_mutex); + /* + * Wait until all the sub-channels and hv_sock channels have been + * cleaned up. Sub-channels should be destroyed upon suspend, otherwise + * they would conflict with the new sub-channels that will be created + * in the resume path. hv_sock channels should also be destroyed, but + * a hv_sock channel of an established hv_sock connection can not be + * really destroyed since it may still be referenced by the userspace + * application, so we just force the hv_sock channel to be rescinded + * by vmbus_force_channel_rescinded(), and the userspace application + * will thoroughly destroy the channel after hibernation. + * + * Note: the counter nr_chan_close_on_suspend may never go above 0 if + * the VM has no sub-channel and hv_sock channel, e.g. a 1-vCPU VM. + */ + if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) + wait_for_completion(&vmbus_connection.ready_for_suspend_event); + + mutex_lock(&vmbus_connection.channel_mutex); + + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (is_hvsock_channel(channel)) { + if (!channel->rescind) { + pr_err("hv_sock channel not rescinded!\n"); + WARN_ON_ONCE(1); + } + continue; + } + + spin_lock_irqsave(&channel->lock, flags); + list_for_each_entry(sc, &channel->sc_list, sc_list) { + pr_err("Sub-channel not deleted!\n"); + WARN_ON_ONCE(1); + } + spin_unlock_irqrestore(&channel->lock, flags); + } + + mutex_unlock(&vmbus_connection.channel_mutex); + vmbus_initiate_unload(false); vmbus_connection.conn_state = DISCONNECTED; @@ -2186,6 +2225,9 @@ static int vmbus_bus_resume(struct device *dev) vmbus_request_offers(); + /* Reset the event for the next suspend. */ + reinit_completion(&vmbus_connection.ready_for_suspend_event); + return 0; } From d8bd2d442bb2688b428ac7164e5dc6d95d4fa65b Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:22 +0000 Subject: [PATCH 0364/2880] Drivers: hv: vmbus: Resume after fixing up old primary channels When the host re-offers the primary channels upon resume, the host only guarantees the Instance GUID doesn't change, so vmbus_bus_suspend() should invalidate channel->offermsg.child_relid and figure out the number of primary channels that need to be fixed up upon resume. Upon resume, vmbus_onoffer() finds the old channel structs, and maps the new offers to the old channels, and fixes up the old structs, and finally the resume callbacks of the VSC drivers will re-open the channels. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/channel_mgmt.c | 89 +++++++++++++++++++++++++++++---------- drivers/hv/connection.c | 2 + drivers/hv/hyperv_vmbus.h | 14 ++++++ drivers/hv/vmbus_drv.c | 17 ++++++++ include/linux/hyperv.h | 3 ++ 5 files changed, 103 insertions(+), 22 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 5518d031f62a..8eb167540b4f 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -407,7 +407,15 @@ void hv_process_channel_removal(struct vmbus_channel *channel) cpumask_clear_cpu(channel->target_cpu, &primary_channel->alloced_cpus_in_node); - vmbus_release_relid(channel->offermsg.child_relid); + /* + * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and + * the relid is invalidated; after hibernation, when the user-space app + * destroys the channel, the relid is INVALID_RELID, and in this case + * it's unnecessary and unsafe to release the old relid, since the same + * relid can refer to a completely different channel now. + */ + if (channel->offermsg.child_relid != INVALID_RELID) + vmbus_release_relid(channel->offermsg.child_relid); free_channel(channel); } @@ -851,6 +859,36 @@ void vmbus_initiate_unload(bool crash) vmbus_wait_for_unload(); } +static void check_ready_for_resume_event(void) +{ + /* + * If all the old primary channels have been fixed up, then it's safe + * to resume. + */ + if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume)) + complete(&vmbus_connection.ready_for_resume_event); +} + +static void vmbus_setup_channel_state(struct vmbus_channel *channel, + struct vmbus_channel_offer_channel *offer) +{ + /* + * Setup state for signalling the host. + */ + channel->sig_event = VMBUS_EVENT_CONNECTION_ID; + + if (vmbus_proto_version != VERSION_WS2008) { + channel->is_dedicated_interrupt = + (offer->is_dedicated_interrupt != 0); + channel->sig_event = offer->connection_id; + } + + memcpy(&channel->offermsg, offer, + sizeof(struct vmbus_channel_offer_channel)); + channel->monitor_grp = (u8)offer->monitorid / 32; + channel->monitor_bit = (u8)offer->monitorid % 32; +} + /* * find_primary_channel_by_offer - Get the channel object given the new offer. * This is only used in the resume path of hibernation. @@ -902,14 +940,29 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) atomic_dec(&vmbus_connection.offer_in_progress); /* - * We're resuming from hibernation: we expect the host to send - * exactly the same offers that we had before the hibernation. + * We're resuming from hibernation: all the sub-channel and + * hv_sock channels we had before the hibernation should have + * been cleaned up, and now we must be seeing a re-offered + * primary channel that we had before the hibernation. */ - offer_sz = sizeof(*offer); - if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) - return; - pr_debug("Mismatched offer from the host (relid=%d)\n", + WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID); + /* Fix up the relid. */ + oldchannel->offermsg.child_relid = offer->child_relid; + + offer_sz = sizeof(*offer); + if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) { + check_ready_for_resume_event(); + return; + } + + /* + * This is not an error, since the host can also change the + * other field(s) of the offer, e.g. on WS RS5 (Build 17763), + * the offer->connection_id of the Mellanox VF vmbus device + * can change when the host reoffers the device upon resume. + */ + pr_debug("vmbus offer changed: relid=%d\n", offer->child_relid); print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, @@ -917,6 +970,12 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) false); print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, 16, 4, offer, offer_sz, false); + + /* Fix up the old channel. */ + vmbus_setup_channel_state(oldchannel, offer); + + check_ready_for_resume_event(); + return; } @@ -929,21 +988,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) return; } - /* - * Setup state for signalling the host. - */ - newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID; - - if (vmbus_proto_version != VERSION_WS2008) { - newchannel->is_dedicated_interrupt = - (offer->is_dedicated_interrupt != 0); - newchannel->sig_event = offer->connection_id; - } - - memcpy(&newchannel->offermsg, offer, - sizeof(struct vmbus_channel_offer_channel)); - newchannel->monitor_grp = (u8)offer->monitorid / 32; - newchannel->monitor_bit = (u8)offer->monitorid % 32; + vmbus_setup_channel_state(newchannel, offer); vmbus_process_offer(newchannel); } diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 99851ea682eb..6e4c015783ff 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -29,6 +29,8 @@ struct vmbus_connection vmbus_connection = { .ready_for_suspend_event= COMPLETION_INITIALIZER( vmbus_connection.ready_for_suspend_event), + .ready_for_resume_event = COMPLETION_INITIALIZER( + vmbus_connection.ready_for_resume_event), }; EXPORT_SYMBOL_GPL(vmbus_connection); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 974b747ca1fc..f7a5f5615f34 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -272,6 +272,20 @@ struct vmbus_connection { * drop to zero. */ struct completion ready_for_suspend_event; + + /* + * The number of primary channels that should be "fixed up" + * upon resume: these channels are re-offered upon resume, and some + * fields of the channel offers (i.e. child_relid and connection_id) + * can change, so the old offermsg must be fixed up, before the resume + * callbacks of the VSC drivers start to further touch the channels. + */ + atomic_t nr_chan_fixup_on_resume; + /* + * vmbus_bus_resume() waits for "nr_chan_fixup_on_resume" to + * drop to zero. + */ + struct completion ready_for_resume_event; }; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 32ec951d334f..391f0b225c9a 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2164,9 +2164,17 @@ static int vmbus_bus_suspend(struct device *dev) if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) wait_for_completion(&vmbus_connection.ready_for_suspend_event); + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0); + mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + /* + * Invalidate the field. Upon resume, vmbus_onoffer() will fix + * up the field, and the other fields (if necessary). + */ + channel->offermsg.child_relid = INVALID_RELID; + if (is_hvsock_channel(channel)) { if (!channel->rescind) { pr_err("hv_sock channel not rescinded!\n"); @@ -2181,6 +2189,8 @@ static int vmbus_bus_suspend(struct device *dev) WARN_ON_ONCE(1); } spin_unlock_irqrestore(&channel->lock, flags); + + atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume); } mutex_unlock(&vmbus_connection.channel_mutex); @@ -2189,6 +2199,9 @@ static int vmbus_bus_suspend(struct device *dev) vmbus_connection.conn_state = DISCONNECTED; + /* Reset the event for the next resume. */ + reinit_completion(&vmbus_connection.ready_for_resume_event); + return 0; } @@ -2223,8 +2236,12 @@ static int vmbus_bus_resume(struct device *dev) if (ret != 0) return ret; + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0); + vmbus_request_offers(); + wait_for_completion(&vmbus_connection.ready_for_resume_event); + /* Reset the event for the next suspend. */ reinit_completion(&vmbus_connection.ready_for_suspend_event); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 8a60e7766037..a3aa9e9ef6f2 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -426,6 +426,9 @@ enum vmbus_channel_message_type { CHANNELMSG_COUNT }; +/* Hyper-V supports about 2048 channels, and the RELIDs start with 1. */ +#define INVALID_RELID U32_MAX + struct vmbus_channel_message_header { enum vmbus_channel_message_type msgtype; u32 padding; From c30da2e981a703c6b1d49911511f7ade8dac20be Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:31 +0000 Subject: [PATCH 0365/2880] fuse: convert to use the new mount API Convert the fuse filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells Signed-off-by: Al Viro Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 292 +++++++++++++++++++++++++++--------------------- 1 file changed, 167 insertions(+), 125 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4bb885b0f032..c334f95e799a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -59,7 +60,13 @@ MODULE_PARM_DESC(max_user_congthresh, /** Congestion starts at 75% of maximum */ #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) -struct fuse_mount_data { +#ifdef CONFIG_BLOCK +static struct file_system_type fuseblk_fs_type; +#endif + +struct fuse_fs_context { + const char *subtype; + bool is_bdev; int fd; unsigned rootmode; kuid_t user_id; @@ -443,6 +450,8 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) } enum { + OPT_SOURCE, + OPT_SUBTYPE, OPT_FD, OPT_ROOTMODE, OPT_USER_ID, @@ -454,111 +463,110 @@ enum { OPT_ERR }; -static const match_table_t tokens = { - {OPT_FD, "fd=%u"}, - {OPT_ROOTMODE, "rootmode=%o"}, - {OPT_USER_ID, "user_id=%u"}, - {OPT_GROUP_ID, "group_id=%u"}, - {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_ALLOW_OTHER, "allow_other"}, - {OPT_MAX_READ, "max_read=%u"}, - {OPT_BLKSIZE, "blksize=%u"}, - {OPT_ERR, NULL} +static const struct fs_parameter_spec fuse_param_specs[] = { + fsparam_string ("source", OPT_SOURCE), + fsparam_u32 ("fd", OPT_FD), + fsparam_u32oct ("rootmode", OPT_ROOTMODE), + fsparam_u32 ("user_id", OPT_USER_ID), + fsparam_u32 ("group_id", OPT_GROUP_ID), + fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), + fsparam_flag ("allow_other", OPT_ALLOW_OTHER), + fsparam_u32 ("max_read", OPT_MAX_READ), + fsparam_u32 ("blksize", OPT_BLKSIZE), + __fsparam(fs_param_is_string, "subtype", OPT_SUBTYPE, + fs_param_v_optional), + {} }; -static int fuse_match_uint(substring_t *s, unsigned int *res) +static const struct fs_parameter_description fuse_fs_parameters = { + .name = "fuse", + .specs = fuse_param_specs, +}; + +static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) { - int err = -ENOMEM; - char *buf = match_strdup(s); - if (buf) { - err = kstrtouint(buf, 10, res); - kfree(buf); - } - return err; -} + struct fs_parse_result result; + struct fuse_fs_context *ctx = fc->fs_private; + int opt; -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, - struct user_namespace *user_ns) -{ - char *p; - memset(d, 0, sizeof(struct fuse_mount_data)); - d->max_read = ~0; - d->blksize = FUSE_DEFAULT_BLKSIZE; + opt = fs_parse(fc, &fuse_fs_parameters, param, &result); + if (opt < 0) + return opt; - while ((p = strsep(&opt, ",")) != NULL) { - int token; - int value; - unsigned uv; - substring_t args[MAX_OPT_ARGS]; - if (!*p) - continue; + switch (opt) { + case OPT_SOURCE: + if (fc->source) + return invalf(fc, "fuse: Multiple sources specified"); + fc->source = param->string; + param->string = NULL; + break; - token = match_token(p, tokens, args); - switch (token) { - case OPT_FD: - if (match_int(&args[0], &value)) - return 0; - d->fd = value; - d->fd_present = 1; - break; - - case OPT_ROOTMODE: - if (match_octal(&args[0], &value)) - return 0; - if (!fuse_valid_type(value)) - return 0; - d->rootmode = value; - d->rootmode_present = 1; - break; - - case OPT_USER_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->user_id = make_kuid(user_ns, uv); - if (!uid_valid(d->user_id)) - return 0; - d->user_id_present = 1; - break; - - case OPT_GROUP_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->group_id = make_kgid(user_ns, uv); - if (!gid_valid(d->group_id)) - return 0; - d->group_id_present = 1; - break; - - case OPT_DEFAULT_PERMISSIONS: - d->default_permissions = 1; - break; - - case OPT_ALLOW_OTHER: - d->allow_other = 1; - break; - - case OPT_MAX_READ: - if (match_int(&args[0], &value)) - return 0; - d->max_read = value; - break; - - case OPT_BLKSIZE: - if (!is_bdev || match_int(&args[0], &value)) - return 0; - d->blksize = value; - break; - - default: - return 0; - } - } - - if (!d->fd_present || !d->rootmode_present || - !d->user_id_present || !d->group_id_present) + case OPT_SUBTYPE: + if (ctx->subtype) + return invalf(fc, "fuse: Multiple subtypes specified"); + ctx->subtype = param->string; + param->string = NULL; return 0; - return 1; + case OPT_FD: + ctx->fd = result.uint_32; + ctx->fd_present = 1; + break; + + case OPT_ROOTMODE: + if (!fuse_valid_type(result.uint_32)) + return invalf(fc, "fuse: Invalid rootmode"); + ctx->rootmode = result.uint_32; + ctx->rootmode_present = 1; + break; + + case OPT_USER_ID: + ctx->user_id = make_kuid(fc->user_ns, result.uint_32); + if (!uid_valid(ctx->user_id)) + return invalf(fc, "fuse: Invalid user_id"); + ctx->user_id_present = 1; + break; + + case OPT_GROUP_ID: + ctx->group_id = make_kgid(fc->user_ns, result.uint_32); + if (!gid_valid(ctx->group_id)) + return invalf(fc, "fuse: Invalid group_id"); + ctx->group_id_present = 1; + break; + + case OPT_DEFAULT_PERMISSIONS: + ctx->default_permissions = 1; + break; + + case OPT_ALLOW_OTHER: + ctx->allow_other = 1; + break; + + case OPT_MAX_READ: + ctx->max_read = result.uint_32; + break; + + case OPT_BLKSIZE: + if (!ctx->is_bdev) + return invalf(fc, "fuse: blksize only supported for fuseblk"); + ctx->blksize = result.uint_32; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static void fuse_free_fc(struct fs_context *fc) +{ + struct fuse_fs_context *ctx = fc->fs_private; + + if (ctx) { + kfree(ctx->subtype); + kfree(ctx); + } } static int fuse_show_options(struct seq_file *m, struct dentry *root) @@ -1075,12 +1083,12 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); -static int fuse_fill_super(struct super_block *sb, void *data, int silent) +static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { + struct fuse_fs_context *ctx = fsc->fs_private; struct fuse_dev *fud; struct fuse_conn *fc; struct inode *root; - struct fuse_mount_data d; struct file *file; struct dentry *root_dentry; struct fuse_req *init_req; @@ -1093,19 +1101,19 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) - goto err; - if (is_bdev) { #ifdef CONFIG_BLOCK err = -EINVAL; - if (!sb_set_blocksize(sb, d.blksize)) + if (!sb_set_blocksize(sb, ctx->blksize)) goto err; #endif } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; } + + sb->s_subtype = ctx->subtype; + ctx->subtype = NULL; sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_xattr = fuse_xattr_handlers; @@ -1116,7 +1124,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - file = fget(d.fd); + file = fget(ctx->fd); err = -EINVAL; if (!file) goto err; @@ -1159,17 +1167,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= SB_POSIXACL; - fc->default_permissions = d.default_permissions; - fc->allow_other = d.allow_other; - fc->user_id = d.user_id; - fc->group_id = d.group_id; - fc->max_read = max_t(unsigned, 4096, d.max_read); + fc->default_permissions = ctx->default_permissions; + fc->allow_other = ctx->allow_other; + fc->user_id = ctx->user_id; + fc->group_id = ctx->group_id; + fc->max_read = max_t(unsigned, 4096, ctx->max_read); /* Used by get_root_inode() */ sb->s_fs_info = fc; err = -ENOMEM; - root = fuse_get_root_inode(sb, d.rootmode); + root = fuse_get_root_inode(sb, ctx->rootmode); sb->s_d_op = &fuse_root_dentry_operations; root_dentry = d_make_root(root); if (!root_dentry) @@ -1229,11 +1237,50 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return err; } -static struct dentry *fuse_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) +static int fuse_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); + struct fuse_fs_context *ctx = fc->fs_private; + + if (!ctx->fd_present || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; + +#ifdef CONFIG_BLOCK + if (ctx->is_bdev) + return get_tree_bdev(fc, fuse_fill_super); +#endif + + return get_tree_nodev(fc, fuse_fill_super); +} + +static const struct fs_context_operations fuse_context_ops = { + .free = fuse_free_fc, + .parse_param = fuse_parse_param, + .get_tree = fuse_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int fuse_init_fs_context(struct fs_context *fc) +{ + struct fuse_fs_context *ctx; + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_read = ~0; + ctx->blksize = FUSE_DEFAULT_BLKSIZE; + +#ifdef CONFIG_BLOCK + if (fc->fs_type == &fuseblk_fs_type) + ctx->is_bdev = true; +#endif + + fc->fs_private = ctx; + fc->ops = &fuse_context_ops; + return 0; } static void fuse_sb_destroy(struct super_block *sb) @@ -1262,19 +1309,13 @@ static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, - .mount = fuse_mount, + .init_fs_context = fuse_init_fs_context, + .parameters = &fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, }; MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK -static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) -{ - return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); -} - static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); @@ -1284,7 +1325,8 @@ static void fuse_kill_sb_blk(struct super_block *sb) static struct file_system_type fuseblk_fs_type = { .owner = THIS_MODULE, .name = "fuseblk", - .mount = fuse_mount_blk, + .init_fs_context = fuse_init_fs_context, + .parameters = &fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; From c7eb6869632a5d33b41d0a00d683b8395392b7ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:31 +0000 Subject: [PATCH 0366/2880] vfs: subtype handling moved to fuse The unused vfs code can be removed. Don't pass empty subtype (same as if ->parse callback isn't called). The bits that are left involve determining whether it's permitted to split the filesystem type string passed in to mount(2). Consequently, this means that we cannot get rid of the FS_HAS_SUBTYPE flag unless we define that a type string with a dot in it always indicates a subtype specification. Signed-off-by: David Howells Signed-off-by: Al Viro Signed-off-by: Miklos Szeredi --- fs/fs_context.c | 14 -------------- fs/fuse/inode.c | 3 +-- fs/namespace.c | 2 -- fs/proc_namespace.c | 2 +- fs/super.c | 5 ----- include/linux/fs_context.h | 1 - 6 files changed, 2 insertions(+), 25 deletions(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index 87c2c9687d90..138b5b4d621d 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -504,7 +504,6 @@ void put_fs_context(struct fs_context *fc) put_net(fc->net_ns); put_user_ns(fc->user_ns); put_cred(fc->cred); - kfree(fc->subtype); put_fc_log(fc); put_filesystem(fc->fs_type); kfree(fc->source); @@ -571,17 +570,6 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } - if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) && - strcmp(param->key, "subtype") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Legacy: Non-string subtype"); - if (fc->subtype) - return invalf(fc, "VFS: Legacy: Multiple subtype"); - fc->subtype = param->string; - param->string = NULL; - return 0; - } - if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); @@ -738,8 +726,6 @@ void vfs_clean_context(struct fs_context *fc) fc->s_fs_info = NULL; fc->sb_flags = 0; security_free_mnt_opts(&fc->security); - kfree(fc->subtype); - fc->subtype = NULL; kfree(fc->source); fc->source = NULL; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c334f95e799a..2183967261a4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -473,8 +473,7 @@ static const struct fs_parameter_spec fuse_param_specs[] = { fsparam_flag ("allow_other", OPT_ALLOW_OTHER), fsparam_u32 ("max_read", OPT_MAX_READ), fsparam_u32 ("blksize", OPT_BLKSIZE), - __fsparam(fs_param_is_string, "subtype", OPT_SUBTYPE, - fs_param_v_optional), + fsparam_string ("subtype", OPT_SUBTYPE), {} }; diff --git a/fs/namespace.c b/fs/namespace.c index d28d30b13043..105f995543f6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2768,8 +2768,6 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, put_filesystem(type); return -EINVAL; } - } else { - subtype = ""; } } diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index e16fb8f2049e..273ee82d8aa9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -88,7 +88,7 @@ static inline void mangle(struct seq_file *m, const char *s) static void show_type(struct seq_file *m, struct super_block *sb) { mangle(m, sb->s_type->name); - if (sb->s_subtype && sb->s_subtype[0]) { + if (sb->s_subtype) { seq_putc(m, '.'); mangle(m, sb->s_subtype); } diff --git a/fs/super.c b/fs/super.c index da223b4cfbca..a6ceed7bcd89 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1530,11 +1530,6 @@ int vfs_get_tree(struct fs_context *fc) sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); - if (fc->subtype && !sb->s_subtype) { - sb->s_subtype = fc->subtype; - fc->subtype = NULL; - } - /* * Write barrier is for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is the diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 84a5eaa09f19..f6c86d58ea91 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -95,7 +95,6 @@ struct fs_context { const struct cred *cred; /* The mounter's credentials */ struct fc_log *log; /* Logging buffer */ const char *source; /* The source name (eg. dev path) */ - const char *subtype; /* The subtype to set on the superblock */ void *security; /* Linux S&M options */ void *s_fs_info; /* Proposed s_fs_info */ unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ From a4c8723a162e6244fb01944fbf446750575dba59 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 6 Sep 2019 12:57:46 -0700 Subject: [PATCH 0367/2880] bus: ti-sysc: Remove unpaired sysc_clkdm_deny_idle() Commit d098913a10f8 ("bus: ti-sysc: Fix clock handling for no-idle quirks") fixed handling for no-idle quirk modules that are not enabled by the bootloader. But it also caused unpaired clockdomain calls that won't allow idling the system. That's because clkdm_allow_idle_nolock() and clkdm_deny_idle_nolock() have usage count with clkdm->forcewake_count. Let's drop the unpaired sysc_clkdm_deny_idle() to fix idling of devices. Fixes: d098913a10f8 ("bus: ti-sysc: Fix clock handling for no-idle quirks") Cc: Keerthy Cc: Vignesh Raghavendra Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 24583d82b584..364ee498feb3 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -2363,7 +2363,6 @@ static void ti_sysc_idle(struct work_struct *work) */ if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE | SYSC_QUIRK_NO_IDLE_ON_INIT)) { - sysc_clkdm_deny_idle(ddata); sysc_disable_main_clocks(ddata); sysc_disable_opt_clocks(ddata); sysc_clkdm_allow_idle(ddata); From 984998e3404e9073479281dbba8af36b104e8c00 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 22 Aug 2019 11:55:52 +0300 Subject: [PATCH 0368/2880] PCI: Make pcie_downstream_port() available outside of access.c pcie_downstream_port() is useful in other places where code needs to determine whether the PCIe port is downstream so make it available outside of access.c. Link: https://lore.kernel.org/r/20190822085553.62697-1-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/access.c | 9 --------- drivers/pci/pci.h | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 544922f097c0..2fccb5762c76 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -336,15 +336,6 @@ static inline int pcie_cap_version(const struct pci_dev *dev) return pcie_caps_reg(dev) & PCI_EXP_FLAGS_VERS; } -static bool pcie_downstream_port(const struct pci_dev *dev) -{ - int type = pci_pcie_type(dev); - - return type == PCI_EXP_TYPE_ROOT_PORT || - type == PCI_EXP_TYPE_DOWNSTREAM || - type == PCI_EXP_TYPE_PCIE_BRIDGE; -} - bool pcie_cap_has_lnkctl(const struct pci_dev *dev) { int type = pci_pcie_type(dev); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 97180274c60b..ce9f4d3ca075 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -118,6 +118,15 @@ static inline bool pci_power_manageable(struct pci_dev *pci_dev) return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; } +static inline bool pcie_downstream_port(const struct pci_dev *dev) +{ + int type = pci_pcie_type(dev); + + return type == PCI_EXP_TYPE_ROOT_PORT || + type == PCI_EXP_TYPE_DOWNSTREAM || + type == PCI_EXP_TYPE_PCIE_BRIDGE; +} + int pci_vpd_init(struct pci_dev *dev); void pci_vpd_release(struct pci_dev *dev); void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev); From ca78410403dd64ac0ee0e3cc8646b38335271bfd Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 22 Aug 2019 11:55:53 +0300 Subject: [PATCH 0369/2880] PCI: Get rid of dev->has_secondary_link flag In some systems, the Device/Port Type in the PCI Express Capabilities register incorrectly identifies upstream ports as downstream ports. d0751b98dfa3 ("PCI: Add dev->has_secondary_link to track downstream PCIe links") addressed this by adding pci_dev.has_secondary_link, which is set for downstream ports. But this is confusing because pci_pcie_type() sometimes gives the wrong answer, and it's not obvious that we should use pci_dev.has_secondary_link instead. Reduce the confusion by correcting the type of the port itself so that pci_pcie_type() returns the actual type regardless of what the Device/Port Type register claims it is. Update the users to call pci_pcie_type() and pcie_downstream_port() accordingly, and remove pci_dev.has_secondary_link completely. Link: https://lore.kernel.org/linux-pci/20190703133953.GK128603@google.com/ Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20190822085553.62697-2-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/pci.c | 2 +- drivers/pci/pcie/aspm.c | 8 +++---- drivers/pci/pcie/err.c | 2 +- drivers/pci/probe.c | 48 ++++++++++++++++++++++++----------------- drivers/pci/vc.c | 4 +++- include/linux/pci.h | 1 - 6 files changed, 37 insertions(+), 28 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 29ed5ec1ac27..97e7f6e0821e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3576,7 +3576,7 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask) } /* Ensure upstream ports don't block AtomicOps on egress */ - if (!bridge->has_secondary_link) { + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) { pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl2); if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index e44af7f4d37f..dcda96818eb6 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -913,10 +913,10 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev) /* * We allocate pcie_link_state for the component on the upstream - * end of a Link, so there's nothing to do unless this device has a - * Link on its secondary side. + * end of a Link, so there's nothing to do unless this device is + * downstream port. */ - if (!pdev->has_secondary_link) + if (!pcie_downstream_port(pdev)) return; /* VIA has a strange chipset, root port is under a bridge */ @@ -1070,7 +1070,7 @@ static int __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem) if (!pci_is_pcie(pdev)) return 0; - if (pdev->has_secondary_link) + if (pcie_downstream_port(pdev)) parent = pdev; if (!parent || !parent->link_state) return -EINVAL; diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 773197a12568..b0e6048a9208 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -166,7 +166,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service) driver = pcie_port_find_service(dev, service); if (driver && driver->reset_link) { status = driver->reset_link(dev); - } else if (dev->has_secondary_link) { + } else if (pcie_downstream_port(dev)) { status = default_reset_link(dev); } else { pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n", diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index cdf34a3c531a..3bfa57d58402 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1431,26 +1431,38 @@ void set_pcie_port_type(struct pci_dev *pdev) pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, ®16); pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD; + parent = pci_upstream_bridge(pdev); + if (!parent) + return; + /* - * A Root Port or a PCI-to-PCIe bridge is always the upstream end - * of a Link. No PCIe component has two Links. Two Links are - * connected by a Switch that has a Port on each Link and internal - * logic to connect the two Ports. + * Some systems do not identify their upstream/downstream ports + * correctly so detect impossible configurations here and correct + * the port type accordingly. */ type = pci_pcie_type(pdev); - if (type == PCI_EXP_TYPE_ROOT_PORT || - type == PCI_EXP_TYPE_PCIE_BRIDGE) - pdev->has_secondary_link = 1; - else if (type == PCI_EXP_TYPE_UPSTREAM || - type == PCI_EXP_TYPE_DOWNSTREAM) { - parent = pci_upstream_bridge(pdev); - + if (type == PCI_EXP_TYPE_DOWNSTREAM) { /* - * Usually there's an upstream device (Root Port or Switch - * Downstream Port), but we can't assume one exists. + * If pdev claims to be downstream port but the parent + * device is also downstream port assume pdev is actually + * upstream port. */ - if (parent && !parent->has_secondary_link) - pdev->has_secondary_link = 1; + if (pcie_downstream_port(parent)) { + pci_info(pdev, "claims to be downstream port but is acting as upstream port, correcting type\n"); + pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE; + pdev->pcie_flags_reg |= PCI_EXP_TYPE_UPSTREAM; + } + } else if (type == PCI_EXP_TYPE_UPSTREAM) { + /* + * If pdev claims to be upstream port but the parent + * device is also upstream port assume pdev is actually + * downstream port. + */ + if (pci_pcie_type(parent) == PCI_EXP_TYPE_UPSTREAM) { + pci_info(pdev, "claims to be upstream port but is acting as downstream port, correcting type\n"); + pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE; + pdev->pcie_flags_reg |= PCI_EXP_TYPE_DOWNSTREAM; + } } } @@ -2488,12 +2500,8 @@ static int only_one_child(struct pci_bus *bus) * A PCIe Downstream Port normally leads to a Link with only Device * 0 on it (PCIe spec r3.1, sec 7.3.1). As an optimization, scan * only for Device 0 in that situation. - * - * Checking has_secondary_link is a hack to identify Downstream - * Ports because sometimes Switches are configured such that the - * PCIe Port Type labels are backwards. */ - if (bridge && pci_is_pcie(bridge) && bridge->has_secondary_link) + if (bridge && pci_is_pcie(bridge) && pcie_downstream_port(bridge)) return 1; return 0; diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c index 5acd9c02683a..9ae9fb9339e8 100644 --- a/drivers/pci/vc.c +++ b/drivers/pci/vc.c @@ -13,6 +13,8 @@ #include #include +#include "pci.h" + /** * pci_vc_save_restore_dwords - Save or restore a series of dwords * @dev: device @@ -105,7 +107,7 @@ static void pci_vc_enable(struct pci_dev *dev, int pos, int res) struct pci_dev *link = NULL; /* Enable VCs from the downstream device */ - if (!dev->has_secondary_link) + if (!pci_is_pcie(dev) || !pcie_downstream_port(dev)) return; ctrl_pos = pos + PCI_VC_RES_CTRL + (res * PCI_CAP_VC_PER_VC_SIZEOF); diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..0153ede227e5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -418,7 +418,6 @@ struct pci_dev { unsigned int broken_intx_masking:1; /* INTx masking can't be used */ unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */ unsigned int irq_managed:1; - unsigned int has_secondary_link:1; unsigned int non_compliant_bars:1; /* Broken BARs; ignore them */ unsigned int is_probed:1; /* Device probing in progress */ unsigned int link_active_reporting:1;/* Device capable of reporting link active */ From de10ac47597e7a3596b27631d0d5ce5f48d2c099 Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Sun, 1 Sep 2019 12:54:10 +0200 Subject: [PATCH 0370/2880] iio: adc: meson_saradc: Fix memory allocation order meson_saradc's irq handler uses priv->regmap so make sure that it is allocated before the irq get enabled. This also fixes crash when CONFIG_DEBUG_SHIRQ is enabled, as device managed resources are freed in the inverted order they had been allocated, priv->regmap was freed before the spurious fake irq that CONFIG_DEBUG_SHIRQ adds called the handler. Fixes: 3af109131b7eb8 ("iio: adc: meson-saradc: switch from polling to interrupt mode") Reported-by: Elie Roudninski Signed-off-by: Remi Pommarel Reviewed-by: Martin Blumenstingl Tested-by: Elie ROUDNINSKI Reviewed-by: Kevin Hilman Signed-off-by: Jonathan Cameron --- drivers/iio/adc/meson_saradc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/iio/adc/meson_saradc.c b/drivers/iio/adc/meson_saradc.c index 7b28d045d271..7b27306330a3 100644 --- a/drivers/iio/adc/meson_saradc.c +++ b/drivers/iio/adc/meson_saradc.c @@ -1219,6 +1219,11 @@ static int meson_sar_adc_probe(struct platform_device *pdev) if (IS_ERR(base)) return PTR_ERR(base); + priv->regmap = devm_regmap_init_mmio(&pdev->dev, base, + priv->param->regmap_config); + if (IS_ERR(priv->regmap)) + return PTR_ERR(priv->regmap); + irq = irq_of_parse_and_map(pdev->dev.of_node, 0); if (!irq) return -EINVAL; @@ -1228,11 +1233,6 @@ static int meson_sar_adc_probe(struct platform_device *pdev) if (ret) return ret; - priv->regmap = devm_regmap_init_mmio(&pdev->dev, base, - priv->param->regmap_config); - if (IS_ERR(priv->regmap)) - return PTR_ERR(priv->regmap); - priv->clkin = devm_clk_get(&pdev->dev, "clkin"); if (IS_ERR(priv->clkin)) { dev_err(&pdev->dev, "failed to get clkin\n"); From 85ae3aeedeccb6febb0c6f9d5346d9c6419ad925 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 31 Aug 2019 13:18:22 +0200 Subject: [PATCH 0371/2880] iio: imu: st_lsm6dsx: forbid 0 sensor sensitivity Do not allow configuring null sensor gain since it will force to 0 device outputs Fixes: c8d4066c7246 ("iio: imu: st_lsm6dsx: remove invalid gain value for LSM9DS1") Signed-off-by: Lorenzo Bianconi Signed-off-by: Jonathan Cameron --- drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h | 2 ++ drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c | 28 +++++++++++++------- drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_shub.c | 11 +++----- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h index 80e42c7dbcbe..0fe6999b8257 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h @@ -99,7 +99,9 @@ struct st_lsm6dsx_fs { #define ST_LSM6DSX_FS_LIST_SIZE 4 struct st_lsm6dsx_fs_table_entry { struct st_lsm6dsx_reg reg; + struct st_lsm6dsx_fs fs_avl[ST_LSM6DSX_FS_LIST_SIZE]; + int fs_len; }; /** diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c index 2d3495560136..fd5ebe1e1594 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c @@ -145,6 +145,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_G_TO_M_S_2(122), 0x2 }, .fs_avl[2] = { IIO_G_TO_M_S_2(244), 0x3 }, .fs_avl[3] = { IIO_G_TO_M_S_2(732), 0x1 }, + .fs_len = 4, }, [ST_LSM6DSX_ID_GYRO] = { .reg = { @@ -154,6 +155,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[0] = { IIO_DEGREE_TO_RAD(245), 0x0 }, .fs_avl[1] = { IIO_DEGREE_TO_RAD(500), 0x1 }, .fs_avl[2] = { IIO_DEGREE_TO_RAD(2000), 0x3 }, + .fs_len = 3, }, }, }, @@ -215,6 +217,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_G_TO_M_S_2(122), 0x2 }, .fs_avl[2] = { IIO_G_TO_M_S_2(244), 0x3 }, .fs_avl[3] = { IIO_G_TO_M_S_2(488), 0x1 }, + .fs_len = 4, }, [ST_LSM6DSX_ID_GYRO] = { .reg = { @@ -225,6 +228,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_DEGREE_TO_RAD(17500), 0x1 }, .fs_avl[2] = { IIO_DEGREE_TO_RAD(35000), 0x2 }, .fs_avl[3] = { IIO_DEGREE_TO_RAD(70000), 0x3 }, + .fs_len = 4, }, }, .decimator = { @@ -327,6 +331,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_G_TO_M_S_2(122), 0x2 }, .fs_avl[2] = { IIO_G_TO_M_S_2(244), 0x3 }, .fs_avl[3] = { IIO_G_TO_M_S_2(488), 0x1 }, + .fs_len = 4, }, [ST_LSM6DSX_ID_GYRO] = { .reg = { @@ -337,6 +342,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_DEGREE_TO_RAD(17500), 0x1 }, .fs_avl[2] = { IIO_DEGREE_TO_RAD(35000), 0x2 }, .fs_avl[3] = { IIO_DEGREE_TO_RAD(70000), 0x3 }, + .fs_len = 4, }, }, .decimator = { @@ -448,6 +454,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_G_TO_M_S_2(122), 0x2 }, .fs_avl[2] = { IIO_G_TO_M_S_2(244), 0x3 }, .fs_avl[3] = { IIO_G_TO_M_S_2(488), 0x1 }, + .fs_len = 4, }, [ST_LSM6DSX_ID_GYRO] = { .reg = { @@ -458,6 +465,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_DEGREE_TO_RAD(17500), 0x1 }, .fs_avl[2] = { IIO_DEGREE_TO_RAD(35000), 0x2 }, .fs_avl[3] = { IIO_DEGREE_TO_RAD(70000), 0x3 }, + .fs_len = 4, }, }, .decimator = { @@ -563,6 +571,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_G_TO_M_S_2(122), 0x2 }, .fs_avl[2] = { IIO_G_TO_M_S_2(244), 0x3 }, .fs_avl[3] = { IIO_G_TO_M_S_2(488), 0x1 }, + .fs_len = 4, }, [ST_LSM6DSX_ID_GYRO] = { .reg = { @@ -573,6 +582,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_DEGREE_TO_RAD(17500), 0x1 }, .fs_avl[2] = { IIO_DEGREE_TO_RAD(35000), 0x2 }, .fs_avl[3] = { IIO_DEGREE_TO_RAD(70000), 0x3 }, + .fs_len = 4, }, }, .batch = { @@ -693,6 +703,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_G_TO_M_S_2(122), 0x2 }, .fs_avl[2] = { IIO_G_TO_M_S_2(244), 0x3 }, .fs_avl[3] = { IIO_G_TO_M_S_2(488), 0x1 }, + .fs_len = 4, }, [ST_LSM6DSX_ID_GYRO] = { .reg = { @@ -703,6 +714,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_DEGREE_TO_RAD(17500), 0x1 }, .fs_avl[2] = { IIO_DEGREE_TO_RAD(35000), 0x2 }, .fs_avl[3] = { IIO_DEGREE_TO_RAD(70000), 0x3 }, + .fs_len = 4, }, }, .batch = { @@ -800,6 +812,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_G_TO_M_S_2(122), 0x2 }, .fs_avl[2] = { IIO_G_TO_M_S_2(244), 0x3 }, .fs_avl[3] = { IIO_G_TO_M_S_2(488), 0x1 }, + .fs_len = 4, }, [ST_LSM6DSX_ID_GYRO] = { .reg = { @@ -810,6 +823,7 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .fs_avl[1] = { IIO_DEGREE_TO_RAD(17500), 0x1 }, .fs_avl[2] = { IIO_DEGREE_TO_RAD(35000), 0x2 }, .fs_avl[3] = { IIO_DEGREE_TO_RAD(70000), 0x3 }, + .fs_len = 4, }, }, .batch = { @@ -933,11 +947,12 @@ static int st_lsm6dsx_set_full_scale(struct st_lsm6dsx_sensor *sensor, int i, err; fs_table = &sensor->hw->settings->fs_table[sensor->id]; - for (i = 0; i < ST_LSM6DSX_FS_LIST_SIZE; i++) + for (i = 0; i < fs_table->fs_len; i++) { if (fs_table->fs_avl[i].gain == gain) break; + } - if (i == ST_LSM6DSX_FS_LIST_SIZE) + if (i == fs_table->fs_len) return -EINVAL; data = ST_LSM6DSX_SHIFT_VAL(fs_table->fs_avl[i].val, @@ -1196,18 +1211,13 @@ static ssize_t st_lsm6dsx_sysfs_scale_avail(struct device *dev, { struct st_lsm6dsx_sensor *sensor = iio_priv(dev_get_drvdata(dev)); const struct st_lsm6dsx_fs_table_entry *fs_table; - enum st_lsm6dsx_sensor_id id = sensor->id; struct st_lsm6dsx_hw *hw = sensor->hw; int i, len = 0; - fs_table = &hw->settings->fs_table[id]; - for (i = 0; i < ST_LSM6DSX_FS_LIST_SIZE; i++) { - if (!fs_table->fs_avl[i].gain) - break; - + fs_table = &hw->settings->fs_table[sensor->id]; + for (i = 0; i < fs_table->fs_len; i++) len += scnprintf(buf + len, PAGE_SIZE - len, "0.%06u ", fs_table->fs_avl[i].gain); - } buf[len - 1] = '\n'; return len; diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_shub.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_shub.c index 66fbcd94642d..f5fca2171954 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_shub.c +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_shub.c @@ -61,6 +61,7 @@ static const struct st_lsm6dsx_ext_dev_settings st_lsm6dsx_ext_dev_table[] = { .gain = 1500, .val = 0x0, }, /* 1500 uG/LSB */ + .fs_len = 1, }, .temp_comp = { .addr = 0x60, @@ -555,13 +556,9 @@ static ssize_t st_lsm6dsx_shub_scale_avail(struct device *dev, int i, len = 0; settings = sensor->ext_info.settings; - for (i = 0; i < ST_LSM6DSX_FS_LIST_SIZE; i++) { - u16 val = settings->fs_table.fs_avl[i].gain; - - if (val > 0) - len += scnprintf(buf + len, PAGE_SIZE - len, "0.%06u ", - val); - } + for (i = 0; i < settings->fs_table.fs_len; i++) + len += scnprintf(buf + len, PAGE_SIZE - len, "0.%06u ", + settings->fs_table.fs_avl[i].gain); buf[len - 1] = '\n'; return len; From 56e15a238d92788a2d09e0c5c26a5de1b3156931 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:27 +0530 Subject: [PATCH 0372/2880] PCI: tegra: Add Tegra194 PCIe support Add support for Synopsys DesignWare core IP based PCIe host controller present in the Tegra194 SoC. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Acked-by: Thierry Reding --- drivers/pci/controller/dwc/Kconfig | 10 + drivers/pci/controller/dwc/Makefile | 1 + drivers/pci/controller/dwc/pcie-designware.c | 2 +- drivers/pci/controller/dwc/pcie-tegra194.c | 1641 ++++++++++++++++++ 4 files changed, 1653 insertions(+), 1 deletion(-) create mode 100644 drivers/pci/controller/dwc/pcie-tegra194.c diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 6ea778ae4877..49475f5c42c3 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -220,6 +220,16 @@ config PCI_MESON and therefore the driver re-uses the DesignWare core functions to implement the driver. +config PCIE_TEGRA194 + tristate "NVIDIA Tegra194 (and later) PCIe controller" + depends on ARCH_TEGRA_194_SOC || COMPILE_TEST + depends on PCI_MSI_IRQ_DOMAIN + select PCIE_DW_HOST + select PHY_TEGRA194_P2U + help + Say Y here if you want support for DesignWare core based PCIe host + controller found in NVIDIA Tegra194 SoC. + config PCIE_UNIPHIER bool "Socionext UniPhier PCIe controllers" depends on ARCH_UNIPHIER || COMPILE_TEST diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index b085dfd4fab7..b30336181d46 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o obj-$(CONFIG_PCIE_KIRIN) += pcie-kirin.o obj-$(CONFIG_PCIE_HISI_STB) += pcie-histb.o obj-$(CONFIG_PCI_MESON) += pci-meson.o +obj-$(CONFIG_PCIE_TEGRA194) += pcie-tegra194.o obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o # The following drivers are for devices that use the generic ACPI diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 59eaeeb21dbe..4d6690b6ca36 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -456,7 +456,7 @@ int dw_pcie_wait_for_link(struct dw_pcie *pci) usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX); } - dev_err(pci->dev, "Phy link never came up\n"); + dev_info(pci->dev, "Phy link never came up\n"); return -ETIMEDOUT; } diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c new file mode 100644 index 000000000000..6056414c07d4 --- /dev/null +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -0,0 +1,1641 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * PCIe host controller driver for Tegra194 SoC + * + * Copyright (C) 2019 NVIDIA Corporation. + * + * Author: Vidya Sagar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pcie-designware.h" +#include +#include +#include "../../pci.h" + +#define APPL_PINMUX 0x0 +#define APPL_PINMUX_PEX_RST BIT(0) +#define APPL_PINMUX_CLKREQ_OVERRIDE_EN BIT(2) +#define APPL_PINMUX_CLKREQ_OVERRIDE BIT(3) +#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE_EN BIT(4) +#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE BIT(5) +#define APPL_PINMUX_CLKREQ_OUT_OVRD_EN BIT(9) +#define APPL_PINMUX_CLKREQ_OUT_OVRD BIT(10) + +#define APPL_CTRL 0x4 +#define APPL_CTRL_SYS_PRE_DET_STATE BIT(6) +#define APPL_CTRL_LTSSM_EN BIT(7) +#define APPL_CTRL_HW_HOT_RST_EN BIT(20) +#define APPL_CTRL_HW_HOT_RST_MODE_MASK GENMASK(1, 0) +#define APPL_CTRL_HW_HOT_RST_MODE_SHIFT 22 +#define APPL_CTRL_HW_HOT_RST_MODE_IMDT_RST 0x1 + +#define APPL_INTR_EN_L0_0 0x8 +#define APPL_INTR_EN_L0_0_LINK_STATE_INT_EN BIT(0) +#define APPL_INTR_EN_L0_0_MSI_RCV_INT_EN BIT(4) +#define APPL_INTR_EN_L0_0_INT_INT_EN BIT(8) +#define APPL_INTR_EN_L0_0_CDM_REG_CHK_INT_EN BIT(19) +#define APPL_INTR_EN_L0_0_SYS_INTR_EN BIT(30) +#define APPL_INTR_EN_L0_0_SYS_MSI_INTR_EN BIT(31) + +#define APPL_INTR_STATUS_L0 0xC +#define APPL_INTR_STATUS_L0_LINK_STATE_INT BIT(0) +#define APPL_INTR_STATUS_L0_INT_INT BIT(8) +#define APPL_INTR_STATUS_L0_CDM_REG_CHK_INT BIT(18) + +#define APPL_INTR_EN_L1_0_0 0x1C +#define APPL_INTR_EN_L1_0_0_LINK_REQ_RST_NOT_INT_EN BIT(1) + +#define APPL_INTR_STATUS_L1_0_0 0x20 +#define APPL_INTR_STATUS_L1_0_0_LINK_REQ_RST_NOT_CHGED BIT(1) + +#define APPL_INTR_STATUS_L1_1 0x2C +#define APPL_INTR_STATUS_L1_2 0x30 +#define APPL_INTR_STATUS_L1_3 0x34 +#define APPL_INTR_STATUS_L1_6 0x3C +#define APPL_INTR_STATUS_L1_7 0x40 + +#define APPL_INTR_EN_L1_8_0 0x44 +#define APPL_INTR_EN_L1_8_BW_MGT_INT_EN BIT(2) +#define APPL_INTR_EN_L1_8_AUTO_BW_INT_EN BIT(3) +#define APPL_INTR_EN_L1_8_INTX_EN BIT(11) +#define APPL_INTR_EN_L1_8_AER_INT_EN BIT(15) + +#define APPL_INTR_STATUS_L1_8_0 0x4C +#define APPL_INTR_STATUS_L1_8_0_EDMA_INT_MASK GENMASK(11, 6) +#define APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS BIT(2) +#define APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS BIT(3) + +#define APPL_INTR_STATUS_L1_9 0x54 +#define APPL_INTR_STATUS_L1_10 0x58 +#define APPL_INTR_STATUS_L1_11 0x64 +#define APPL_INTR_STATUS_L1_13 0x74 +#define APPL_INTR_STATUS_L1_14 0x78 +#define APPL_INTR_STATUS_L1_15 0x7C +#define APPL_INTR_STATUS_L1_17 0x88 + +#define APPL_INTR_EN_L1_18 0x90 +#define APPL_INTR_EN_L1_18_CDM_REG_CHK_CMPLT BIT(2) +#define APPL_INTR_EN_L1_18_CDM_REG_CHK_CMP_ERR BIT(1) +#define APPL_INTR_EN_L1_18_CDM_REG_CHK_LOGIC_ERR BIT(0) + +#define APPL_INTR_STATUS_L1_18 0x94 +#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMPLT BIT(2) +#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMP_ERR BIT(1) +#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_LOGIC_ERR BIT(0) + +#define APPL_MSI_CTRL_2 0xB0 + +#define APPL_LTR_MSG_1 0xC4 +#define LTR_MSG_REQ BIT(15) +#define LTR_MST_NO_SNOOP_SHIFT 16 + +#define APPL_LTR_MSG_2 0xC8 +#define APPL_LTR_MSG_2_LTR_MSG_REQ_STATE BIT(3) + +#define APPL_LINK_STATUS 0xCC +#define APPL_LINK_STATUS_RDLH_LINK_UP BIT(0) + +#define APPL_DEBUG 0xD0 +#define APPL_DEBUG_PM_LINKST_IN_L2_LAT BIT(21) +#define APPL_DEBUG_PM_LINKST_IN_L0 0x11 +#define APPL_DEBUG_LTSSM_STATE_MASK GENMASK(8, 3) +#define APPL_DEBUG_LTSSM_STATE_SHIFT 3 +#define LTSSM_STATE_PRE_DETECT 5 + +#define APPL_RADM_STATUS 0xE4 +#define APPL_PM_XMT_TURNOFF_STATE BIT(0) + +#define APPL_DM_TYPE 0x100 +#define APPL_DM_TYPE_MASK GENMASK(3, 0) +#define APPL_DM_TYPE_RP 0x4 +#define APPL_DM_TYPE_EP 0x0 + +#define APPL_CFG_BASE_ADDR 0x104 +#define APPL_CFG_BASE_ADDR_MASK GENMASK(31, 12) + +#define APPL_CFG_IATU_DMA_BASE_ADDR 0x108 +#define APPL_CFG_IATU_DMA_BASE_ADDR_MASK GENMASK(31, 18) + +#define APPL_CFG_MISC 0x110 +#define APPL_CFG_MISC_SLV_EP_MODE BIT(14) +#define APPL_CFG_MISC_ARCACHE_MASK GENMASK(13, 10) +#define APPL_CFG_MISC_ARCACHE_SHIFT 10 +#define APPL_CFG_MISC_ARCACHE_VAL 3 + +#define APPL_CFG_SLCG_OVERRIDE 0x114 +#define APPL_CFG_SLCG_OVERRIDE_SLCG_EN_MASTER BIT(0) + +#define APPL_CAR_RESET_OVRD 0x12C +#define APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N BIT(0) + +#define IO_BASE_IO_DECODE BIT(0) +#define IO_BASE_IO_DECODE_BIT8 BIT(8) + +#define CFG_PREF_MEM_LIMIT_BASE_MEM_DECODE BIT(0) +#define CFG_PREF_MEM_LIMIT_BASE_MEM_LIMIT_DECODE BIT(16) + +#define CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF 0x718 +#define CFG_TIMER_CTRL_ACK_NAK_SHIFT (19) + +#define EVENT_COUNTER_ALL_CLEAR 0x3 +#define EVENT_COUNTER_ENABLE_ALL 0x7 +#define EVENT_COUNTER_ENABLE_SHIFT 2 +#define EVENT_COUNTER_EVENT_SEL_MASK GENMASK(7, 0) +#define EVENT_COUNTER_EVENT_SEL_SHIFT 16 +#define EVENT_COUNTER_EVENT_Tx_L0S 0x2 +#define EVENT_COUNTER_EVENT_Rx_L0S 0x3 +#define EVENT_COUNTER_EVENT_L1 0x5 +#define EVENT_COUNTER_EVENT_L1_1 0x7 +#define EVENT_COUNTER_EVENT_L1_2 0x8 +#define EVENT_COUNTER_GROUP_SEL_SHIFT 24 +#define EVENT_COUNTER_GROUP_5 0x5 + +#define PORT_LOGIC_ACK_F_ASPM_CTRL 0x70C +#define ENTER_ASPM BIT(30) +#define L0S_ENTRANCE_LAT_SHIFT 24 +#define L0S_ENTRANCE_LAT_MASK GENMASK(26, 24) +#define L1_ENTRANCE_LAT_SHIFT 27 +#define L1_ENTRANCE_LAT_MASK GENMASK(29, 27) +#define N_FTS_SHIFT 8 +#define N_FTS_MASK GENMASK(7, 0) +#define N_FTS_VAL 52 + +#define PORT_LOGIC_GEN2_CTRL 0x80C +#define PORT_LOGIC_GEN2_CTRL_DIRECT_SPEED_CHANGE BIT(17) +#define FTS_MASK GENMASK(7, 0) +#define FTS_VAL 52 + +#define PORT_LOGIC_MSI_CTRL_INT_0_EN 0x828 + +#define GEN3_EQ_CONTROL_OFF 0x8a8 +#define GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT 8 +#define GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK GENMASK(23, 8) +#define GEN3_EQ_CONTROL_OFF_FB_MODE_MASK GENMASK(3, 0) + +#define GEN3_RELATED_OFF 0x890 +#define GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL BIT(0) +#define GEN3_RELATED_OFF_GEN3_EQ_DISABLE BIT(16) +#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT 24 +#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK GENMASK(25, 24) + +#define PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT 0x8D0 +#define AMBA_ERROR_RESPONSE_CRS_SHIFT 3 +#define AMBA_ERROR_RESPONSE_CRS_MASK GENMASK(1, 0) +#define AMBA_ERROR_RESPONSE_CRS_OKAY 0 +#define AMBA_ERROR_RESPONSE_CRS_OKAY_FFFFFFFF 1 +#define AMBA_ERROR_RESPONSE_CRS_OKAY_FFFF0001 2 + +#define PORT_LOGIC_MSIX_DOORBELL 0x948 + +#define CAP_SPCIE_CAP_OFF 0x154 +#define CAP_SPCIE_CAP_OFF_DSP_TX_PRESET0_MASK GENMASK(3, 0) +#define CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_MASK GENMASK(11, 8) +#define CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_SHIFT 8 + +#define PME_ACK_TIMEOUT 10000 + +#define LTSSM_TIMEOUT 50000 /* 50ms */ + +#define GEN3_GEN4_EQ_PRESET_INIT 5 + +#define GEN1_CORE_CLK_FREQ 62500000 +#define GEN2_CORE_CLK_FREQ 125000000 +#define GEN3_CORE_CLK_FREQ 250000000 +#define GEN4_CORE_CLK_FREQ 500000000 + +static const unsigned int pcie_gen_freq[] = { + GEN1_CORE_CLK_FREQ, + GEN2_CORE_CLK_FREQ, + GEN3_CORE_CLK_FREQ, + GEN4_CORE_CLK_FREQ +}; + +static const u32 event_cntr_ctrl_offset[] = { + 0x1d8, + 0x1a8, + 0x1a8, + 0x1a8, + 0x1c4, + 0x1d8 +}; + +static const u32 event_cntr_data_offset[] = { + 0x1dc, + 0x1ac, + 0x1ac, + 0x1ac, + 0x1c8, + 0x1dc +}; + +struct tegra_pcie_dw { + struct device *dev; + struct resource *appl_res; + struct resource *dbi_res; + struct resource *atu_dma_res; + void __iomem *appl_base; + struct clk *core_clk; + struct reset_control *core_apb_rst; + struct reset_control *core_rst; + struct dw_pcie pci; + struct tegra_bpmp *bpmp; + + bool supports_clkreq; + bool enable_cdm_check; + bool link_state; + bool update_fc_fixup; + u8 init_link_width; + u32 msi_ctrl_int; + u32 num_lanes; + u32 max_speed; + u32 cid; + u32 cfg_link_cap_l1sub; + u32 pcie_cap_base; + u32 aspm_cmrt; + u32 aspm_pwr_on_t; + u32 aspm_l0s_enter_lat; + + struct regulator *pex_ctl_supply; + + unsigned int phy_count; + struct phy **phys; + + struct dentry *debugfs; +}; + +static inline struct tegra_pcie_dw *to_tegra_pcie(struct dw_pcie *pci) +{ + return container_of(pci, struct tegra_pcie_dw, pci); +} + +static inline void appl_writel(struct tegra_pcie_dw *pcie, const u32 value, + const u32 reg) +{ + writel_relaxed(value, pcie->appl_base + reg); +} + +static inline u32 appl_readl(struct tegra_pcie_dw *pcie, const u32 reg) +{ + return readl_relaxed(pcie->appl_base + reg); +} + +struct tegra_pcie_soc { + enum dw_pcie_device_mode mode; +}; + +static void apply_bad_link_workaround(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 current_link_width; + u16 val; + + /* + * NOTE:- Since this scenario is uncommon and link as such is not + * stable anyway, not waiting to confirm if link is really + * transitioning to Gen-2 speed + */ + val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA); + if (val & PCI_EXP_LNKSTA_LBMS) { + current_link_width = (val & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + if (pcie->init_link_width > current_link_width) { + dev_warn(pci->dev, "PCIe link is bad, width reduced\n"); + val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL2); + val &= ~PCI_EXP_LNKCTL2_TLS; + val |= PCI_EXP_LNKCTL2_TLS_2_5GT; + dw_pcie_writew_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL2, val); + + val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL); + val |= PCI_EXP_LNKCTL_RL; + dw_pcie_writew_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL, val); + } + } +} + +static irqreturn_t tegra_pcie_rp_irq_handler(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + struct pcie_port *pp = &pci->pp; + u32 val, tmp; + u16 val_w; + + val = appl_readl(pcie, APPL_INTR_STATUS_L0); + if (val & APPL_INTR_STATUS_L0_LINK_STATE_INT) { + val = appl_readl(pcie, APPL_INTR_STATUS_L1_0_0); + if (val & APPL_INTR_STATUS_L1_0_0_LINK_REQ_RST_NOT_CHGED) { + appl_writel(pcie, val, APPL_INTR_STATUS_L1_0_0); + + /* SBR & Surprise Link Down WAR */ + val = appl_readl(pcie, APPL_CAR_RESET_OVRD); + val &= ~APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N; + appl_writel(pcie, val, APPL_CAR_RESET_OVRD); + udelay(1); + val = appl_readl(pcie, APPL_CAR_RESET_OVRD); + val |= APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N; + appl_writel(pcie, val, APPL_CAR_RESET_OVRD); + + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_GEN2_CTRL); + val |= PORT_LOGIC_GEN2_CTRL_DIRECT_SPEED_CHANGE; + dw_pcie_writel_dbi(pci, PORT_LOGIC_GEN2_CTRL, val); + } + } + + if (val & APPL_INTR_STATUS_L0_INT_INT) { + val = appl_readl(pcie, APPL_INTR_STATUS_L1_8_0); + if (val & APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS) { + appl_writel(pcie, + APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS, + APPL_INTR_STATUS_L1_8_0); + apply_bad_link_workaround(pp); + } + if (val & APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS) { + appl_writel(pcie, + APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS, + APPL_INTR_STATUS_L1_8_0); + + val_w = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKSTA); + dev_dbg(pci->dev, "Link Speed : Gen-%u\n", val_w & + PCI_EXP_LNKSTA_CLS); + } + } + + val = appl_readl(pcie, APPL_INTR_STATUS_L0); + if (val & APPL_INTR_STATUS_L0_CDM_REG_CHK_INT) { + val = appl_readl(pcie, APPL_INTR_STATUS_L1_18); + tmp = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS); + if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMPLT) { + dev_info(pci->dev, "CDM check complete\n"); + tmp |= PCIE_PL_CHK_REG_CHK_REG_COMPLETE; + } + if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMP_ERR) { + dev_err(pci->dev, "CDM comparison mismatch\n"); + tmp |= PCIE_PL_CHK_REG_CHK_REG_COMPARISON_ERROR; + } + if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_LOGIC_ERR) { + dev_err(pci->dev, "CDM Logic error\n"); + tmp |= PCIE_PL_CHK_REG_CHK_REG_LOGIC_ERROR; + } + dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, tmp); + tmp = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_ERR_ADDR); + dev_err(pci->dev, "CDM Error Address Offset = 0x%08X\n", tmp); + } + + return IRQ_HANDLED; +} + +static irqreturn_t tegra_pcie_irq_handler(int irq, void *arg) +{ + struct tegra_pcie_dw *pcie = arg; + + return tegra_pcie_rp_irq_handler(pcie); +} + +static int tegra_pcie_dw_rd_own_conf(struct pcie_port *pp, int where, int size, + u32 *val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + + /* + * This is an endpoint mode specific register happen to appear even + * when controller is operating in root port mode and system hangs + * when it is accessed with link being in ASPM-L1 state. + * So skip accessing it altogether + */ + if (where == PORT_LOGIC_MSIX_DOORBELL) { + *val = 0x00000000; + return PCIBIOS_SUCCESSFUL; + } + + return dw_pcie_read(pci->dbi_base + where, size, val); +} + +static int tegra_pcie_dw_wr_own_conf(struct pcie_port *pp, int where, int size, + u32 val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + + /* + * This is an endpoint mode specific register happen to appear even + * when controller is operating in root port mode and system hangs + * when it is accessed with link being in ASPM-L1 state. + * So skip accessing it altogether + */ + if (where == PORT_LOGIC_MSIX_DOORBELL) + return PCIBIOS_SUCCESSFUL; + + return dw_pcie_write(pci->dbi_base + where, size, val); +} + +#if defined(CONFIG_PCIEASPM) +static void disable_aspm_l11(struct tegra_pcie_dw *pcie) +{ + u32 val; + + val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub); + val &= ~PCI_L1SS_CAP_ASPM_L1_1; + dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val); +} + +static void disable_aspm_l12(struct tegra_pcie_dw *pcie) +{ + u32 val; + + val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub); + val &= ~PCI_L1SS_CAP_ASPM_L1_2; + dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val); +} + +static inline u32 event_counter_prog(struct tegra_pcie_dw *pcie, u32 event) +{ + u32 val; + + val = dw_pcie_readl_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid]); + val &= ~(EVENT_COUNTER_EVENT_SEL_MASK << EVENT_COUNTER_EVENT_SEL_SHIFT); + val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT; + val |= event << EVENT_COUNTER_EVENT_SEL_SHIFT; + val |= EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT; + dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], val); + val = dw_pcie_readl_dbi(&pcie->pci, event_cntr_data_offset[pcie->cid]); + + return val; +} + +static int aspm_state_cnt(struct seq_file *s, void *data) +{ + struct tegra_pcie_dw *pcie = (struct tegra_pcie_dw *) + dev_get_drvdata(s->private); + u32 val; + + seq_printf(s, "Tx L0s entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_Tx_L0S)); + + seq_printf(s, "Rx L0s entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_Rx_L0S)); + + seq_printf(s, "Link L1 entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1)); + + seq_printf(s, "Link L1.1 entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1_1)); + + seq_printf(s, "Link L1.2 entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1_2)); + + /* Clear all counters */ + dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], + EVENT_COUNTER_ALL_CLEAR); + + /* Re-enable counting */ + val = EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT; + val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT; + dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], val); + + return 0; +} + +static void init_host_aspm(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + u32 val; + + val = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS); + pcie->cfg_link_cap_l1sub = val + PCI_L1SS_CAP; + + /* Enable ASPM counters */ + val = EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT; + val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT; + dw_pcie_writel_dbi(pci, event_cntr_ctrl_offset[pcie->cid], val); + + /* Program T_cmrt and T_pwr_on values */ + val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub); + val &= ~(PCI_L1SS_CAP_CM_RESTORE_TIME | PCI_L1SS_CAP_P_PWR_ON_VALUE); + val |= (pcie->aspm_cmrt << 8); + val |= (pcie->aspm_pwr_on_t << 19); + dw_pcie_writel_dbi(pci, pcie->cfg_link_cap_l1sub, val); + + /* Program L0s and L1 entrance latencies */ + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL); + val &= ~L0S_ENTRANCE_LAT_MASK; + val |= (pcie->aspm_l0s_enter_lat << L0S_ENTRANCE_LAT_SHIFT); + val |= ENTER_ASPM; + dw_pcie_writel_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL, val); +} + +static int init_debugfs(struct tegra_pcie_dw *pcie) +{ + struct dentry *d; + + d = debugfs_create_devm_seqfile(pcie->dev, "aspm_state_cnt", + pcie->debugfs, aspm_state_cnt); + if (IS_ERR_OR_NULL(d)) + dev_err(pcie->dev, + "Failed to create debugfs file \"aspm_state_cnt\"\n"); + + return 0; +} +#else +static inline void disable_aspm_l12(struct tegra_pcie_dw *pcie) { return; } +static inline void disable_aspm_l11(struct tegra_pcie_dw *pcie) { return; } +static inline void init_host_aspm(struct tegra_pcie_dw *pcie) { return; } +static inline int init_debugfs(struct tegra_pcie_dw *pcie) { return 0; } +#endif + +static void tegra_pcie_enable_system_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + u16 val_w; + + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_LINK_STATE_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); + + val = appl_readl(pcie, APPL_INTR_EN_L1_0_0); + val |= APPL_INTR_EN_L1_0_0_LINK_REQ_RST_NOT_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L1_0_0); + + if (pcie->enable_cdm_check) { + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_CDM_REG_CHK_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); + + val = appl_readl(pcie, APPL_INTR_EN_L1_18); + val |= APPL_INTR_EN_L1_18_CDM_REG_CHK_CMP_ERR; + val |= APPL_INTR_EN_L1_18_CDM_REG_CHK_LOGIC_ERR; + appl_writel(pcie, val, APPL_INTR_EN_L1_18); + } + + val_w = dw_pcie_readw_dbi(&pcie->pci, pcie->pcie_cap_base + + PCI_EXP_LNKSTA); + pcie->init_link_width = (val_w & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + val_w = dw_pcie_readw_dbi(&pcie->pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL); + val_w |= PCI_EXP_LNKCTL_LBMIE; + dw_pcie_writew_dbi(&pcie->pci, pcie->pcie_cap_base + PCI_EXP_LNKCTL, + val_w); +} + +static void tegra_pcie_enable_legacy_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + + /* Enable legacy interrupt generation */ + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_SYS_INTR_EN; + val |= APPL_INTR_EN_L0_0_INT_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); + + val = appl_readl(pcie, APPL_INTR_EN_L1_8_0); + val |= APPL_INTR_EN_L1_8_INTX_EN; + val |= APPL_INTR_EN_L1_8_AUTO_BW_INT_EN; + val |= APPL_INTR_EN_L1_8_BW_MGT_INT_EN; + if (IS_ENABLED(CONFIG_PCIEAER)) + val |= APPL_INTR_EN_L1_8_AER_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L1_8_0); +} + +static void tegra_pcie_enable_msi_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + + dw_pcie_msi_init(pp); + + /* Enable MSI interrupt generation */ + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_SYS_MSI_INTR_EN; + val |= APPL_INTR_EN_L0_0_MSI_RCV_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); +} + +static void tegra_pcie_enable_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + + /* Clear interrupt statuses before enabling interrupts */ + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L0); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_0_0); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_1); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_2); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_3); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_6); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_7); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_8_0); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_9); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_10); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_11); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_13); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_14); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_15); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_17); + + tegra_pcie_enable_system_interrupts(pp); + tegra_pcie_enable_legacy_interrupts(pp); + if (IS_ENABLED(CONFIG_PCI_MSI)) + tegra_pcie_enable_msi_interrupts(pp); +} + +static void config_gen3_gen4_eq_presets(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + u32 val, offset, i; + + /* Program init preset */ + for (i = 0; i < pcie->num_lanes; i++) { + dw_pcie_read(pci->dbi_base + CAP_SPCIE_CAP_OFF + + (i * 2), 2, &val); + val &= ~CAP_SPCIE_CAP_OFF_DSP_TX_PRESET0_MASK; + val |= GEN3_GEN4_EQ_PRESET_INIT; + val &= ~CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_MASK; + val |= (GEN3_GEN4_EQ_PRESET_INIT << + CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_SHIFT); + dw_pcie_write(pci->dbi_base + CAP_SPCIE_CAP_OFF + + (i * 2), 2, val); + + offset = dw_pcie_find_ext_capability(pci, + PCI_EXT_CAP_ID_PL_16GT) + + PCI_PL_16GT_LE_CTRL; + dw_pcie_read(pci->dbi_base + offset + i, 1, &val); + val &= ~PCI_PL_16GT_LE_CTRL_DSP_TX_PRESET_MASK; + val |= GEN3_GEN4_EQ_PRESET_INIT; + val &= ~PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK; + val |= (GEN3_GEN4_EQ_PRESET_INIT << + PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT); + dw_pcie_write(pci->dbi_base + offset + i, 1, val); + } + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_EQ_CONTROL_OFF); + val &= ~GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK; + val |= (0x3ff << GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT); + val &= ~GEN3_EQ_CONTROL_OFF_FB_MODE_MASK; + dw_pcie_writel_dbi(pci, GEN3_EQ_CONTROL_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK; + val |= (0x1 << GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT); + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_EQ_CONTROL_OFF); + val &= ~GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK; + val |= (0x360 << GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT); + val &= ~GEN3_EQ_CONTROL_OFF_FB_MODE_MASK; + dw_pcie_writel_dbi(pci, GEN3_EQ_CONTROL_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); +} + +static void tegra_pcie_prepare_host(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + + val = dw_pcie_readl_dbi(pci, PCI_IO_BASE); + val &= ~(IO_BASE_IO_DECODE | IO_BASE_IO_DECODE_BIT8); + dw_pcie_writel_dbi(pci, PCI_IO_BASE, val); + + val = dw_pcie_readl_dbi(pci, PCI_PREF_MEMORY_BASE); + val |= CFG_PREF_MEM_LIMIT_BASE_MEM_DECODE; + val |= CFG_PREF_MEM_LIMIT_BASE_MEM_LIMIT_DECODE; + dw_pcie_writel_dbi(pci, PCI_PREF_MEMORY_BASE, val); + + dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_0, 0); + + /* Configure FTS */ + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL); + val &= ~(N_FTS_MASK << N_FTS_SHIFT); + val |= N_FTS_VAL << N_FTS_SHIFT; + dw_pcie_writel_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL, val); + + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_GEN2_CTRL); + val &= ~FTS_MASK; + val |= FTS_VAL; + dw_pcie_writel_dbi(pci, PORT_LOGIC_GEN2_CTRL, val); + + /* Enable as 0xFFFF0001 response for CRS */ + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT); + val &= ~(AMBA_ERROR_RESPONSE_CRS_MASK << AMBA_ERROR_RESPONSE_CRS_SHIFT); + val |= (AMBA_ERROR_RESPONSE_CRS_OKAY_FFFF0001 << + AMBA_ERROR_RESPONSE_CRS_SHIFT); + dw_pcie_writel_dbi(pci, PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT, val); + + /* Configure Max Speed from DT */ + if (pcie->max_speed && pcie->max_speed != -EINVAL) { + val = dw_pcie_readl_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_SLS; + val |= pcie->max_speed; + dw_pcie_writel_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP, + val); + } + + /* Configure Max lane width from DT */ + val = dw_pcie_readl_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_MLW; + val |= (pcie->num_lanes << PCI_EXP_LNKSTA_NLW_SHIFT); + dw_pcie_writel_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP, val); + + config_gen3_gen4_eq_presets(pcie); + + init_host_aspm(pcie); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + if (pcie->update_fc_fixup) { + val = dw_pcie_readl_dbi(pci, CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF); + val |= 0x1 << CFG_TIMER_CTRL_ACK_NAK_SHIFT; + dw_pcie_writel_dbi(pci, CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF, val); + } + + dw_pcie_setup_rc(pp); + + clk_set_rate(pcie->core_clk, GEN4_CORE_CLK_FREQ); + + /* Assert RST */ + val = appl_readl(pcie, APPL_PINMUX); + val &= ~APPL_PINMUX_PEX_RST; + appl_writel(pcie, val, APPL_PINMUX); + + usleep_range(100, 200); + + /* Enable LTSSM */ + val = appl_readl(pcie, APPL_CTRL); + val |= APPL_CTRL_LTSSM_EN; + appl_writel(pcie, val, APPL_CTRL); + + /* De-assert RST */ + val = appl_readl(pcie, APPL_PINMUX); + val |= APPL_PINMUX_PEX_RST; + appl_writel(pcie, val, APPL_PINMUX); + + msleep(100); +} + +static int tegra_pcie_dw_host_init(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val, tmp, offset, speed; + + tegra_pcie_prepare_host(pp); + + if (dw_pcie_wait_for_link(pci)) { + /* + * There are some endpoints which can't get the link up if + * root port has Data Link Feature (DLF) enabled. + * Refer Spec rev 4.0 ver 1.0 sec 3.4.2 & 7.7.4 for more info + * on Scaled Flow Control and DLF. + * So, need to confirm that is indeed the case here and attempt + * link up once again with DLF disabled. + */ + val = appl_readl(pcie, APPL_DEBUG); + val &= APPL_DEBUG_LTSSM_STATE_MASK; + val >>= APPL_DEBUG_LTSSM_STATE_SHIFT; + tmp = appl_readl(pcie, APPL_LINK_STATUS); + tmp &= APPL_LINK_STATUS_RDLH_LINK_UP; + if (!(val == 0x11 && !tmp)) { + /* Link is down for all good reasons */ + return 0; + } + + dev_info(pci->dev, "Link is down in DLL"); + dev_info(pci->dev, "Trying again with DLFE disabled\n"); + /* Disable LTSSM */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~APPL_CTRL_LTSSM_EN; + appl_writel(pcie, val, APPL_CTRL); + + reset_control_assert(pcie->core_rst); + reset_control_deassert(pcie->core_rst); + + offset = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_DLF); + val = dw_pcie_readl_dbi(pci, offset + PCI_DLF_CAP); + val &= ~PCI_DLF_EXCHANGE_ENABLE; + dw_pcie_writel_dbi(pci, offset, val); + + tegra_pcie_prepare_host(pp); + + if (dw_pcie_wait_for_link(pci)) + return 0; + } + + speed = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA) & + PCI_EXP_LNKSTA_CLS; + clk_set_rate(pcie->core_clk, pcie_gen_freq[speed - 1]); + + tegra_pcie_enable_interrupts(pp); + + return 0; +} + +static int tegra_pcie_dw_link_up(struct dw_pcie *pci) +{ + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA); + + return !!(val & PCI_EXP_LNKSTA_DLLLA); +} + +static void tegra_pcie_set_msi_vec_num(struct pcie_port *pp) +{ + pp->num_vectors = MAX_MSI_IRQS; +} + +static const struct dw_pcie_ops tegra_dw_pcie_ops = { + .link_up = tegra_pcie_dw_link_up, +}; + +static struct dw_pcie_host_ops tegra_pcie_dw_host_ops = { + .rd_own_conf = tegra_pcie_dw_rd_own_conf, + .wr_own_conf = tegra_pcie_dw_wr_own_conf, + .host_init = tegra_pcie_dw_host_init, + .set_num_vectors = tegra_pcie_set_msi_vec_num, +}; + +static void tegra_pcie_disable_phy(struct tegra_pcie_dw *pcie) +{ + unsigned int phy_count = pcie->phy_count; + + while (phy_count--) { + phy_power_off(pcie->phys[phy_count]); + phy_exit(pcie->phys[phy_count]); + } +} + +static int tegra_pcie_enable_phy(struct tegra_pcie_dw *pcie) +{ + unsigned int i; + int ret; + + for (i = 0; i < pcie->phy_count; i++) { + ret = phy_init(pcie->phys[i]); + if (ret < 0) + goto phy_power_off; + + ret = phy_power_on(pcie->phys[i]); + if (ret < 0) + goto phy_exit; + } + + return 0; + +phy_power_off: + while (i--) { + phy_power_off(pcie->phys[i]); +phy_exit: + phy_exit(pcie->phys[i]); + } + + return ret; +} + +static int tegra_pcie_dw_parse_dt(struct tegra_pcie_dw *pcie) +{ + struct device_node *np = pcie->dev->of_node; + int ret; + + ret = of_property_read_u32(np, "nvidia,aspm-cmrt-us", &pcie->aspm_cmrt); + if (ret < 0) { + dev_info(pcie->dev, "Failed to read ASPM T_cmrt: %d\n", ret); + return ret; + } + + ret = of_property_read_u32(np, "nvidia,aspm-pwr-on-t-us", + &pcie->aspm_pwr_on_t); + if (ret < 0) + dev_info(pcie->dev, "Failed to read ASPM Power On time: %d\n", + ret); + + ret = of_property_read_u32(np, "nvidia,aspm-l0s-entrance-latency-us", + &pcie->aspm_l0s_enter_lat); + if (ret < 0) + dev_info(pcie->dev, + "Failed to read ASPM L0s Entrance latency: %d\n", ret); + + ret = of_property_read_u32(np, "num-lanes", &pcie->num_lanes); + if (ret < 0) { + dev_err(pcie->dev, "Failed to read num-lanes: %d\n", ret); + return ret; + } + + pcie->max_speed = of_pci_get_max_link_speed(np); + + ret = of_property_read_u32_index(np, "nvidia,bpmp", 1, &pcie->cid); + if (ret) { + dev_err(pcie->dev, "Failed to read Controller-ID: %d\n", ret); + return ret; + } + + ret = of_property_count_strings(np, "phy-names"); + if (ret < 0) { + dev_err(pcie->dev, "Failed to find PHY entries: %d\n", + ret); + return ret; + } + pcie->phy_count = ret; + + if (of_property_read_bool(np, "nvidia,update-fc-fixup")) + pcie->update_fc_fixup = true; + + pcie->supports_clkreq = + of_property_read_bool(pcie->dev->of_node, "supports-clkreq"); + + pcie->enable_cdm_check = + of_property_read_bool(np, "snps,enable-cdm-check"); + + return 0; +} + +static int tegra_pcie_bpmp_set_ctrl_state(struct tegra_pcie_dw *pcie, + bool enable) +{ + struct mrq_uphy_response resp; + struct tegra_bpmp_message msg; + struct mrq_uphy_request req; + + /* Controller-5 doesn't need to have its state set by BPMP-FW */ + if (pcie->cid == 5) + return 0; + + memset(&req, 0, sizeof(req)); + memset(&resp, 0, sizeof(resp)); + + req.cmd = CMD_UPHY_PCIE_CONTROLLER_STATE; + req.controller_state.pcie_controller = pcie->cid; + req.controller_state.enable = enable; + + memset(&msg, 0, sizeof(msg)); + msg.mrq = MRQ_UPHY; + msg.tx.data = &req; + msg.tx.size = sizeof(req); + msg.rx.data = &resp; + msg.rx.size = sizeof(resp); + + return tegra_bpmp_transfer(pcie->bpmp, &msg); +} + +static void tegra_pcie_downstream_dev_to_D0(struct tegra_pcie_dw *pcie) +{ + struct pcie_port *pp = &pcie->pci.pp; + struct pci_bus *child, *root_bus = NULL; + struct pci_dev *pdev; + + /* + * link doesn't go into L2 state with some of the endpoints with Tegra + * if they are not in D0 state. So, need to make sure that immediate + * downstream devices are in D0 state before sending PME_TurnOff to put + * link into L2 state. + * This is as per PCI Express Base r4.0 v1.0 September 27-2017, + * 5.2 Link State Power Management (Page #428). + */ + + list_for_each_entry(child, &pp->root_bus->children, node) { + /* Bring downstream devices to D0 if they are not already in */ + if (child->parent == pp->root_bus) { + root_bus = child; + break; + } + } + + if (!root_bus) { + dev_err(pcie->dev, "Failed to find downstream devices\n"); + return; + } + + list_for_each_entry(pdev, &root_bus->devices, bus_list) { + if (PCI_SLOT(pdev->devfn) == 0) { + if (pci_set_power_state(pdev, PCI_D0)) + dev_err(pcie->dev, + "Failed to transition %s to D0 state\n", + dev_name(&pdev->dev)); + } + } +} + +static int tegra_pcie_config_controller(struct tegra_pcie_dw *pcie, + bool en_hw_hot_rst) +{ + int ret; + u32 val; + + ret = tegra_pcie_bpmp_set_ctrl_state(pcie, true); + if (ret) { + dev_err(pcie->dev, + "Failed to enable controller %u: %d\n", pcie->cid, ret); + return ret; + } + + ret = regulator_enable(pcie->pex_ctl_supply); + if (ret < 0) { + dev_err(pcie->dev, "Failed to enable regulator: %d\n", ret); + goto fail_reg_en; + } + + ret = clk_prepare_enable(pcie->core_clk); + if (ret) { + dev_err(pcie->dev, "Failed to enable core clock: %d\n", ret); + goto fail_core_clk; + } + + ret = reset_control_deassert(pcie->core_apb_rst); + if (ret) { + dev_err(pcie->dev, "Failed to deassert core APB reset: %d\n", + ret); + goto fail_core_apb_rst; + } + + if (en_hw_hot_rst) { + /* Enable HW_HOT_RST mode */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT); + val |= APPL_CTRL_HW_HOT_RST_EN; + appl_writel(pcie, val, APPL_CTRL); + } + + ret = tegra_pcie_enable_phy(pcie); + if (ret) { + dev_err(pcie->dev, "Failed to enable PHY: %d\n", ret); + goto fail_phy; + } + + /* Update CFG base address */ + appl_writel(pcie, pcie->dbi_res->start & APPL_CFG_BASE_ADDR_MASK, + APPL_CFG_BASE_ADDR); + + /* Configure this core for RP mode operation */ + appl_writel(pcie, APPL_DM_TYPE_RP, APPL_DM_TYPE); + + appl_writel(pcie, 0x0, APPL_CFG_SLCG_OVERRIDE); + + val = appl_readl(pcie, APPL_CTRL); + appl_writel(pcie, val | APPL_CTRL_SYS_PRE_DET_STATE, APPL_CTRL); + + val = appl_readl(pcie, APPL_CFG_MISC); + val |= (APPL_CFG_MISC_ARCACHE_VAL << APPL_CFG_MISC_ARCACHE_SHIFT); + appl_writel(pcie, val, APPL_CFG_MISC); + + if (!pcie->supports_clkreq) { + val = appl_readl(pcie, APPL_PINMUX); + val |= APPL_PINMUX_CLKREQ_OUT_OVRD_EN; + val |= APPL_PINMUX_CLKREQ_OUT_OVRD; + appl_writel(pcie, val, APPL_PINMUX); + } + + /* Update iATU_DMA base address */ + appl_writel(pcie, + pcie->atu_dma_res->start & APPL_CFG_IATU_DMA_BASE_ADDR_MASK, + APPL_CFG_IATU_DMA_BASE_ADDR); + + reset_control_deassert(pcie->core_rst); + + pcie->pcie_cap_base = dw_pcie_find_capability(&pcie->pci, + PCI_CAP_ID_EXP); + + /* Disable ASPM-L1SS advertisement as there is no CLKREQ routing */ + if (!pcie->supports_clkreq) { + disable_aspm_l11(pcie); + disable_aspm_l12(pcie); + } + + return ret; + +fail_phy: + reset_control_assert(pcie->core_apb_rst); +fail_core_apb_rst: + clk_disable_unprepare(pcie->core_clk); +fail_core_clk: + regulator_disable(pcie->pex_ctl_supply); +fail_reg_en: + tegra_pcie_bpmp_set_ctrl_state(pcie, false); + + return ret; +} + +static int __deinit_controller(struct tegra_pcie_dw *pcie) +{ + int ret; + + ret = reset_control_assert(pcie->core_rst); + if (ret) { + dev_err(pcie->dev, "Failed to assert \"core\" reset: %d\n", + ret); + return ret; + } + + tegra_pcie_disable_phy(pcie); + + ret = reset_control_assert(pcie->core_apb_rst); + if (ret) { + dev_err(pcie->dev, "Failed to assert APB reset: %d\n", ret); + return ret; + } + + clk_disable_unprepare(pcie->core_clk); + + ret = regulator_disable(pcie->pex_ctl_supply); + if (ret) { + dev_err(pcie->dev, "Failed to disable regulator: %d\n", ret); + return ret; + } + + ret = tegra_pcie_bpmp_set_ctrl_state(pcie, false); + if (ret) { + dev_err(pcie->dev, "Failed to disable controller %d: %d\n", + pcie->cid, ret); + return ret; + } + + return ret; +} + +static int tegra_pcie_init_controller(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + struct pcie_port *pp = &pci->pp; + int ret; + + ret = tegra_pcie_config_controller(pcie, false); + if (ret < 0) + return ret; + + pp->ops = &tegra_pcie_dw_host_ops; + + ret = dw_pcie_host_init(pp); + if (ret < 0) { + dev_err(pcie->dev, "Failed to add PCIe port: %d\n", ret); + goto fail_host_init; + } + + return 0; + +fail_host_init: + return __deinit_controller(pcie); +} + +static int tegra_pcie_try_link_l2(struct tegra_pcie_dw *pcie) +{ + u32 val; + + if (!tegra_pcie_dw_link_up(&pcie->pci)) + return 0; + + val = appl_readl(pcie, APPL_RADM_STATUS); + val |= APPL_PM_XMT_TURNOFF_STATE; + appl_writel(pcie, val, APPL_RADM_STATUS); + + return readl_poll_timeout_atomic(pcie->appl_base + APPL_DEBUG, val, + val & APPL_DEBUG_PM_LINKST_IN_L2_LAT, + 1, PME_ACK_TIMEOUT); +} + +static void tegra_pcie_dw_pme_turnoff(struct tegra_pcie_dw *pcie) +{ + u32 data; + int err; + + if (!tegra_pcie_dw_link_up(&pcie->pci)) { + dev_dbg(pcie->dev, "PCIe link is not up...!\n"); + return; + } + + if (tegra_pcie_try_link_l2(pcie)) { + dev_info(pcie->dev, "Link didn't transition to L2 state\n"); + /* + * TX lane clock freq will reset to Gen1 only if link is in L2 + * or detect state. + * So apply pex_rst to end point to force RP to go into detect + * state + */ + data = appl_readl(pcie, APPL_PINMUX); + data &= ~APPL_PINMUX_PEX_RST; + appl_writel(pcie, data, APPL_PINMUX); + + err = readl_poll_timeout_atomic(pcie->appl_base + APPL_DEBUG, + data, + ((data & + APPL_DEBUG_LTSSM_STATE_MASK) >> + APPL_DEBUG_LTSSM_STATE_SHIFT) == + LTSSM_STATE_PRE_DETECT, + 1, LTSSM_TIMEOUT); + if (err) { + dev_info(pcie->dev, "Link didn't go to detect state\n"); + } else { + /* Disable LTSSM after link is in detect state */ + data = appl_readl(pcie, APPL_CTRL); + data &= ~APPL_CTRL_LTSSM_EN; + appl_writel(pcie, data, APPL_CTRL); + } + } + /* + * DBI registers may not be accessible after this as PLL-E would be + * down depending on how CLKREQ is pulled by end point + */ + data = appl_readl(pcie, APPL_PINMUX); + data |= (APPL_PINMUX_CLKREQ_OVERRIDE_EN | APPL_PINMUX_CLKREQ_OVERRIDE); + /* Cut REFCLK to slot */ + data |= APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE_EN; + data &= ~APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE; + appl_writel(pcie, data, APPL_PINMUX); +} + +static int tegra_pcie_deinit_controller(struct tegra_pcie_dw *pcie) +{ + tegra_pcie_downstream_dev_to_D0(pcie); + dw_pcie_host_deinit(&pcie->pci.pp); + tegra_pcie_dw_pme_turnoff(pcie); + + return __deinit_controller(pcie); +} + +static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie) +{ + struct pcie_port *pp = &pcie->pci.pp; + struct device *dev = pcie->dev; + char *name; + int ret; + + if (IS_ENABLED(CONFIG_PCI_MSI)) { + pp->msi_irq = of_irq_get_byname(dev->of_node, "msi"); + if (!pp->msi_irq) { + dev_err(dev, "Failed to get MSI interrupt\n"); + return -ENODEV; + } + } + + pm_runtime_enable(dev); + + ret = pm_runtime_get_sync(dev); + if (ret < 0) { + dev_err(dev, "Failed to get runtime sync for PCIe dev: %d\n", + ret); + pm_runtime_disable(dev); + return ret; + } + + tegra_pcie_init_controller(pcie); + + pcie->link_state = tegra_pcie_dw_link_up(&pcie->pci); + if (!pcie->link_state) { + ret = -ENOMEDIUM; + goto fail_host_init; + } + + name = devm_kasprintf(dev, GFP_KERNEL, "%pOFP", dev->of_node); + if (!name) { + ret = -ENOMEM; + goto fail_host_init; + } + + pcie->debugfs = debugfs_create_dir(name, NULL); + if (!pcie->debugfs) + dev_err(dev, "Failed to create debugfs\n"); + else + init_debugfs(pcie); + + return ret; + +fail_host_init: + tegra_pcie_deinit_controller(pcie); + pm_runtime_put_sync(dev); + pm_runtime_disable(dev); + return ret; +} + +static int tegra_pcie_dw_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *atu_dma_res; + struct tegra_pcie_dw *pcie; + struct resource *dbi_res; + struct pcie_port *pp; + struct dw_pcie *pci; + struct phy **phys; + char *name; + int ret; + u32 i; + + pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); + if (!pcie) + return -ENOMEM; + + pci = &pcie->pci; + pci->dev = &pdev->dev; + pci->ops = &tegra_dw_pcie_ops; + pp = &pci->pp; + pcie->dev = &pdev->dev; + + ret = tegra_pcie_dw_parse_dt(pcie); + if (ret < 0) { + dev_err(dev, "Failed to parse device tree: %d\n", ret); + return ret; + } + + pcie->pex_ctl_supply = devm_regulator_get(dev, "vddio-pex-ctl"); + if (IS_ERR(pcie->pex_ctl_supply)) { + ret = PTR_ERR(pcie->pex_ctl_supply); + if (ret != -EPROBE_DEFER) + dev_err(dev, "Failed to get regulator: %ld\n", + PTR_ERR(pcie->pex_ctl_supply)); + return ret; + } + + pcie->core_clk = devm_clk_get(dev, "core"); + if (IS_ERR(pcie->core_clk)) { + dev_err(dev, "Failed to get core clock: %ld\n", + PTR_ERR(pcie->core_clk)); + return PTR_ERR(pcie->core_clk); + } + + pcie->appl_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "appl"); + if (!pcie->appl_res) { + dev_err(dev, "Failed to find \"appl\" region\n"); + return -ENODEV; + } + + pcie->appl_base = devm_ioremap_resource(dev, pcie->appl_res); + if (IS_ERR(pcie->appl_base)) + return PTR_ERR(pcie->appl_base); + + pcie->core_apb_rst = devm_reset_control_get(dev, "apb"); + if (IS_ERR(pcie->core_apb_rst)) { + dev_err(dev, "Failed to get APB reset: %ld\n", + PTR_ERR(pcie->core_apb_rst)); + return PTR_ERR(pcie->core_apb_rst); + } + + phys = devm_kcalloc(dev, pcie->phy_count, sizeof(*phys), GFP_KERNEL); + if (!phys) + return -ENOMEM; + + for (i = 0; i < pcie->phy_count; i++) { + name = kasprintf(GFP_KERNEL, "p2u-%u", i); + if (!name) { + dev_err(dev, "Failed to create P2U string\n"); + return -ENOMEM; + } + phys[i] = devm_phy_get(dev, name); + kfree(name); + if (IS_ERR(phys[i])) { + ret = PTR_ERR(phys[i]); + if (ret != -EPROBE_DEFER) + dev_err(dev, "Failed to get PHY: %d\n", ret); + return ret; + } + } + + pcie->phys = phys; + + dbi_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + if (!dbi_res) { + dev_err(dev, "Failed to find \"dbi\" region\n"); + return -ENODEV; + } + pcie->dbi_res = dbi_res; + + pci->dbi_base = devm_ioremap_resource(dev, dbi_res); + if (IS_ERR(pci->dbi_base)) + return PTR_ERR(pci->dbi_base); + + /* Tegra HW locates DBI2 at a fixed offset from DBI */ + pci->dbi_base2 = pci->dbi_base + 0x1000; + + atu_dma_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "atu_dma"); + if (!atu_dma_res) { + dev_err(dev, "Failed to find \"atu_dma\" region\n"); + return -ENODEV; + } + pcie->atu_dma_res = atu_dma_res; + + pci->atu_base = devm_ioremap_resource(dev, atu_dma_res); + if (IS_ERR(pci->atu_base)) + return PTR_ERR(pci->atu_base); + + pcie->core_rst = devm_reset_control_get(dev, "core"); + if (IS_ERR(pcie->core_rst)) { + dev_err(dev, "Failed to get core reset: %ld\n", + PTR_ERR(pcie->core_rst)); + return PTR_ERR(pcie->core_rst); + } + + pp->irq = platform_get_irq_byname(pdev, "intr"); + if (!pp->irq) { + dev_err(dev, "Failed to get \"intr\" interrupt\n"); + return -ENODEV; + } + + ret = devm_request_irq(dev, pp->irq, tegra_pcie_irq_handler, + IRQF_SHARED, "tegra-pcie-intr", pcie); + if (ret) { + dev_err(dev, "Failed to request IRQ %d: %d\n", pp->irq, ret); + return ret; + } + + pcie->bpmp = tegra_bpmp_get(dev); + if (IS_ERR(pcie->bpmp)) + return PTR_ERR(pcie->bpmp); + + platform_set_drvdata(pdev, pcie); + + ret = tegra_pcie_config_rp(pcie); + if (ret && ret != -ENOMEDIUM) + goto fail; + else + return 0; + +fail: + tegra_bpmp_put(pcie->bpmp); + return ret; +} + +static int tegra_pcie_dw_remove(struct platform_device *pdev) +{ + struct tegra_pcie_dw *pcie = platform_get_drvdata(pdev); + + if (!pcie->link_state) + return 0; + + debugfs_remove_recursive(pcie->debugfs); + tegra_pcie_deinit_controller(pcie); + pm_runtime_put_sync(pcie->dev); + pm_runtime_disable(pcie->dev); + tegra_bpmp_put(pcie->bpmp); + + return 0; +} + +static int tegra_pcie_dw_suspend_late(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + u32 val; + + if (!pcie->link_state) + return 0; + + /* Enable HW_HOT_RST mode */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT); + val |= APPL_CTRL_HW_HOT_RST_EN; + appl_writel(pcie, val, APPL_CTRL); + + return 0; +} + +static int tegra_pcie_dw_suspend_noirq(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + + if (!pcie->link_state) + return 0; + + /* Save MSI interrupt vector */ + pcie->msi_ctrl_int = dw_pcie_readl_dbi(&pcie->pci, + PORT_LOGIC_MSI_CTRL_INT_0_EN); + tegra_pcie_downstream_dev_to_D0(pcie); + tegra_pcie_dw_pme_turnoff(pcie); + + return __deinit_controller(pcie); +} + +static int tegra_pcie_dw_resume_noirq(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + int ret; + + if (!pcie->link_state) + return 0; + + ret = tegra_pcie_config_controller(pcie, true); + if (ret < 0) + return ret; + + ret = tegra_pcie_dw_host_init(&pcie->pci.pp); + if (ret < 0) { + dev_err(dev, "Failed to init host: %d\n", ret); + goto fail_host_init; + } + + /* Restore MSI interrupt vector */ + dw_pcie_writel_dbi(&pcie->pci, PORT_LOGIC_MSI_CTRL_INT_0_EN, + pcie->msi_ctrl_int); + + return 0; + +fail_host_init: + return __deinit_controller(pcie); +} + +static int tegra_pcie_dw_resume_early(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + u32 val; + + if (!pcie->link_state) + return 0; + + /* Disable HW_HOT_RST mode */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT); + val |= APPL_CTRL_HW_HOT_RST_MODE_IMDT_RST << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT; + val &= ~APPL_CTRL_HW_HOT_RST_EN; + appl_writel(pcie, val, APPL_CTRL); + + return 0; +} + +static void tegra_pcie_dw_shutdown(struct platform_device *pdev) +{ + struct tegra_pcie_dw *pcie = platform_get_drvdata(pdev); + + if (!pcie->link_state) + return; + + debugfs_remove_recursive(pcie->debugfs); + tegra_pcie_downstream_dev_to_D0(pcie); + + disable_irq(pcie->pci.pp.irq); + if (IS_ENABLED(CONFIG_PCI_MSI)) + disable_irq(pcie->pci.pp.msi_irq); + + tegra_pcie_dw_pme_turnoff(pcie); + __deinit_controller(pcie); +} + +static const struct of_device_id tegra_pcie_dw_of_match[] = { + { + .compatible = "nvidia,tegra194-pcie", + }, + {}, +}; + +static const struct dev_pm_ops tegra_pcie_dw_pm_ops = { + .suspend_late = tegra_pcie_dw_suspend_late, + .suspend_noirq = tegra_pcie_dw_suspend_noirq, + .resume_noirq = tegra_pcie_dw_resume_noirq, + .resume_early = tegra_pcie_dw_resume_early, +}; + +static struct platform_driver tegra_pcie_dw_driver = { + .probe = tegra_pcie_dw_probe, + .remove = tegra_pcie_dw_remove, + .shutdown = tegra_pcie_dw_shutdown, + .driver = { + .name = "tegra194-pcie", + .pm = &tegra_pcie_dw_pm_ops, + .of_match_table = tegra_pcie_dw_of_match, + }, +}; +module_platform_driver(tegra_pcie_dw_driver); + +MODULE_DEVICE_TABLE(of, tegra_pcie_dw_of_match); + +MODULE_AUTHOR("Vidya Sagar "); +MODULE_DESCRIPTION("NVIDIA PCIe host controller driver"); +MODULE_LICENSE("GPL v2"); From 151481ef5e360af4ca68835943a036339d3c7826 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:48 +0530 Subject: [PATCH 0373/2880] dt-bindings: PCI: tegra: Add sideband pins configuration entries Add optional bindings "pinctrl-names" and "pinctrl-0" to describe pin configuration information of a particular PCIe controller. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Reviewed-by: Rob Herring Acked-by: Thierry Reding --- .../devicetree/bindings/pci/nvidia,tegra194-pcie.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt index 674e5adb2895..0ac1b867ac24 100644 --- a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt +++ b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt @@ -83,6 +83,11 @@ Required properties: - vddio-pex-ctl-supply: Regulator supply for PCIe side band signals Optional properties: +- pinctrl-names: A list of pinctrl state names. + It is mandatory for C5 controller and optional for other controllers. + - "default": Configures PCIe I/O for proper operation. +- pinctrl-0: phandle for the 'default' state of pin configuration. + It is mandatory for C5 controller and optional for other controllers. - supports-clkreq: Refer to Documentation/devicetree/bindings/pci/pci.txt - nvidia,update-fc-fixup: This is a boolean property and needs to be present to improve performance when a platform is designed in such a way that it @@ -120,6 +125,9 @@ Tegra194: num-lanes = <8>; linux,pci-domain = <0>; + pinctrl-names = "default"; + pinctrl-0 = <&pex_rst_c5_out_state>, <&clkreq_c5_bi_dir_state>; + clocks = <&bpmp TEGRA194_CLK_PEX0_CORE_0>; clock-names = "core"; From 7ed106d8fdfa4e6cbf9226f49ce921a681c9f240 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:49 +0530 Subject: [PATCH 0374/2880] dt-bindings: PCI: tegra: Add PCIe slot supplies regulator entries Add optional bindings "vpcie3v3-supply" and "vpcie12v-supply" to describe regulators of a PCIe slot's supplies 3.3V and 12V provided the platform is designed to have regulator controlled slot supplies. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Reviewed-by: Rob Herring Acked-by: Thierry Reding --- .../devicetree/bindings/pci/nvidia,tegra194-pcie.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt index 0ac1b867ac24..b739f92da58e 100644 --- a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt +++ b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt @@ -104,6 +104,12 @@ Optional properties: specified in microseconds - nvidia,aspm-l0s-entrance-latency-us: ASPM L0s entrance latency to be specified in microseconds +- vpcie3v3-supply: A phandle to the regulator node that supplies 3.3V to the slot + if the platform has one such slot. (Ex:- x16 slot owned by C5 controller + in p2972-0000 platform). +- vpcie12v-supply: A phandle to the regulator node that supplies 12V to the slot + if the platform has one such slot. (Ex:- x16 slot owned by C5 controller + in p2972-0000 platform). Examples: ========= @@ -156,6 +162,8 @@ Tegra194: 0xc2000000 0x18 0x00000000 0x18 0x00000000 0x4 0x00000000>; /* prefetchable memory (16GB) */ vddio-pex-ctl-supply = <&vdd_1v8ao>; + vpcie3v3-supply = <&vdd_3v3_pcie>; + vpcie12v-supply = <&vdd_12v_pcie>; phys = <&p2u_hsio_2>, <&p2u_hsio_3>, <&p2u_hsio_4>, <&p2u_hsio_5>; From 6c59a962e081df6d8fe43325bbfabec57e0d4751 Mon Sep 17 00:00:00 2001 From: Pascal Bouwmann Date: Thu, 29 Aug 2019 07:29:41 +0200 Subject: [PATCH 0375/2880] iio: fix center temperature of bmc150-accel-core The center temperature of the supported devices stored in the constant BMC150_ACCEL_TEMP_CENTER_VAL is not 24 degrees but 23 degrees. It seems that some datasheets were inconsistent on this value leading to the error. For most usecases will only make minor difference so not queued for stable. Signed-off-by: Pascal Bouwmann Signed-off-by: Jonathan Cameron --- drivers/iio/accel/bmc150-accel-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c index cf6c0e3a83d3..121b4e89f038 100644 --- a/drivers/iio/accel/bmc150-accel-core.c +++ b/drivers/iio/accel/bmc150-accel-core.c @@ -117,7 +117,7 @@ #define BMC150_ACCEL_SLEEP_1_SEC 0x0F #define BMC150_ACCEL_REG_TEMP 0x08 -#define BMC150_ACCEL_TEMP_CENTER_VAL 24 +#define BMC150_ACCEL_TEMP_CENTER_VAL 23 #define BMC150_ACCEL_AXIS_TO_REG(axis) (BMC150_ACCEL_REG_XOUT_L + (axis * 2)) #define BMC150_AUTO_SUSPEND_DELAY_MS 2000 From 9c426b770bd088f18899f836093d810a83b59b98 Mon Sep 17 00:00:00 2001 From: Talel Shenhar Date: Mon, 9 Sep 2019 11:39:18 +0300 Subject: [PATCH 0376/2880] irqchip/al-fic: Add support for irq retrigger Introduce interrupts retrigger support for Amazon's Annapurna Labs Fabric Interrupt Controller. Signed-off-by: Talel Shenhar Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1568018358-18985-1-git-send-email-talel@amazon.com --- drivers/irqchip/irq-al-fic.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/irqchip/irq-al-fic.c b/drivers/irqchip/irq-al-fic.c index 1a57cee3efab..0b0a73739756 100644 --- a/drivers/irqchip/irq-al-fic.c +++ b/drivers/irqchip/irq-al-fic.c @@ -15,6 +15,7 @@ /* FIC Registers */ #define AL_FIC_CAUSE 0x00 +#define AL_FIC_SET_CAUSE 0x08 #define AL_FIC_MASK 0x10 #define AL_FIC_CONTROL 0x28 @@ -126,6 +127,16 @@ static void al_fic_irq_handler(struct irq_desc *desc) chained_irq_exit(irqchip, desc); } +static int al_fic_irq_retrigger(struct irq_data *data) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); + struct al_fic *fic = gc->private; + + writel_relaxed(BIT(data->hwirq), fic->base + AL_FIC_SET_CAUSE); + + return 1; +} + static int al_fic_register(struct device_node *node, struct al_fic *fic) { @@ -159,6 +170,7 @@ static int al_fic_register(struct device_node *node, gc->chip_types->chip.irq_unmask = irq_gc_mask_clr_bit; gc->chip_types->chip.irq_ack = irq_gc_ack_clr_bit; gc->chip_types->chip.irq_set_type = al_fic_irq_set_type; + gc->chip_types->chip.irq_retrigger = al_fic_irq_retrigger; gc->chip_types->chip.flags = IRQCHIP_SKIP_SET_WAKE; gc->private = fic; From 212fbf2c9e84ceb267cadd8342156b69b54b8135 Mon Sep 17 00:00:00 2001 From: Sandeep Sheriker Mallikarjun Date: Mon, 9 Sep 2019 14:00:35 +0300 Subject: [PATCH 0377/2880] irqchip/atmel-aic5: Add support for sam9x60 irqchip Add support for SAM9X60 irqchip. Signed-off-by: Sandeep Sheriker Mallikarjun Signed-off-by: Claudiu Beznea Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1568026835-6646-1-git-send-email-claudiu.beznea@microchip.com [claudiu.beznea@microchip.com: update aic5_irq_fixups[], update documentation] --- .../bindings/interrupt-controller/atmel,aic.txt | 7 +++++-- drivers/irqchip/irq-atmel-aic5.c | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/atmel,aic.txt b/Documentation/devicetree/bindings/interrupt-controller/atmel,aic.txt index f4c5d34c4111..7079d44bf3ba 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/atmel,aic.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/atmel,aic.txt @@ -1,8 +1,11 @@ * Advanced Interrupt Controller (AIC) Required properties: -- compatible: Should be "atmel,-aic" - can be "at91rm9200", "sama5d2", "sama5d3" or "sama5d4" +- compatible: Should be: + - "atmel,-aic" where can be "at91rm9200", "sama5d2", + "sama5d3" or "sama5d4" + - "microchip,-aic" where can be "sam9x60" + - interrupt-controller: Identifies the node as an interrupt controller. - #interrupt-cells: The number of cells to define the interrupts. It should be 3. The first cell is the IRQ number (aka "Peripheral IDentifier" on datasheet). diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index 6acad2ea0fb3..29333497ba10 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -313,6 +313,7 @@ static void __init sama5d3_aic_irq_fixup(void) static const struct of_device_id aic5_irq_fixups[] __initconst = { { .compatible = "atmel,sama5d3", .data = sama5d3_aic_irq_fixup }, { .compatible = "atmel,sama5d4", .data = sama5d3_aic_irq_fixup }, + { .compatible = "microchip,sam9x60", .data = sama5d3_aic_irq_fixup }, { /* sentinel */ }, }; @@ -390,3 +391,12 @@ static int __init sama5d4_aic5_of_init(struct device_node *node, return aic5_of_init(node, parent, NR_SAMA5D4_IRQS); } IRQCHIP_DECLARE(sama5d4_aic5, "atmel,sama5d4-aic", sama5d4_aic5_of_init); + +#define NR_SAM9X60_IRQS 50 + +static int __init sam9x60_aic5_of_init(struct device_node *node, + struct device_node *parent) +{ + return aic5_of_init(node, parent, NR_SAM9X60_IRQS); +} +IRQCHIP_DECLARE(sam9x60_aic5, "microchip,sam9x60-aic", sam9x60_aic5_of_init); From b6749e20d5710c955fc6d4322f8fd98c915b2573 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 1 Sep 2019 22:12:35 +0100 Subject: [PATCH 0378/2880] arm64: KVM: Drop hyp_alternate_select for checking for ARM64_WORKAROUND_834220 There is no reason for using hyp_alternate_select when checking for ARM64_WORKAROUND_834220, as each of the capabilities is also backed by a static key. Just replace the KVM-specific construct with cpus_have_const_cap(ARM64_WORKAROUND_834220). Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Reviewed-by: Andrew Jones --- arch/arm64/kvm/hyp/switch.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index adaf266d8de8..a15baca9aca0 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -229,20 +229,6 @@ static void __hyp_text __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) } } -static bool __hyp_text __true_value(void) -{ - return true; -} - -static bool __hyp_text __false_value(void) -{ - return false; -} - -static hyp_alternate_select(__check_arm_834220, - __false_value, __true_value, - ARM64_WORKAROUND_834220); - static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar) { u64 par, tmp; @@ -298,7 +284,8 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) * resolve the IPA using the AT instruction. */ if (!(esr & ESR_ELx_S1PTW) && - (__check_arm_834220()() || (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { + (cpus_have_const_cap(ARM64_WORKAROUND_834220) || + (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { if (!__translate_far_to_hpfar(far, &hpfar)) return false; } else { From aa979fa899c5cf592e63ce7b34bc767224225c6a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 1 Sep 2019 22:12:36 +0100 Subject: [PATCH 0379/2880] arm64: KVM: Replace hyp_alternate_select with has_vhe() Given that the TLB invalidation path is pretty rarely used, there was never any advantage to using hyp_alternate_select() here. has_vhe(), being a glorified static key, is the right tool for the job. Off you go. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Reviewed-by: Andrew Jones --- arch/arm64/kvm/hyp/tlb.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index c466060b76d6..eb0efc5557f3 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -67,10 +67,14 @@ static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm, isb(); } -static hyp_alternate_select(__tlb_switch_to_guest, - __tlb_switch_to_guest_nvhe, - __tlb_switch_to_guest_vhe, - ARM64_HAS_VIRT_HOST_EXTN); +static void __hyp_text __tlb_switch_to_guest(struct kvm *kvm, + struct tlb_inv_context *cxt) +{ + if (has_vhe()) + __tlb_switch_to_guest_vhe(kvm, cxt); + else + __tlb_switch_to_guest_nvhe(kvm, cxt); +} static void __hyp_text __tlb_switch_to_host_vhe(struct kvm *kvm, struct tlb_inv_context *cxt) @@ -98,10 +102,14 @@ static void __hyp_text __tlb_switch_to_host_nvhe(struct kvm *kvm, write_sysreg(0, vttbr_el2); } -static hyp_alternate_select(__tlb_switch_to_host, - __tlb_switch_to_host_nvhe, - __tlb_switch_to_host_vhe, - ARM64_HAS_VIRT_HOST_EXTN); +static void __hyp_text __tlb_switch_to_host(struct kvm *kvm, + struct tlb_inv_context *cxt) +{ + if (has_vhe()) + __tlb_switch_to_host_vhe(kvm, cxt); + else + __tlb_switch_to_host_nvhe(kvm, cxt); +} void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { @@ -111,7 +119,7 @@ void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) /* Switch to requested VMID */ kvm = kern_hyp_va(kvm); - __tlb_switch_to_guest()(kvm, &cxt); + __tlb_switch_to_guest(kvm, &cxt); /* * We could do so much better if we had the VA as well. @@ -154,7 +162,7 @@ void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) if (!has_vhe() && icache_is_vpipt()) __flush_icache_all(); - __tlb_switch_to_host()(kvm, &cxt); + __tlb_switch_to_host(kvm, &cxt); } void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) @@ -165,13 +173,13 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) /* Switch to requested VMID */ kvm = kern_hyp_va(kvm); - __tlb_switch_to_guest()(kvm, &cxt); + __tlb_switch_to_guest(kvm, &cxt); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host()(kvm, &cxt); + __tlb_switch_to_host(kvm, &cxt); } void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) @@ -180,13 +188,13 @@ void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest()(kvm, &cxt); + __tlb_switch_to_guest(kvm, &cxt); __tlbi(vmalle1); dsb(nsh); isb(); - __tlb_switch_to_host()(kvm, &cxt); + __tlb_switch_to_host(kvm, &cxt); } void __hyp_text __kvm_flush_vm_context(void) From 084b5a80e87258997632dbd975ec3e5fda6bb943 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 1 Sep 2019 22:12:37 +0100 Subject: [PATCH 0380/2880] arm64: KVM: Kill hyp_alternate_select() hyp_alternate_select() is now completely unused. Goodbye. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Reviewed-by: Andrew Jones --- arch/arm64/include/asm/kvm_hyp.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 86825aa20852..97f21cc66657 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -47,30 +47,6 @@ #define read_sysreg_el2(r) read_sysreg_elx(r, _EL2, _EL1) #define write_sysreg_el2(v,r) write_sysreg_elx(v, r, _EL2, _EL1) -/** - * hyp_alternate_select - Generates patchable code sequences that are - * used to switch between two implementations of a function, depending - * on the availability of a feature. - * - * @fname: a symbol name that will be defined as a function returning a - * function pointer whose type will match @orig and @alt - * @orig: A pointer to the default function, as returned by @fname when - * @cond doesn't hold - * @alt: A pointer to the alternate function, as returned by @fname - * when @cond holds - * @cond: a CPU feature (as described in asm/cpufeature.h) - */ -#define hyp_alternate_select(fname, orig, alt, cond) \ -typeof(orig) * __hyp_text fname(void) \ -{ \ - typeof(alt) *val = orig; \ - asm volatile(ALTERNATIVE("nop \n", \ - "mov %0, %1 \n", \ - cond) \ - : "+r" (val) : "r" (alt)); \ - return val; \ -} - int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); void __vgic_v3_save_state(struct kvm_vcpu *vcpu); From f73f8a504e27959576a2f4d85182202561e426f2 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Thu, 15 Aug 2019 17:01:45 +0000 Subject: [PATCH 0381/2880] PCI: hv: Use bytes 4 and 5 from instance ID as the PCI domain numbers As recommended by Azure host team, the bytes 4, 5 have more uniqueness (info entropy) than bytes 8, 9 so use them as the PCI domain numbers. On older hosts, bytes 4, 5 can also be used -- no backward compatibility issues are introduced and the chance of collision is greatly reduced. In the rare cases of collision, the driver code detects and finds another number that is not in use. Suggested-by: Michael Kelley Signed-off-by: Haiyang Zhang Signed-off-by: Lorenzo Pieralisi Acked-by: Sasha Levin --- drivers/pci/controller/pci-hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 4caa3388692a..3a56de6b2ec2 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2590,7 +2590,7 @@ static int hv_pci_probe(struct hv_device *hdev, * (2) There will be no overlap between domains (after fixing possible * collisions) in the same VM. */ - dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9]; + dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; dom = hv_get_dom_num(dom_req); if (dom == HVPCI_DOM_INVALID) { From f8a9bc623a6d178f7ecd40fb7db37eb954b6929c Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 10 Sep 2019 03:03:17 -0700 Subject: [PATCH 0382/2880] security: constify some arrays in lockdown LSM No reason for these not to be const. Signed-off-by: Matthew Garrett Suggested-by: David Howells Signed-off-by: James Morris --- security/lockdown/lockdown.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 0068cec77c05..8a10b43daf74 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -16,7 +16,7 @@ static enum lockdown_reason kernel_locked_down; -static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { +static const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", @@ -40,7 +40,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; -static enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, +static const enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX}; From 45893a0abee6b5fd52994a3a1095735aeaec472b Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 10 Sep 2019 03:03:18 -0700 Subject: [PATCH 0383/2880] kexec: Fix file verification on S390 I accidentally typoed this #ifdef, so verification would always be disabled. Signed-off-by: Matthew Garrett Reported-by: Philipp Rudo Signed-off-by: James Morris --- arch/s390/kernel/kexec_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 9b4f37a4edf1..9da6fa30c447 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -130,7 +130,7 @@ static int s390_elf_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_elf_ops = { .probe = s390_elf_probe, .load = s390_elf_load, -#ifdef CONFIG_KEXEC__SIG +#ifdef CONFIG_KEXEC_SIG .verify_sig = s390_verify_sig, #endif /* CONFIG_KEXEC_SIG */ }; From 5e113224c17e2fb156b785ddbbc48a0209fddb0c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Sep 2019 13:02:55 -0400 Subject: [PATCH 0384/2880] nfsd: nfsd_file cache entries should be per net namespace Ensure that we can safely clear out the file cache entries when the nfs server is shut down on a container. Otherwise, the file cache may end up pinning the mounts. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 2 +- fs/nfsd/filecache.c | 33 +++++++++++++++++++++------------ fs/nfsd/filecache.h | 3 ++- fs/nfsd/nfssvc.c | 1 + 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 052fac64b578..15422c951fd1 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -240,7 +240,7 @@ static void expkey_flush(void) * destroyed while we're in the middle of flushing. */ mutex_lock(&nfsd_mutex); - nfsd_file_cache_purge(); + nfsd_file_cache_purge(current->nsproxy->net_ns); mutex_unlock(&nfsd_mutex); } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2e1a972231e5..da9e790a055e 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -16,6 +16,7 @@ #include "vfs.h" #include "nfsd.h" #include "nfsfh.h" +#include "netns.h" #include "filecache.h" #include "trace.h" @@ -167,7 +168,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf) } static struct nfsd_file * -nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval) +nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, + struct net *net) { struct nfsd_file *nf; @@ -177,6 +179,7 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval) INIT_LIST_HEAD(&nf->nf_lru); nf->nf_file = NULL; nf->nf_cred = get_current_cred(); + nf->nf_net = net; nf->nf_flags = 0; nf->nf_inode = inode; nf->nf_hashval = hashval; @@ -607,10 +610,11 @@ out_err: * Note this can deadlock with nfsd_file_lru_cb. */ void -nfsd_file_cache_purge(void) +nfsd_file_cache_purge(struct net *net) { unsigned int i; struct nfsd_file *nf; + struct hlist_node *next; LIST_HEAD(dispose); bool del; @@ -618,10 +622,12 @@ nfsd_file_cache_purge(void) return; for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { - spin_lock(&nfsd_file_hashtbl[i].nfb_lock); - while(!hlist_empty(&nfsd_file_hashtbl[i].nfb_head)) { - nf = hlist_entry(nfsd_file_hashtbl[i].nfb_head.first, - struct nfsd_file, nf_node); + struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i]; + + spin_lock(&nfb->nfb_lock); + hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) { + if (net && nf->nf_net != net) + continue; del = nfsd_file_unhash_and_release_locked(nf, &dispose); /* @@ -630,7 +636,7 @@ nfsd_file_cache_purge(void) */ WARN_ON_ONCE(!del); } - spin_unlock(&nfsd_file_hashtbl[i].nfb_lock); + spin_unlock(&nfb->nfb_lock); nfsd_file_dispose_list(&dispose); } } @@ -649,7 +655,7 @@ nfsd_file_cache_shutdown(void) * calling nfsd_file_cache_purge */ cancel_delayed_work_sync(&nfsd_filecache_laundrette); - nfsd_file_cache_purge(); + nfsd_file_cache_purge(NULL); list_lru_destroy(&nfsd_file_lru); rcu_barrier(); fsnotify_put_group(nfsd_file_fsnotify_group); @@ -685,7 +691,7 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2) static struct nfsd_file * nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, - unsigned int hashval) + unsigned int hashval, struct net *net) { struct nfsd_file *nf; unsigned char need = may_flags & NFSD_FILE_MAY_MASK; @@ -696,6 +702,8 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, continue; if (nf->nf_inode != inode) continue; + if (nf->nf_net != net) + continue; if (!nfsd_match_cred(nf->nf_cred, current_cred())) continue; if (nfsd_file_get(nf) != NULL) @@ -738,6 +746,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { __be32 status; + struct net *net = SVC_NET(rqstp); struct nfsd_file *nf, *new; struct inode *inode; unsigned int hashval; @@ -752,12 +761,12 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); retry: rcu_read_lock(); - nf = nfsd_file_find_locked(inode, may_flags, hashval); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); rcu_read_unlock(); if (nf) goto wait_for_construction; - new = nfsd_file_alloc(inode, may_flags, hashval); + new = nfsd_file_alloc(inode, may_flags, hashval, net); if (!new) { trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, NULL, nfserr_jukebox); @@ -765,7 +774,7 @@ retry: } spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); - nf = nfsd_file_find_locked(inode, may_flags, hashval); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); if (nf == NULL) goto open_file; spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 0c0c67166b87..851d9abf54c2 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -34,6 +34,7 @@ struct nfsd_file { struct rcu_head nf_rcu; struct file *nf_file; const struct cred *nf_cred; + struct net *nf_net; #define NFSD_FILE_HASHED (0) #define NFSD_FILE_PENDING (1) #define NFSD_FILE_BREAK_READ (2) @@ -48,7 +49,7 @@ struct nfsd_file { }; int nfsd_file_cache_init(void); -void nfsd_file_cache_purge(void); +void nfsd_file_cache_purge(struct net *); void nfsd_file_cache_shutdown(void); void nfsd_file_put(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d02712ca2685..b944553c6927 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -387,6 +387,7 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + nfsd_file_cache_purge(net); nfs4_state_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); From 27c438f53e79b81dc8805a81f6cd74824ba57290 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Sep 2019 13:02:56 -0400 Subject: [PATCH 0385/2880] nfsd: Support the server resetting the boot verifier Add support to allow the server to reset the boot verifier in order to force clients to resend I/O after a timeout failure. Signed-off-by: Trond Myklebust Signed-off-by: Lance Shelton Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 4 ++++ fs/nfsd/nfs3xdr.c | 13 +++++++++---- fs/nfsd/nfs4proc.c | 14 ++++---------- fs/nfsd/nfsctl.c | 1 + fs/nfsd/nfssvc.c | 31 ++++++++++++++++++++++++++++++- 5 files changed, 48 insertions(+), 15 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index bdfe5bcb3dcd..9a4ef815fb8c 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -104,6 +104,7 @@ struct nfsd_net { /* Time of server startup */ struct timespec64 nfssvc_boot; + seqlock_t boot_lock; /* * Max number of connections this nfsd container will allow. Defaults @@ -179,4 +180,7 @@ struct nfsd_net { extern void nfsd_netns_free_versions(struct nfsd_net *nn); extern unsigned int nfsd_net_id; + +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn); +void nfsd_reset_boot_verifier(struct nfsd_net *nn); #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index fcf31822c74c..86e5658651f1 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -27,6 +27,7 @@ static u32 nfs3_ftypes[] = { NF3SOCK, NF3BAD, NF3LNK, NF3BAD, }; + /* * XDR functions for basic NFS types */ @@ -751,14 +752,16 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_writeres *resp = rqstp->rq_resp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 verf[2]; p = encode_wcc_data(rqstp, p, &resp->fh); if (resp->status == 0) { *p++ = htonl(resp->count); *p++ = htonl(resp->committed); /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_nsec); + nfsd_copy_boot_verifier(verf, nn); + *p++ = verf[0]; + *p++ = verf[1]; } return xdr_ressize_check(rqstp, p); } @@ -1125,13 +1128,15 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_commitres *resp = rqstp->rq_resp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 verf[2]; p = encode_wcc_data(rqstp, p, &resp->fh); /* Write verifier */ if (resp->status == 0) { /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_nsec); + nfsd_copy_boot_verifier(verf, nn); + *p++ = verf[0]; + *p++ = verf[1]; } return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index cb51893ec1cd..4e3e77b76411 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -568,17 +568,11 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) { - __be32 verf[2]; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + __be32 *verf = (__be32 *)verifier->data; - /* - * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy. y2038 time_t overflow is - * irrelevant in this usage. - */ - verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; - memcpy(verifier->data, verf, sizeof(verifier->data)); + BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data)); + + nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id)); } static __be32 diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3cf4f6aa48d6..33cbe2e5d937 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1477,6 +1477,7 @@ static __net_init int nfsd_init_net(struct net *net) atomic_set(&nn->ntf_refcnt, 0); init_waitqueue_head(&nn->ntf_wq); + seqlock_init(&nn->boot_lock); mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL); if (IS_ERR(mnt)) { diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b944553c6927..3caaf5675259 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -344,6 +344,35 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) +{ + int seq = 0; + + do { + read_seqbegin_or_lock(&nn->boot_lock, &seq); + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy. y2038 time_t overflow is + * irrelevant in this usage + */ + verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; + } while (need_seqretry(&nn->boot_lock, seq)); + done_seqretry(&nn->boot_lock, seq); +} + +void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +{ + ktime_get_real_ts64(&nn->nfssvc_boot); +} + +void nfsd_reset_boot_verifier(struct nfsd_net *nn) +{ + write_seqlock(&nn->boot_lock); + nfsd_reset_boot_verifier_locked(nn); + write_sequnlock(&nn->boot_lock); +} + static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -596,7 +625,7 @@ int nfsd_create_serv(struct net *net) #endif } atomic_inc(&nn->ntf_refcnt); - ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */ + nfsd_reset_boot_verifier(nn); return 0; } From 055b24a8f23090043b2ce0c30f03e12c5f2c9e72 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Sep 2019 13:02:57 -0400 Subject: [PATCH 0386/2880] nfsd: Don't garbage collect files that might contain write errors If a file may contain unstable writes that can error out, then we want to avoid garbage collecting the struct nfsd_file that may be tracking those errors. So in the garbage collector, we try to avoid collecting files that aren't clean. Furthermore, we avoid immediately kicking off the garbage collector in the case where the reference drops to zero for the case where there is a write error that is being tracked. If the file is unhashed while an error is pending, then declare a reboot, to ensure the client resends any unstable writes. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/filecache.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index da9e790a055e..ef55e9b1cd4e 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -215,6 +215,36 @@ nfsd_file_free(struct nfsd_file *nf) return flush; } +static bool +nfsd_file_check_writeback(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + struct address_space *mapping; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return false; + mapping = file->f_mapping; + return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); +} + +static int +nfsd_file_check_write_error(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return 0; + return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); +} + +static bool +nfsd_file_in_use(struct nfsd_file *nf) +{ + return nfsd_file_check_writeback(nf) || + nfsd_file_check_write_error(nf); +} + static void nfsd_file_do_unhash(struct nfsd_file *nf) { @@ -222,6 +252,8 @@ nfsd_file_do_unhash(struct nfsd_file *nf) trace_nfsd_file_unhash(nf); + if (nfsd_file_check_write_error(nf)) + nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; hlist_del_rcu(&nf->nf_node); if (!list_empty(&nf->nf_lru)) @@ -276,9 +308,10 @@ void nfsd_file_put(struct nfsd_file *nf) { bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; + bool unused = !nfsd_file_in_use(nf); set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (nfsd_file_put_noref(nf) == 1 && is_hashed) + if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused) nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH); } @@ -344,6 +377,14 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, */ if (atomic_read(&nf->nf_ref) > 1) goto out_skip; + + /* + * Don't throw out files that are still undergoing I/O or + * that have uncleared errors pending. + */ + if (nfsd_file_check_writeback(nf)) + goto out_skip; + if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) goto out_rescan; From bbf2f098838aa86cee240a7a11486e0601d56a3f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Sep 2019 13:02:58 -0400 Subject: [PATCH 0387/2880] nfsd: Reset the boot verifier on all write I/O errors If multiple clients are writing to the same file, then due to the fact we share a single file descriptor between all NFSv3 clients writing to the file, we have a situation where clients can miss the fact that their file data was not persisted. While this should be rare, it could cause silent data loss in situations where multiple clients are using NLM locking or O_DIRECT to write to the same file. Unfortunately, the stateless nature of NFSv3 and the fact that we can only identify clients by their IP address means that we cannot trivially cache errors; we would not know when it is safe to release them from the cache. So the solution is to declare a reboot. We understand that this should be a rare occurrence, since disks are usually stable. The most frequent occurrence is likely to be ENOSPC, at which point all writes to the given filesystem are likely to fail anyway. So the expectation is that clients will be forced to retry their writes until they hit the fatal error. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 84e87772c2b8..0867d5319fdb 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -958,8 +958,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_write += *cnt; fsnotify_modify(file); - if (stable && use_wgather) + if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); + if (host_err < 0) + nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), + nfsd_net_id)); + } out_nfserr: if (host_err >= 0) { @@ -1063,10 +1067,17 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if (EX_ISSYNC(fhp->fh_export)) { int err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); - if (err2 != -EINVAL) - err = nfserrno(err2); - else + switch (err2) { + case 0: + break; + case -EINVAL: err = nfserr_notsupp; + break; + default: + err = nfserrno(err2); + nfsd_reset_boot_verifier(net_generic(nf->nf_net, + nfsd_net_id)); + } } nfsd_file_put(nf); From 11a60d159259dbadf9188534b508e5003769af61 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Mon, 9 Sep 2019 16:10:30 -0400 Subject: [PATCH 0388/2880] nfsd: add a "GetVersion" upcall for nfsdcld Add a "GetVersion" upcall to allow nfsd to determine the maximum upcall version that the nfsdcld userspace daemon supports. If the daemon responds with -EOPNOTSUPP, then we know it only supports v1. Signed-off-by: Scott Mayhew Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 167 ++++++++++++++++++++++++---------- include/uapi/linux/nfsd/cld.h | 11 ++- 2 files changed, 127 insertions(+), 51 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 87679557d0d6..58a61339d40c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -59,8 +59,12 @@ struct nfsd4_client_tracking_ops { void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); void (*grace_done)(struct nfsd_net *); + uint8_t version; + size_t msglen; }; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; + /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; @@ -718,6 +722,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .remove = nfsd4_remove_clid_dir, .check = nfsd4_check_legacy_client, .grace_done = nfsd4_recdir_purge_old, + .version = 1, + .msglen = 0, }; /* Globals */ @@ -737,19 +743,24 @@ struct cld_upcall { struct list_head cu_list; struct cld_net *cu_net; struct completion cu_done; - struct cld_msg cu_msg; + union { + struct cld_msg_hdr cu_hdr; + struct cld_msg cu_msg; + } cu_u; }; static int -__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) { int ret; struct rpc_pipe_msg msg; - struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg); + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); + struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info, + nfsd_net_id); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; - msg.len = sizeof(*cmsg); + msg.len = nn->client_tracking_ops->msglen; ret = rpc_queue_upcall(pipe, &msg); if (ret < 0) { @@ -765,7 +776,7 @@ out: } static int -cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) { int ret; @@ -809,7 +820,7 @@ __cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, kfree(name.data); return -EFAULT; } - return sizeof(*cmsg); + return nn->client_tracking_ops->msglen; } return -EFAULT; } @@ -818,6 +829,7 @@ static ssize_t cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; + struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; struct cld_msg __user *cmsg = (struct cld_msg __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, @@ -825,14 +837,14 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) struct cld_net *cn = nn->cld_net; int16_t status; - if (mlen != sizeof(*cmsg)) { + if (mlen != nn->client_tracking_ops->msglen) { dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen, - sizeof(*cmsg)); + nn->client_tracking_ops->msglen); return -EINVAL; } /* copy just the xid so we can try to find that */ - if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) { + if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) { dprintk("%s: error when copying xid from userspace", __func__); return -EFAULT; } @@ -842,7 +854,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) * list (for -EINPROGRESS, we just want to make sure the xid is * valid, not remove the upcall from the list) */ - if (get_user(status, &cmsg->cm_status)) { + if (get_user(status, &hdr->cm_status)) { dprintk("%s: error when copying status from userspace", __func__); return -EFAULT; } @@ -851,7 +863,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) cup = NULL; spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) { + if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) { cup = tmp; if (status != -EINPROGRESS) list_del_init(&cup->cu_list); @@ -869,7 +881,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (status == -EINPROGRESS) return __cld_pipe_inprogress_downcall(cmsg, nn); - if (copy_from_user(&cup->cu_msg, src, mlen) != 0) + if (copy_from_user(&cup->cu_u.cu_msg, src, mlen) != 0) return -EFAULT; complete(&cup->cu_done); @@ -881,7 +893,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct cld_msg *cmsg = msg->data; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, - cu_msg); + cu_u.cu_msg); /* errno >= 0 means we got a downcall */ if (msg->errno >= 0) @@ -1012,9 +1024,10 @@ nfsd4_remove_cld_pipe(struct net *net) } static struct cld_upcall * -alloc_cld_upcall(struct cld_net *cn) +alloc_cld_upcall(struct nfsd_net *nn) { struct cld_upcall *new, *tmp; + struct cld_net *cn = nn->cld_net; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) @@ -1024,20 +1037,20 @@ alloc_cld_upcall(struct cld_net *cn) restart_search: spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (tmp->cu_msg.cm_xid == cn->cn_xid) { + if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) { cn->cn_xid++; spin_unlock(&cn->cn_lock); goto restart_search; } } init_completion(&new->cu_done); - new->cu_msg.cm_vers = CLD_UPCALL_VERSION; - put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid); + new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version; + put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid); new->cu_net = cn; list_add(&new->cu_list, &cn->cn_list); spin_unlock(&cn->cn_lock); - dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid); + dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid); return new; } @@ -1066,20 +1079,20 @@ nfsd4_cld_create(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Create; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Create; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1103,20 +1116,20 @@ nfsd4_cld_remove(struct nfs4_client *clp) if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Remove; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Remove; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1145,21 +1158,21 @@ nfsd4_cld_check_v0(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { printk(KERN_ERR "NFSD: Unable to check client record on " "stable storage: %d\n", -ENOMEM); return -ENOMEM; } - cup->cu_msg.cm_cmd = Cld_Check; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Check; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1223,16 +1236,16 @@ nfsd4_cld_grace_start(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceStart; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1250,17 +1263,17 @@ nfsd4_cld_grace_done_v0(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceDone; - cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + cup->cu_u.cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1279,16 +1292,16 @@ nfsd4_cld_grace_done(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceDone; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1336,6 +1349,50 @@ cld_running(struct nfsd_net *nn) return pipe->nreaders || pipe->nwriters; } +static int +nfsd4_cld_get_version(struct nfsd_net *nn) +{ + int ret = 0; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + uint8_t version; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + if (!ret) { + ret = cup->cu_u.cu_msg.cm_status; + if (ret) + goto out_free; + version = cup->cu_u.cu_msg.cm_u.cm_version; + dprintk("%s: userspace returned version %u\n", + __func__, version); + if (version < 1) + version = 1; + else if (version > CLD_UPCALL_VERSION) + version = CLD_UPCALL_VERSION; + + switch (version) { + case 1: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + break; + default: + break; + } + } +out_free: + free_cld_upcall(cup); +out_err: + if (ret) + dprintk("%s: Unable to get version from userspace: %d\n", + __func__, ret); + return ret; +} + static int nfsd4_cld_tracking_init(struct net *net) { @@ -1368,10 +1425,14 @@ nfsd4_cld_tracking_init(struct net *net) goto err_remove; } + status = nfsd4_cld_get_version(nn); + if (status == -EOPNOTSUPP) + pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n"); + status = nfsd4_cld_grace_start(nn); if (status) { if (status == -EOPNOTSUPP) - printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n"); + pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n"); nfs4_release_reclaim(nn); goto err_remove; } else @@ -1403,6 +1464,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = { .remove = nfsd4_cld_remove, .check = nfsd4_cld_check_v0, .grace_done = nfsd4_cld_grace_done_v0, + .version = 1, + .msglen = sizeof(struct cld_msg), }; /* For newer nfsdcld's */ @@ -1413,6 +1476,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .remove = nfsd4_cld_remove, .check = nfsd4_cld_check, .grace_done = nfsd4_cld_grace_done, + .version = 1, + .msglen = sizeof(struct cld_msg), }; /* upcall via usermodehelper */ @@ -1760,6 +1825,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .remove = nfsd4_umh_cltrack_remove, .check = nfsd4_umh_cltrack_check, .grace_done = nfsd4_umh_cltrack_grace_done, + .version = 1, + .msglen = 0, }; int diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h index b1e9de4f07d5..c5aad16d10c0 100644 --- a/include/uapi/linux/nfsd/cld.h +++ b/include/uapi/linux/nfsd/cld.h @@ -36,7 +36,8 @@ enum cld_command { Cld_Remove, /* remove record of this cm_id */ Cld_Check, /* is this cm_id allowed? */ Cld_GraceDone, /* grace period is complete */ - Cld_GraceStart, + Cld_GraceStart, /* grace start (upload client records) */ + Cld_GetVersion, /* query max supported upcall version */ }; /* representation of long-form NFSv4 client ID */ @@ -54,7 +55,15 @@ struct cld_msg { union { __s64 cm_gracetime; /* grace period start time */ struct cld_name cm_name; + __u8 cm_version; /* for getting max version */ } __attribute__((packed)) cm_u; } __attribute__((packed)); +struct cld_msg_hdr { + __u8 cm_vers; /* upcall version */ + __u8 cm_cmd; /* upcall command */ + __s16 cm_status; /* return code */ + __u32 cm_xid; /* transaction id */ +} __attribute__((packed)); + #endif /* !_NFSD_CLD_H */ From 6ee95d1c899186c0798cafd25998d436bcdb9618 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Mon, 9 Sep 2019 16:10:31 -0400 Subject: [PATCH 0389/2880] nfsd: add support for upcall version 2 Version 2 upcalls will allow the nfsd to include a hash of the kerberos principal string in the Cld_Create upcall. If a principal is present in the svc_cred, then the hash will be included in the Cld_Create upcall. We attempt to use the svc_cred.cr_raw_principal (which is returned by gssproxy) first, and then fall back to using the svc_cred.cr_principal (which is returned by both gssproxy and rpc.svcgssd). Upon a subsequent restart, the hash will be returned in the Cld_Gracestart downcall and stored in the reclaim_str_hashtbl so it can be used when handling reclaim opens. Signed-off-by: Scott Mayhew Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4recover.c | 223 +++++++++++++++++++++++++++++++--- fs/nfsd/nfs4state.c | 6 +- fs/nfsd/state.h | 3 +- include/uapi/linux/nfsd/cld.h | 30 ++++- 4 files changed, 245 insertions(+), 17 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 58a61339d40c..cdc75ad4438b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -64,6 +64,7 @@ struct nfsd4_client_tracking_ops { }; static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; @@ -177,6 +178,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, const char *dname, int len, struct nfsd_net *nn) { struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; struct nfs4_client_reclaim *crp; name.data = kmemdup(dname, len, GFP_KERNEL); @@ -186,7 +188,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, return; } name.len = len; - crp = nfs4_client_to_reclaim(name, nn); + crp = nfs4_client_to_reclaim(name, princhash, nn); if (!crp) { kfree(name.data); return; @@ -486,6 +488,7 @@ static int load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; if (child->d_name.len != HEXDIR_LEN - 1) { printk("%s: illegal name %pd in recovery directory\n", @@ -500,7 +503,7 @@ load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) goto out; } name.len = HEXDIR_LEN; - if (!nfs4_client_to_reclaim(name, nn)) + if (!nfs4_client_to_reclaim(name, princhash, nn)) kfree(name.data); out: return 0; @@ -737,6 +740,7 @@ struct cld_net { struct list_head cn_list; unsigned int cn_xid; bool cn_has_legacy; + struct crypto_shash *cn_tfm; }; struct cld_upcall { @@ -746,6 +750,7 @@ struct cld_upcall { union { struct cld_msg_hdr cu_hdr; struct cld_msg cu_msg; + struct cld_msg_v2 cu_msg_v2; } cu_u; }; @@ -792,11 +797,11 @@ cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) } static ssize_t -__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, +__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, struct nfsd_net *nn) { - uint8_t cmd; - struct xdr_netobj name; + uint8_t cmd, princhashlen; + struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; uint16_t namelen; struct cld_net *cn = nn->cld_net; @@ -805,19 +810,45 @@ __cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, return -EFAULT; } if (cmd == Cld_GraceStart) { - if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len)) - return -EFAULT; - name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen); - if (IS_ERR_OR_NULL(name.data)) - return -EFAULT; - name.len = namelen; + if (nn->client_tracking_ops->version >= 2) { + const struct cld_clntinfo __user *ci; + + ci = &cmsg->cm_u.cm_clntinfo; + if (get_user(namelen, &ci->cc_name.cn_len)) + return -EFAULT; + name.data = memdup_user(&ci->cc_name.cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + get_user(princhashlen, &ci->cc_princhash.cp_len); + if (princhashlen > 0) { + princhash.data = memdup_user( + &ci->cc_princhash.cp_data, + princhashlen); + if (IS_ERR_OR_NULL(princhash.data)) + return -EFAULT; + princhash.len = princhashlen; + } else + princhash.len = 0; + } else { + const struct cld_name __user *cnm; + + cnm = &cmsg->cm_u.cm_name; + if (get_user(namelen, &cnm->cn_len)) + return -EFAULT; + name.data = memdup_user(&cnm->cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + } if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { name.len = name.len - 5; memmove(name.data, name.data + 5, name.len); cn->cn_has_legacy = true; } - if (!nfs4_client_to_reclaim(name, nn)) { + if (!nfs4_client_to_reclaim(name, princhash, nn)) { kfree(name.data); + kfree(princhash.data); return -EFAULT; } return nn->client_tracking_ops->msglen; @@ -830,7 +861,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; - struct cld_msg __user *cmsg = (struct cld_msg __user *)src; + struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfsd_net_id); @@ -881,7 +912,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (status == -EINPROGRESS) return __cld_pipe_inprogress_downcall(cmsg, nn); - if (copy_from_user(&cup->cu_u.cu_msg, src, mlen) != 0) + if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0) return -EFAULT; complete(&cup->cu_done); @@ -1019,6 +1050,8 @@ nfsd4_remove_cld_pipe(struct net *net) nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); + if (cn->cn_tfm) + crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } @@ -1103,6 +1136,75 @@ out_err: "record on stable storage: %d\n", ret); } +/* Ask daemon to create a new record */ +static void +nfsd4_cld_create_v2(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + struct cld_msg_v2 *cmsg; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + SHASH_DESC_ON_STACK(desc, tfm); + + /* Don't upcall if it's already stored */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cmsg = &cup->cu_u.cu_msg_v2; + cmsg->cm_cmd = Cld_Create; + cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal) { + desc->tfm = tfm; + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) { + ret = -ENOMEM; + goto out; + } + ret = crypto_shash_digest(desc, principal, strlen(principal), + cksum.data); + shash_desc_zero(desc); + if (ret) { + kfree(cksum.data); + goto out; + } + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, + cksum.data, cksum.len); + kfree(cksum.data); + } else + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; + + ret = cld_pipe_upcall(cn->cn_pipe, cmsg); + if (!ret) { + ret = cmsg->cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + +out: + free_cld_upcall(cup); +out_err: + if (ret) + pr_err("NFSD: Unable to create client record on stable storage: %d\n", + ret); +} + /* Ask daemon to create a new record */ static void nfsd4_cld_remove(struct nfs4_client *clp) @@ -1229,6 +1331,79 @@ found: return 0; } +static int +nfsd4_cld_check_v2(struct nfs4_client *clp) +{ + struct nfs4_client_reclaim *crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + int status; + char dname[HEXDIR_LEN]; + struct xdr_netobj name; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + SHASH_DESC_ON_STACK(desc, tfm); + + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + /* look for it in the reclaim hashtable otherwise */ + crp = nfsd4_find_reclaim_client(clp->cl_name, nn); + if (crp) + goto found; + + if (cn->cn_has_legacy) { + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return -ENOENT; + + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data\n", + __func__); + return -ENOENT; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + goto found; + + } + return -ENOENT; +found: + if (crp->cr_princhash.len) { + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal == NULL) + return -ENOENT; + desc->tfm = tfm; + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) + return -ENOENT; + status = crypto_shash_digest(desc, principal, strlen(principal), + cksum.data); + shash_desc_zero(desc); + if (status) { + kfree(cksum.data); + return -ENOENT; + } + if (memcmp(crp->cr_princhash.data, cksum.data, + crp->cr_princhash.len)) { + kfree(cksum.data); + return -ENOENT; + } + kfree(cksum.data); + } + crp->cr_clp = clp; + return 0; +} + static int nfsd4_cld_grace_start(struct nfsd_net *nn) { @@ -1380,6 +1555,9 @@ nfsd4_cld_get_version(struct nfsd_net *nn) case 1: nn->client_tracking_ops = &nfsd4_cld_tracking_ops; break; + case 2: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2; + break; default: break; } @@ -1408,6 +1586,11 @@ nfsd4_cld_tracking_init(struct net *net) status = __nfsd4_init_cld_pipe(net); if (status) goto err_shutdown; + nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(nn->cld_net->cn_tfm)) { + status = PTR_ERR(nn->cld_net->cn_tfm); + goto err_remove; + } /* * rpc pipe upcalls take 30 seconds to time out, so we don't want to @@ -1480,6 +1663,18 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .msglen = sizeof(struct cld_msg), }; +/* v2 create/check ops include the principal, if available */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { + .init = nfsd4_cld_tracking_init, + .exit = nfsd4_cld_tracking_exit, + .create = nfsd4_cld_create_v2, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check_v2, + .grace_done = nfsd4_cld_grace_done, + .version = 2, + .msglen = sizeof(struct cld_msg_v2), +}; + /* upcall via usermodehelper */ static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 906e214dcce8..da1093dd5016 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6887,7 +6887,8 @@ nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn) * will be freed in nfs4_remove_reclaim_record in the normal case). */ struct nfs4_client_reclaim * -nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn) +nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, + struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp; @@ -6900,6 +6901,8 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn) list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); crp->cr_name.data = name.data; crp->cr_name.len = name.len; + crp->cr_princhash.data = princhash.data; + crp->cr_princhash.len = princhash.len; crp->cr_clp = NULL; nn->reclaim_str_hashtbl_size++; } @@ -6911,6 +6914,7 @@ nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) { list_del(&crp->cr_strhash); kfree(crp->cr_name.data); + kfree(crp->cr_princhash.data); kfree(crp); nn->reclaim_str_hashtbl_size--; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d00c86d05beb..46f56afb6cb8 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -378,6 +378,7 @@ struct nfs4_client_reclaim { struct list_head cr_strhash; /* hash by cr_name */ struct nfs4_client *cr_clp; /* pointer to associated clp */ struct xdr_netobj cr_name; /* recovery dir name */ + struct xdr_netobj cr_princhash; }; /* A reasonable value for REPLAY_ISIZE was estimated as follows: @@ -645,7 +646,7 @@ extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, - struct nfsd_net *nn); + struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); struct nfs4_file *find_file(struct knfsd_fh *fh); diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h index c5aad16d10c0..a519313af953 100644 --- a/include/uapi/linux/nfsd/cld.h +++ b/include/uapi/linux/nfsd/cld.h @@ -26,11 +26,15 @@ #include /* latest upcall version available */ -#define CLD_UPCALL_VERSION 1 +#define CLD_UPCALL_VERSION 2 /* defined by RFC3530 */ #define NFS4_OPAQUE_LIMIT 1024 +#ifndef SHA256_DIGEST_SIZE +#define SHA256_DIGEST_SIZE 32 +#endif + enum cld_command { Cld_Create, /* create a record for this cm_id */ Cld_Remove, /* remove record of this cm_id */ @@ -46,6 +50,17 @@ struct cld_name { unsigned char cn_id[NFS4_OPAQUE_LIMIT]; /* client-provided */ } __attribute__((packed)); +/* sha256 hash of the kerberos principal */ +struct cld_princhash { + __u8 cp_len; /* length of cp_data */ + unsigned char cp_data[SHA256_DIGEST_SIZE]; /* hash of principal */ +} __attribute__((packed)); + +struct cld_clntinfo { + struct cld_name cc_name; + struct cld_princhash cc_princhash; +} __attribute__((packed)); + /* message struct for communication with userspace */ struct cld_msg { __u8 cm_vers; /* upcall version */ @@ -59,6 +74,19 @@ struct cld_msg { } __attribute__((packed)) cm_u; } __attribute__((packed)); +/* version 2 message can include hash of kerberos principal */ +struct cld_msg_v2 { + __u8 cm_vers; /* upcall version */ + __u8 cm_cmd; /* upcall command */ + __s16 cm_status; /* return code */ + __u32 cm_xid; /* transaction id */ + union { + struct cld_name cm_name; + __u8 cm_version; /* for getting max version */ + struct cld_clntinfo cm_clntinfo; /* name & princ hash */ + } __attribute__((packed)) cm_u; +} __attribute__((packed)); + struct cld_msg_hdr { __u8 cm_vers; /* upcall version */ __u8 cm_cmd; /* upcall command */ From 5079bde790304959bf744c548efdd5be660ea8e2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 1 Sep 2019 14:48:19 +0200 Subject: [PATCH 0390/2880] perf python: Add missing python/perf.so dependency for libperf The python/perf.so compilation needs libperf ready, otherwise it fails: $ make python/perf.so JOBS=1 BUILD: Doing 'make -j1' parallel build GEN python/perf.so gcc: error: /home/jolsa/kernel/linux-perf/tools/perf/lib/libperf.a: No such file or directory Fixing this with by adding libperf dependency. Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190901124822.10132-2-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index f9807d8c005b..2ccc12f3730b 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -567,7 +567,7 @@ all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS) # Create python binding output directory if not already present _dummy := $(shell [ -d '$(OUTPUT)python' ] || mkdir -p '$(OUTPUT)python') -$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBTRACEEVENT_DYNAMIC_LIST) +$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBTRACEEVENT_DYNAMIC_LIST) $(LIBPERF) $(QUIET_GEN)LDSHARED="$(CC) -pthread -shared" \ CFLAGS='$(CFLAGS)' LDFLAGS='$(LDFLAGS) $(LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS)' \ $(PYTHON_WORD) util/setup.py \ From 9eab951f34dbd092ab520bda167f288899858306 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 1 Sep 2019 14:48:21 +0200 Subject: [PATCH 0391/2880] perf tests: Add libperf automated test for 'make -C tools/perf build-test' Add a libperf build test, that is triggered when one does: $ make -C tools/perf build-test Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190901124822.10132-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 70c48475896d..6b3afed5d910 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -327,6 +327,10 @@ make_kernelsrc_tools: (make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \ test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) +make_libperf: + @echo "- make -C lib"; + make -C lib clean >$@ 2>&1; make -C lib >>$@ 2>&1 && rm $@ + FEATURES_DUMP_FILE := $(FULL_O)/BUILD_TEST_FEATURE_DUMP FEATURES_DUMP_FILE_STATIC := $(FULL_O)/BUILD_TEST_FEATURE_DUMP_STATIC @@ -365,5 +369,5 @@ $(foreach t,$(run),$(if $(findstring make_static,$(t)),\ $(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE)))) endif -.PHONY: all $(run) $(run_O) tarpkg clean make_kernelsrc make_kernelsrc_tools +.PHONY: all $(run) $(run_O) tarpkg clean make_kernelsrc make_kernelsrc_tools make_libperf endif # ifndef MK From 227cb129858a3eb4b874937274b09834ffcc39b0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 1 Sep 2019 14:48:22 +0200 Subject: [PATCH 0392/2880] libperf: Add missing event.h file to install rule So that this development header is properly installed and can be found by tools linking with libperf. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190901124822.10132-5-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/lib/Makefile b/tools/perf/lib/Makefile index a67efb8d9d39..e325c0503dc6 100644 --- a/tools/perf/lib/Makefile +++ b/tools/perf/lib/Makefile @@ -146,6 +146,7 @@ install_headers: $(call do_install,include/perf/threadmap.h,$(prefix)/include/perf,644); \ $(call do_install,include/perf/evlist.h,$(prefix)/include/perf,644); \ $(call do_install,include/perf/evsel.h,$(prefix)/include/perf,644); + $(call do_install,include/perf/event.h,$(prefix)/include/perf,644); install_pkgconfig: $(LIBPERF_PC) $(call QUIET_INSTALL, $(LIBPERF_PC)) \ From 4256d434935e9c85a731823be562785494ca364b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Sep 2019 14:12:53 +0200 Subject: [PATCH 0393/2880] libperf: Adopt perf_cpu_map__max() function From 'perf stat', so that it can be used from multiple places. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Joe Mario Cc: Kan Liang Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190902121255.536-2-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 14 +------------- tools/perf/lib/cpumap.c | 12 ++++++++++++ tools/perf/lib/include/perf/cpumap.h | 1 + tools/perf/lib/libperf.map | 1 + 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 7e17bf9f700a..5bc0c570b7b6 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -822,18 +822,6 @@ static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused, return cpu_map__get_core(map, cpu, NULL); } -static int cpu_map__get_max(struct perf_cpu_map *map) -{ - int i, max = -1; - - for (i = 0; i < map->nr; i++) { - if (map->map[i] > max) - max = map->map[i]; - } - - return max; -} - static int perf_stat__get_aggr(struct perf_stat_config *config, aggr_get_id_t get_id, struct perf_cpu_map *map, int idx) { @@ -928,7 +916,7 @@ static int perf_stat_init_aggr_mode(void) * taking the highest cpu number to be the size of * the aggregation translate cpumap. */ - nr = cpu_map__get_max(evsel_list->core.cpus); + nr = perf_cpu_map__max(evsel_list->core.cpus); stat_config.cpus_aggr_map = perf_cpu_map__empty_new(nr + 1); return stat_config.cpus_aggr_map ? 0 : -ENOMEM; } diff --git a/tools/perf/lib/cpumap.c b/tools/perf/lib/cpumap.c index 1f0e6f334237..2ca1fafa620d 100644 --- a/tools/perf/lib/cpumap.c +++ b/tools/perf/lib/cpumap.c @@ -260,3 +260,15 @@ int perf_cpu_map__idx(struct perf_cpu_map *cpus, int cpu) return -1; } + +int perf_cpu_map__max(struct perf_cpu_map *map) +{ + int i, max = -1; + + for (i = 0; i < map->nr; i++) { + if (map->map[i] > max) + max = map->map[i]; + } + + return max; +} diff --git a/tools/perf/lib/include/perf/cpumap.h b/tools/perf/lib/include/perf/cpumap.h index 8aa995c59498..ac9aa497f84a 100644 --- a/tools/perf/lib/include/perf/cpumap.h +++ b/tools/perf/lib/include/perf/cpumap.h @@ -16,6 +16,7 @@ LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map); LIBPERF_API int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx); LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map); +LIBPERF_API int perf_cpu_map__max(struct perf_cpu_map *map); #define perf_cpu_map__for_each_cpu(cpu, idx, cpus) \ for ((idx) = 0, (cpu) = perf_cpu_map__cpu(cpus, idx); \ diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map index dc4d66363bc4..cd0d17b996c8 100644 --- a/tools/perf/lib/libperf.map +++ b/tools/perf/lib/libperf.map @@ -9,6 +9,7 @@ LIBPERF_0.0.1 { perf_cpu_map__nr; perf_cpu_map__cpu; perf_cpu_map__empty; + perf_cpu_map__max; perf_thread_map__new_dummy; perf_thread_map__set_pid; perf_thread_map__comm; From 76e43c8ccaa35c30d5df853013561145a0f750a5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 8 Sep 2019 20:15:18 -0700 Subject: [PATCH 0394/2880] fuse: fix deadlock with aio poll and fuse_iqueue::waitq.lock When IOCB_CMD_POLL is used on the FUSE device, aio_poll() disables IRQs and takes kioctx::ctx_lock, then fuse_iqueue::waitq.lock. This may have to wait for fuse_iqueue::waitq.lock to be released by one of many places that take it with IRQs enabled. Since the IRQ handler may take kioctx::ctx_lock, lockdep reports that a deadlock is possible. Fix it by protecting the state of struct fuse_iqueue with a separate spinlock, and only accessing fuse_iqueue::waitq using the versions of the waitqueue functions which do IRQ-safe locking internally. Reproducer: #include #include #include #include #include #include #include int main() { char opts[128]; int fd = open("/dev/fuse", O_RDWR); aio_context_t ctx = 0; struct iocb cb = { .aio_lio_opcode = IOCB_CMD_POLL, .aio_fildes = fd }; struct iocb *cbp = &cb; sprintf(opts, "fd=%d,rootmode=040000,user_id=0,group_id=0", fd); mkdir("mnt", 0700); mount("foo", "mnt", "fuse", 0, opts); syscall(__NR_io_setup, 1, &ctx); syscall(__NR_io_submit, ctx, 1, &cbp); } Beginning of lockdep output: ===================================================== WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected 5.3.0-rc5 #9 Not tainted ----------------------------------------------------- syz_fuse/135 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: 000000003590ceda (&fiq->waitq){+.+.}, at: spin_lock include/linux/spinlock.h:338 [inline] 000000003590ceda (&fiq->waitq){+.+.}, at: aio_poll fs/aio.c:1751 [inline] 000000003590ceda (&fiq->waitq){+.+.}, at: __io_submit_one.constprop.0+0x203/0x5b0 fs/aio.c:1825 and this task is already holding: 0000000075037284 (&(&ctx->ctx_lock)->rlock){..-.}, at: spin_lock_irq include/linux/spinlock.h:363 [inline] 0000000075037284 (&(&ctx->ctx_lock)->rlock){..-.}, at: aio_poll fs/aio.c:1749 [inline] 0000000075037284 (&(&ctx->ctx_lock)->rlock){..-.}, at: __io_submit_one.constprop.0+0x1f4/0x5b0 fs/aio.c:1825 which would create a new lock dependency: (&(&ctx->ctx_lock)->rlock){..-.} -> (&fiq->waitq){+.+.} but this new dependency connects a SOFTIRQ-irq-safe lock: (&(&ctx->ctx_lock)->rlock){..-.} [...] Reported-by: syzbot+af05535bb79520f95431@syzkaller.appspotmail.com Reported-by: syzbot+d86c4426a01f60feddc7@syzkaller.appspotmail.com Fixes: bfe4037e722e ("aio: implement IOCB_CMD_POLL") Cc: # v4.19+ Cc: Christoph Hellwig Signed-off-by: Eric Biggers Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 93 +++++++++++++++++++++++++----------------------- fs/fuse/fuse_i.h | 3 ++ fs/fuse/inode.c | 1 + 3 files changed, 52 insertions(+), 45 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fdb85895737b..5a5dd6b2f91e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -377,7 +377,7 @@ static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fiq->pending); - wake_up_locked(&fiq->waitq); + wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } @@ -389,16 +389,16 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (fiq->connected) { fiq->forget_list_tail->next = forget; fiq->forget_list_tail = forget; - wake_up_locked(&fiq->waitq); + wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } else { kfree(forget); } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -412,10 +412,10 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); req->in.h.unique = fuse_get_unique(fiq); queue_request(fiq, req); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } } @@ -439,9 +439,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) * smp_mb() from queue_interrupt(). */ if (!list_empty(&req->intr_entry)) { - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); @@ -483,10 +483,10 @@ put_request: static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return -EINVAL; } @@ -499,13 +499,13 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) smp_mb(); if (test_bit(FR_FINISHED, &req->flags)) { list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return 0; } - wake_up_locked(&fiq->waitq); + wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return 0; } @@ -535,16 +535,16 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) if (!err) return; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); /* Request is not yet in userspace, bail out */ if (test_bit(FR_PENDING, &req->flags)) { list_del(&req->list); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); __fuse_put_request(req); req->out.h.error = -EINTR; return; } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } /* @@ -559,9 +559,9 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) struct fuse_iqueue *fiq = &fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (!fiq->connected) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); req->out.h.error = -ENOTCONN; } else { req->in.h.unique = fuse_get_unique(fiq); @@ -569,7 +569,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) /* acquire extra reference, since request is still needed after request_end() */ __fuse_get_request(req); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); request_wait_answer(fc, req); /* Pairs with smp_wmb() in request_end() */ @@ -700,12 +700,12 @@ static int fuse_request_send_notify_reply(struct fuse_conn *fc, __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (fiq->connected) { queue_request(fiq, req); err = 0; } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return err; } @@ -1149,12 +1149,12 @@ static int request_pending(struct fuse_iqueue *fiq) * Unlike other requests this is assembled on demand, without a need * to allocate a separate fuse_req structure. * - * Called with fiq->waitq.lock held, releases it + * Called with fiq->lock held, releases it */ static int fuse_read_interrupt(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1169,7 +1169,7 @@ __releases(fiq->waitq.lock) ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); arg.unique = req->in.h.unique; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); if (nbytes < reqsize) return -EINVAL; @@ -1206,7 +1206,7 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { int err; struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); @@ -1220,7 +1220,7 @@ __releases(fiq->waitq.lock) .len = sizeof(ih) + sizeof(arg), }; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); kfree(forget); if (nbytes < ih.len) return -EINVAL; @@ -1238,7 +1238,7 @@ __releases(fiq->waitq.lock) static int fuse_read_batch_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { int err; unsigned max_forgets; @@ -1252,13 +1252,13 @@ __releases(fiq->waitq.lock) }; if (nbytes < ih.len) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return -EINVAL; } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); head = dequeue_forget(fiq, max_forgets, &count); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); arg.count = count; ih.len += count * sizeof(struct fuse_forget_one); @@ -1288,7 +1288,7 @@ __releases(fiq->waitq.lock) static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) return fuse_read_single_forget(fiq, cs, nbytes); @@ -1336,16 +1336,19 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, return -EINVAL; restart: - spin_lock(&fiq->waitq.lock); - err = -EAGAIN; - if ((file->f_flags & O_NONBLOCK) && fiq->connected && - !request_pending(fiq)) - goto err_unlock; + for (;;) { + spin_lock(&fiq->lock); + if (!fiq->connected || request_pending(fiq)) + break; + spin_unlock(&fiq->lock); - err = wait_event_interruptible_exclusive_locked(fiq->waitq, + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + err = wait_event_interruptible_exclusive(fiq->waitq, !fiq->connected || request_pending(fiq)); - if (err) - goto err_unlock; + if (err) + return err; + } if (!fiq->connected) { err = fc->aborted ? -ECONNABORTED : -ENODEV; @@ -1369,7 +1372,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, req = list_entry(fiq->pending.next, struct fuse_req, list); clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); in = &req->in; reqsize = in->h.len; @@ -1427,7 +1430,7 @@ out_end: return err; err_unlock: - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return err; } @@ -2139,12 +2142,12 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) fiq = &fud->fc->iq; poll_wait(file, &fiq->waitq, wait); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (!fiq->connected) mask = EPOLLERR; else if (request_pending(fiq)) mask |= EPOLLIN | EPOLLRDNORM; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return mask; } @@ -2239,15 +2242,15 @@ void fuse_abort_conn(struct fuse_conn *fc) flush_bg_queue(fc); spin_unlock(&fc->bg_lock); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); fiq->connected = 0; list_for_each_entry(req, &fiq->pending, list) clear_bit(FR_PENDING, &req->flags); list_splice_tail_init(&fiq->pending, &to_end); while (forget_pending(fiq)) kfree(dequeue_forget(fiq, 1, NULL)); - wake_up_all_locked(&fiq->waitq); - spin_unlock(&fiq->waitq.lock); + wake_up_all(&fiq->waitq); + spin_unlock(&fiq->lock); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); end_polls(fc); wake_up_all(&fc->blocked_waitq); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 24dbca777775..89bdc41e0d86 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -450,6 +450,9 @@ struct fuse_iqueue { /** Connection established */ unsigned connected; + /** Lock protecting accesses to members of this structure */ + spinlock_t lock; + /** Readers of the connection are waiting on this */ wait_queue_head_t waitq; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2183967261a4..9ae9fdd5a014 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -589,6 +589,7 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) static void fuse_iqueue_init(struct fuse_iqueue *fiq) { memset(fiq, 0, sizeof(struct fuse_iqueue)); + spin_lock_init(&fiq->lock); init_waitqueue_head(&fiq->waitq); INIT_LIST_HEAD(&fiq->pending); INIT_LIST_HEAD(&fiq->interrupts); From d5b4854357f47899ea5b0336b41b04e81b62b11d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 0395/2880] fuse: flatten 'struct fuse_args' ...to make future expansion simpler. The hiearachical structure is a historical thing that does not serve any practical purpose. The generated code is excatly the same before and after the patch. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 38 +++++----- fs/fuse/dir.c | 176 +++++++++++++++++++++++------------------------ fs/fuse/file.c | 116 +++++++++++++++---------------- fs/fuse/fuse_i.h | 21 ++---- fs/fuse/inode.c | 12 ++-- fs/fuse/xattr.c | 76 ++++++++++---------- 6 files changed, 216 insertions(+), 223 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5a5dd6b2f91e..e865c6b61652 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -590,32 +590,32 @@ EXPORT_SYMBOL_GPL(fuse_request_send); static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) { - if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS) - args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE; + if (fc->minor < 4 && args->opcode == FUSE_STATFS) + args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE; if (fc->minor < 9) { - switch (args->in.h.opcode) { + switch (args->opcode) { case FUSE_LOOKUP: case FUSE_CREATE: case FUSE_MKNOD: case FUSE_MKDIR: case FUSE_SYMLINK: case FUSE_LINK: - args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; break; case FUSE_GETATTR: case FUSE_SETATTR: - args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; break; } } if (fc->minor < 12) { - switch (args->in.h.opcode) { + switch (args->opcode) { case FUSE_CREATE: - args->in.args[0].size = sizeof(struct fuse_open_in); + args->in_args[0].size = sizeof(struct fuse_open_in); break; case FUSE_MKNOD: - args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; + args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; break; } } @@ -633,19 +633,19 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) /* Needs to be done after fuse_get_req() so that fc->minor is valid */ fuse_adjust_compat(fc, args); - req->in.h.opcode = args->in.h.opcode; - req->in.h.nodeid = args->in.h.nodeid; - req->in.numargs = args->in.numargs; - memcpy(req->in.args, args->in.args, - args->in.numargs * sizeof(struct fuse_in_arg)); - req->out.argvar = args->out.argvar; - req->out.numargs = args->out.numargs; - memcpy(req->out.args, args->out.args, - args->out.numargs * sizeof(struct fuse_arg)); + req->in.h.opcode = args->opcode; + req->in.h.nodeid = args->nodeid; + req->in.numargs = args->in_numargs; + memcpy(req->in.args, args->in_args, + args->in_numargs * sizeof(struct fuse_in_arg)); + req->out.argvar = args->out_argvar; + req->out.numargs = args->out_numargs; + memcpy(req->out.args, args->out_args, + args->out_numargs * sizeof(struct fuse_arg)); fuse_request_send(fc, req); ret = req->out.h.error; - if (!ret && args->out.argvar) { - BUG_ON(args->out.numargs != 1); + if (!ret && args->out_argvar) { + BUG_ON(args->out_numargs != 1); ret = req->out.args[0].size; } fuse_put_request(fc, req); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dd0f64f7bc06..30a96098e64a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -139,14 +139,14 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); - args->in.h.opcode = FUSE_LOOKUP; - args->in.h.nodeid = nodeid; - args->in.numargs = 1; - args->in.args[0].size = name->len + 1; - args->in.args[0].value = name->name; - args->out.numargs = 1; - args->out.args[0].size = sizeof(struct fuse_entry_out); - args->out.args[0].value = outarg; + args->opcode = FUSE_LOOKUP; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = name->len + 1; + args->in_args[0].value = name->name; + args->out_numargs = 1; + args->out_args[0].size = sizeof(struct fuse_entry_out); + args->out_args[0].value = outarg; } /* @@ -410,18 +410,18 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); - args.in.h.opcode = FUSE_CREATE; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; - args.out.numargs = 2; - args.out.args[0].size = sizeof(outentry); - args.out.args[0].value = &outentry; - args.out.args[1].size = sizeof(outopen); - args.out.args[1].value = &outopen; + args.opcode = FUSE_CREATE; + args.nodeid = get_node_id(dir); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.out_numargs = 2; + args.out_args[0].size = sizeof(outentry); + args.out_args[0].value = &outentry; + args.out_args[1].size = sizeof(outopen); + args.out_args[1].value = &outopen; err = fuse_simple_request(fc, &args); if (err) goto out_free_ff; @@ -526,10 +526,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, return -ENOMEM; memset(&outarg, 0, sizeof(outarg)); - args->in.h.nodeid = get_node_id(dir); - args->out.numargs = 1; - args->out.args[0].size = sizeof(outarg); - args->out.args[0].value = &outarg; + args->nodeid = get_node_id(dir); + args->out_numargs = 1; + args->out_args[0].size = sizeof(outarg); + args->out_args[0].value = &outarg; err = fuse_simple_request(fc, args); if (err) goto out_put_forget_req; @@ -582,12 +582,12 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); inarg.umask = current_umask(); - args.in.h.opcode = FUSE_MKNOD; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; + args.opcode = FUSE_MKNOD; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, mode); } @@ -609,12 +609,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.umask = current_umask(); - args.in.h.opcode = FUSE_MKDIR; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; + args.opcode = FUSE_MKDIR; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, S_IFDIR); } @@ -625,12 +625,12 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, unsigned len = strlen(link) + 1; FUSE_ARGS(args); - args.in.h.opcode = FUSE_SYMLINK; - args.in.numargs = 2; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; - args.in.args[1].size = len; - args.in.args[1].value = link; + args.opcode = FUSE_SYMLINK; + args.in_numargs = 2; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; + args.in_args[1].size = len; + args.in_args[1].value = link; return create_new_entry(fc, &args, dir, entry, S_IFLNK); } @@ -648,11 +648,11 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); - args.in.h.opcode = FUSE_UNLINK; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 1; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; + args.opcode = FUSE_UNLINK; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { struct inode *inode = d_inode(entry); @@ -684,11 +684,11 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); - args.in.h.opcode = FUSE_RMDIR; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 1; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; + args.opcode = FUSE_RMDIR; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(d_inode(entry)); @@ -711,15 +711,15 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); inarg.flags = flags; - args.in.h.opcode = opcode; - args.in.h.nodeid = get_node_id(olddir); - args.in.numargs = 3; - args.in.args[0].size = argsize; - args.in.args[0].value = &inarg; - args.in.args[1].size = oldent->d_name.len + 1; - args.in.args[1].value = oldent->d_name.name; - args.in.args[2].size = newent->d_name.len + 1; - args.in.args[2].value = newent->d_name.name; + args.opcode = opcode; + args.nodeid = get_node_id(olddir); + args.in_numargs = 3; + args.in_args[0].size = argsize; + args.in_args[0].value = &inarg; + args.in_args[1].size = oldent->d_name.len + 1; + args.in_args[1].value = oldent->d_name.name; + args.in_args[2].size = newent->d_name.len + 1; + args.in_args[2].value = newent->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ @@ -796,12 +796,12 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); - args.in.h.opcode = FUSE_LINK; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = newent->d_name.len + 1; - args.in.args[1].value = newent->d_name.name; + args.opcode = FUSE_LINK; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = newent->d_name.len + 1; + args.in_args[1].value = newent->d_name.name; err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" @@ -884,14 +884,14 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - args.in.h.opcode = FUSE_GETATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_GETATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) { if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { @@ -1056,11 +1056,11 @@ static int fuse_access(struct inode *inode, int mask) memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); - args.in.h.opcode = FUSE_ACCESS; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_ACCESS; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_access = 1; @@ -1383,14 +1383,14 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, struct fuse_setattr_in *inarg_p, struct fuse_attr_out *outarg_p) { - args->in.h.opcode = FUSE_SETATTR; - args->in.h.nodeid = get_node_id(inode); - args->in.numargs = 1; - args->in.args[0].size = sizeof(*inarg_p); - args->in.args[0].value = inarg_p; - args->out.numargs = 1; - args->out.args[0].size = sizeof(*outarg_p); - args->out.args[0].value = outarg_p; + args->opcode = FUSE_SETATTR; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg_p); + args->in_args[0].value = inarg_p; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg_p); + args->out_args[0].value = outarg_p; } /* diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e076c2cf65b0..1f90722f0ee8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -29,14 +29,14 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; - args.in.h.opcode = opcode; - args.in.h.nodeid = nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(*outargp); - args.out.args[0].value = outargp; + args.opcode = opcode; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(*outargp); + args.out_args[0].value = outargp; return fuse_simple_request(fc, &args); } @@ -464,11 +464,11 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; - args.in.h.opcode = opcode; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = opcode; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; return fuse_simple_request(fc, &args); } @@ -2221,11 +2221,11 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; - args->in.h.opcode = opcode; - args->in.h.nodeid = get_node_id(inode); - args->in.numargs = 1; - args->in.args[0].size = sizeof(*inarg); - args->in.args[0].value = inarg; + args->opcode = opcode; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; } static int fuse_getlk(struct file *file, struct file_lock *fl) @@ -2238,9 +2238,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) int err; fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) err = convert_fuse_file_lock(fc, &outarg.lk, fl); @@ -2335,14 +2335,14 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) memset(&inarg, 0, sizeof(inarg)); inarg.block = block; inarg.blocksize = inode->i_sb->s_blocksize; - args.in.h.opcode = FUSE_BMAP; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_BMAP; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) fc->no_bmap = 1; @@ -2367,14 +2367,14 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) if (fc->no_lseek) goto fallback; - args.in.h.opcode = FUSE_LSEEK; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_LSEEK; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err) { if (err == -ENOSYS) { @@ -2860,14 +2860,14 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - args.in.h.opcode = FUSE_POLL; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_POLL; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) @@ -3075,11 +3075,11 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - args.in.h.opcode = FUSE_FALLOCATE; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_FALLOCATE; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_fallocate = 1; @@ -3167,14 +3167,14 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.in.h.opcode = FUSE_COPY_FILE_RANGE; - args.in.h.nodeid = ff_in->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_COPY_FILE_RANGE; + args.nodeid = ff_in->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 89bdc41e0d86..835c0671320c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -287,20 +287,13 @@ struct fuse_page_desc { }; struct fuse_args { - struct { - struct { - uint32_t opcode; - uint64_t nodeid; - } h; - unsigned numargs; - struct fuse_in_arg args[3]; - - } in; - struct { - unsigned argvar:1; - unsigned numargs; - struct fuse_arg args[2]; - } out; + uint32_t opcode; + uint64_t nodeid; + unsigned int in_numargs; + struct fuse_in_arg in_args[3]; + unsigned int out_argvar:1; + unsigned int out_numargs; + struct fuse_arg out_args[2]; }; #define FUSE_ARGS(args) struct fuse_args args = {} diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9ae9fdd5a014..4eaea0b29965 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -437,12 +437,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) } memset(&outarg, 0, sizeof(outarg)); - args.in.numargs = 0; - args.in.h.opcode = FUSE_STATFS; - args.in.h.nodeid = get_node_id(d_inode(dentry)); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.in_numargs = 0; + args.opcode = FUSE_STATFS; + args.nodeid = get_node_id(d_inode(dentry)); + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 433717640f78..2e02486e46e6 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -25,15 +25,15 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; - args.in.h.opcode = FUSE_SETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 3; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - args.in.args[2].size = size; - args.in.args[2].value = value; + args.opcode = FUSE_SETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 3; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; + args.in_args[2].size = size; + args.in_args[2].value = value; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_setxattr = 1; @@ -60,22 +60,22 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - args.in.h.opcode = FUSE_GETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; + args.opcode = FUSE_GETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; /* This is really two different operations rolled into one */ - args.out.numargs = 1; + args.out_numargs = 1; if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = value; + args.out_argvar = 1; + args.out_args[0].size = size; + args.out_args[0].value = value; } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; } ret = fuse_simple_request(fc, &args); if (!ret && !size) @@ -121,20 +121,20 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - args.in.h.opcode = FUSE_LISTXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_LISTXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; /* This is really two different operations rolled into one */ - args.out.numargs = 1; + args.out_numargs = 1; if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = list; + args.out_argvar = 1; + args.out_args[0].size = size; + args.out_args[0].value = list; } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; } ret = fuse_simple_request(fc, &args); if (!ret && !size) @@ -157,11 +157,11 @@ int fuse_removexattr(struct inode *inode, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - args.in.h.opcode = FUSE_REMOVEXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = strlen(name) + 1; - args.in.args[0].value = name; + args.opcode = FUSE_REMOVEXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = strlen(name) + 1; + args.in_args[0].value = name; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_removexattr = 1; From 1f4e9d03d1fbff428a0e864d5456e0a2dace6f81 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 0396/2880] fuse: rearrange and resize fuse_args fields This makes the structure better packed. Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 8 ++++---- fs/fuse/xattr.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 835c0671320c..a89362ee46d9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -287,12 +287,12 @@ struct fuse_page_desc { }; struct fuse_args { - uint32_t opcode; uint64_t nodeid; - unsigned int in_numargs; + uint32_t opcode; + unsigned short in_numargs; + unsigned short out_numargs; + bool out_argvar:1; struct fuse_in_arg in_args[3]; - unsigned int out_argvar:1; - unsigned int out_numargs; struct fuse_arg out_args[2]; }; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 2e02486e46e6..20d052e08b3b 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -70,7 +70,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, /* This is really two different operations rolled into one */ args.out_numargs = 1; if (size) { - args.out_argvar = 1; + args.out_argvar = true; args.out_args[0].size = size; args.out_args[0].value = value; } else { @@ -129,7 +129,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) /* This is really two different operations rolled into one */ args.out_numargs = 1; if (size) { - args.out_argvar = 1; + args.out_argvar = true; args.out_args[0].size = size; args.out_args[0].value = list; } else { From 40ac7ab2d02176f8a70e37b88e41637ed97b304b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 0397/2880] fuse: simplify 'nofail' request Instead of complex games with a reserved request, just use __GFP_NOFAIL. Both calers (flush, readdir) guarantee that connection was already initialized, so no need to wait for fc->initialized. Also remove unneeded clearing of FR_BACKGROUND flag. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 64 +++--------------------------------------------- fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 8 +----- fs/fuse/inode.c | 1 - 4 files changed, 6 insertions(+), 69 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e865c6b61652..7833aee97565 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -242,52 +242,6 @@ struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, } EXPORT_SYMBOL_GPL(fuse_get_req_for_background); -/* - * Return request in fuse_file->reserved_req. However that may - * currently be in use. If that is the case, wait for it to become - * available. - */ -static struct fuse_req *get_reserved_req(struct fuse_conn *fc, - struct file *file) -{ - struct fuse_req *req = NULL; - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; - - do { - wait_event(fc->reserved_req_waitq, ff->reserved_req); - spin_lock(&fi->lock); - if (ff->reserved_req) { - req = ff->reserved_req; - ff->reserved_req = NULL; - req->stolen_file = get_file(file); - } - spin_unlock(&fi->lock); - } while (!req); - - return req; -} - -/* - * Put stolen request back into fuse_file->reserved_req - */ -static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) -{ - struct file *file = req->stolen_file; - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; - - WARN_ON(req->max_pages); - spin_lock(&fi->lock); - memset(req, 0, sizeof(*req)); - fuse_request_init(req, NULL, NULL, 0); - BUG_ON(ff->reserved_req); - ff->reserved_req = req; - wake_up_all(&fc->reserved_req_waitq); - spin_unlock(&fi->lock); - fput(file); -} - /* * Gets a requests for a file operation, always succeeds * @@ -301,25 +255,18 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) * filesystem should not have it's own file open. If deadlock is * intentional, it can still be broken by "aborting" the filesystem. */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, - struct file *file) +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc) { struct fuse_req *req; atomic_inc(&fc->num_waiting); - wait_event(fc->blocked_waitq, fc->initialized); - /* Matches smp_wmb() in fuse_set_initialized() */ - smp_rmb(); - req = fuse_request_alloc(0); - if (!req) - req = get_reserved_req(fc, file); + req = __fuse_request_alloc(0, GFP_KERNEL | __GFP_NOFAIL); req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); __set_bit(FR_WAITING, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); return req; } @@ -342,10 +289,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) fuse_drop_waiting(fc); } - if (req->stolen_file) - put_reserved_req(fc, req); - else - fuse_request_free(req); + fuse_request_free(req); } } EXPORT_SYMBOL_GPL(fuse_put_request); @@ -719,7 +663,7 @@ void fuse_force_forget(struct file *file, u64 nodeid) memset(&inarg, 0, sizeof(inarg)); inarg.nlookup = 1; - req = fuse_get_req_nofail_nopages(fc, file); + req = fuse_get_req_nofail_nopages(fc); req->in.h.opcode = FUSE_FORGET; req->in.h.nodeid = nodeid; req->in.numargs = 1; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1f90722f0ee8..7d12c1d27132 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -432,7 +432,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - req = fuse_get_req_nofail_nopages(fc, file); + req = fuse_get_req_nofail_nopages(fc); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a89362ee46d9..dd199391d6b9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -435,8 +435,6 @@ struct fuse_req { /** Request completion callback */ void (*end)(struct fuse_conn *, struct fuse_req *); - /** Request is stolen from fuse_file->reserved_req */ - struct file *stolen_file; }; struct fuse_iqueue { @@ -580,9 +578,6 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; - /** waitq for reserved requests */ - wait_queue_head_t reserved_req_waitq; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -927,8 +922,7 @@ void __fuse_get_request(struct fuse_req *req); /** * Gets a requests for a file operation, always succeeds */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, - struct file *file); +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc); /** * Decrement reference count of a request. If count goes to zero free diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4eaea0b29965..2b9cc19fedcb 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -617,7 +617,6 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); - init_waitqueue_head(&fc->reserved_req_waitq); fuse_iqueue_init(&fc->iq); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); From c500ebaa908dbf6b3c562778a25d7e945b04f40f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 0398/2880] fuse: convert flush to simple api Add 'force' to fuse_args and use fuse_get_req_nofail_nopages() to allocate the request in that case. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 13 +++++++++---- fs/fuse/file.c | 20 +++++++++----------- fs/fuse/fuse_i.h | 6 +----- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 7833aee97565..a7be13b5d6ef 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -255,7 +255,7 @@ EXPORT_SYMBOL_GPL(fuse_get_req_for_background); * filesystem should not have it's own file open. If deadlock is * intentional, it can still be broken by "aborting" the filesystem. */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc) +static struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc) { struct fuse_req *req; @@ -570,9 +570,14 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) struct fuse_req *req; ssize_t ret; - req = fuse_get_req(fc, 0); - if (IS_ERR(req)) - return PTR_ERR(req); + if (args->force) { + req = fuse_get_req_nofail_nopages(fc); + __set_bit(FR_FORCE, &req->flags); + } else { + req = fuse_get_req(fc, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + } /* Needs to be done after fuse_get_req() so that fc->minor is valid */ fuse_adjust_compat(fc, args); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7d12c1d27132..f404e15357a6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -410,8 +410,8 @@ static int fuse_flush(struct file *file, fl_owner_t id) struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; - struct fuse_req *req; struct fuse_flush_in inarg; + FUSE_ARGS(args); int err; if (is_bad_inode(inode)) @@ -432,19 +432,17 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - req = fuse_get_req_nofail_nopages(fc); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); - req->in.h.opcode = FUSE_FLUSH; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - __set_bit(FR_FORCE, &req->flags); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.opcode = FUSE_FLUSH; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_flush = 1; err = 0; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index dd199391d6b9..24269f470c0e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -291,6 +291,7 @@ struct fuse_args { uint32_t opcode; unsigned short in_numargs; unsigned short out_numargs; + bool force:1; bool out_argvar:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; @@ -919,11 +920,6 @@ struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, */ void __fuse_get_request(struct fuse_req *req); -/** - * Gets a requests for a file operation, always succeeds - */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc); - /** * Decrement reference count of a request. If count goes to zero free * the request. From 454a7613f54e64a36b257812ae879ef8567c675e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 0399/2880] fuse: add noreply to fuse_args This will be used by fuse_force_forget(). We can expand fuse_request_send() into fuse_simple_request(). The FR_WAITING bit has already been set, no need to check. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 +++- fs/fuse/fuse_i.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a7be13b5d6ef..85ed1abb9235 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -591,7 +591,9 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) req->out.numargs = args->out_numargs; memcpy(req->out.args, args->out_args, args->out_numargs * sizeof(struct fuse_arg)); - fuse_request_send(fc, req); + if (!args->noreply) + __set_bit(FR_ISREPLY, &req->flags); + __fuse_request_send(fc, req); ret = req->out.h.error; if (!ret && args->out_argvar) { BUG_ON(args->out_numargs != 1); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 24269f470c0e..ed6538e01feb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -292,6 +292,7 @@ struct fuse_args { unsigned short in_numargs; unsigned short out_numargs; bool force:1; + bool noreply:1; bool out_argvar:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; From 3545fe21128262d425dd476b6d31f9a9e4d1b7a7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 0400/2880] fuse: convert fuse_force_forget() to simple api Move this function to the readdir.c where its only caller resides. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 21 --------------------- fs/fuse/fuse_i.h | 3 --- fs/fuse/readdir.c | 21 +++++++++++++++++++++ 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 85ed1abb9235..19114b23e95f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -661,27 +661,6 @@ static int fuse_request_send_notify_reply(struct fuse_conn *fc, return err; } -void fuse_force_forget(struct file *file, u64 nodeid) -{ - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - struct fuse_forget_in inarg; - - memset(&inarg, 0, sizeof(inarg)); - inarg.nlookup = 1; - req = fuse_get_req_nofail_nopages(fc); - req->in.h.opcode = FUSE_FORGET; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - __clear_bit(FR_ISREPLY, &req->flags); - __fuse_request_send(fc, req); - /* ignore errors */ - fuse_put_request(fc, req); -} - /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ed6538e01feb..e6a8e47d0e0b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -813,9 +813,6 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -/* Used by READDIRPLUS */ -void fuse_force_forget(struct file *file, u64 nodeid); - /** * Initialize READ or READDIR request */ diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 574d03f8a573..56a19faa855d 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -249,6 +249,27 @@ retry: return 0; } +static void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_forget_in inarg; + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + args.opcode = FUSE_FORGET; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + args.noreply = true; + + fuse_simple_request(fc, &args); + /* ignore errors */ +} + static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, struct dir_context *ctx, u64 attr_version) { From e413754b267e0e47ed38b3eacfa7178f21aca883 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 0401/2880] fuse: add nocreds to fuse_args In some cases it makes no sense to set pid/uid/gid fields in the request header. Allow fuse_simple_background() to omit these. This is only required in the "force" case, so for now just WARN if set otherwise. Fold fuse_get_req_nofail_nopages() into its only caller. Comment is obsolete anyway. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 44 +++++++++++++++----------------------------- fs/fuse/fuse_i.h | 1 + 2 files changed, 16 insertions(+), 29 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 19114b23e95f..9f1549166a5d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -242,34 +242,6 @@ struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, } EXPORT_SYMBOL_GPL(fuse_get_req_for_background); -/* - * Gets a requests for a file operation, always succeeds - * - * This is used for sending the FLUSH request, which must get to - * userspace, due to POSIX locks which may need to be unlocked. - * - * If allocation fails due to OOM, use the reserved request in - * fuse_file. - * - * This is very unlikely to deadlock accidentally, since the - * filesystem should not have it's own file open. If deadlock is - * intentional, it can still be broken by "aborting" the filesystem. - */ -static struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc) -{ - struct fuse_req *req; - - atomic_inc(&fc->num_waiting); - req = __fuse_request_alloc(0, GFP_KERNEL | __GFP_NOFAIL); - - req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); - req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); - - __set_bit(FR_WAITING, &req->flags); - return req; -} - void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (refcount_dec_and_test(&req->count)) { @@ -565,15 +537,29 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) } } +static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) +{ + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); +} + ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) { struct fuse_req *req; ssize_t ret; if (args->force) { - req = fuse_get_req_nofail_nopages(fc); + atomic_inc(&fc->num_waiting); + req = __fuse_request_alloc(0, GFP_KERNEL | __GFP_NOFAIL); + + if (!args->nocreds) + fuse_force_creds(fc, req); + + __set_bit(FR_WAITING, &req->flags); __set_bit(FR_FORCE, &req->flags); } else { + WARN_ON(args->nocreds); req = fuse_get_req(fc, 0); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e6a8e47d0e0b..43ae5b7f4dd4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -293,6 +293,7 @@ struct fuse_args { unsigned short out_numargs; bool force:1; bool noreply:1; + bool nocreds:1; bool out_argvar:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; From 1ccd1ea24962276ca0548386889ef7bf57479c5d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 0402/2880] fuse: convert destroy to simple api We can use the "force" flag to make sure the DESTROY request is always sent to userspace. So no need to keep it allocated during the lifetime of the filesystem. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 6 +++--- fs/fuse/inode.c | 28 ++++++++++------------------ 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f404e15357a6..53f2cc6970f1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -279,7 +279,7 @@ void fuse_release_common(struct file *file, bool isdir) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); + fuse_file_put(ff, ff->fc->destroy, isdir); } static int fuse_open(struct inode *inode, struct file *file) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 43ae5b7f4dd4..73f70f3872e7 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -715,6 +715,9 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? */ unsigned no_copy_file_range:1; + /* Send DESTROY request */ + unsigned int destroy:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -736,9 +739,6 @@ struct fuse_conn { /** Key for lock owner ID scrambling */ u32 scramble_key[4]; - /** Reserved request for the DESTROY message */ - struct fuse_req *destroy_req; - /** Version counter for attribute changes */ atomic64_t attr_version; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2b9cc19fedcb..127984642a71 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -386,14 +386,13 @@ static void fuse_umount_begin(struct super_block *sb) static void fuse_send_destroy(struct fuse_conn *fc) { - struct fuse_req *req = fc->destroy_req; - if (req && fc->conn_init) { - fc->destroy_req = NULL; - req->in.h.opcode = FUSE_DESTROY; - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(fc, req); - fuse_put_request(fc, req); + if (fc->conn_init) { + FUSE_ARGS(args); + + args.opcode = FUSE_DESTROY; + args.force = true; + args.nocreds = true; + fuse_simple_request(fc, &args); } } @@ -640,8 +639,6 @@ EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { - if (fc->destroy_req) - fuse_request_free(fc->destroy_req); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); fc->release(fc); @@ -1171,6 +1168,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; fc->max_read = max_t(unsigned, 4096, ctx->max_read); + fc->destroy = is_bdev; /* Used by get_root_inode() */ sb->s_fs_info = fc; @@ -1189,12 +1187,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) goto err_put_root; __set_bit(FR_BACKGROUND, &init_req->flags); - if (is_bdev) { - fc->destroy_req = fuse_request_alloc(0); - if (!fc->destroy_req) - goto err_free_init_req; - } - mutex_lock(&fuse_mutex); err = -EINVAL; if (file->private_data) @@ -1221,7 +1213,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) err_unlock: mutex_unlock(&fuse_mutex); - err_free_init_req: fuse_request_free(init_req); err_put_root: dput(root_dentry); @@ -1287,7 +1278,8 @@ static void fuse_sb_destroy(struct super_block *sb) struct fuse_conn *fc = get_fuse_conn_super(sb); if (fc) { - fuse_send_destroy(fc); + if (fc->destroy) + fuse_send_destroy(fc); fuse_abort_conn(fc); fuse_wait_aborted(fc); From 68583165f962793a6fe7d11f54713045f43fc8bb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 0403/2880] fuse: add pages to fuse_args Derive fuse_args_pages from fuse_args. This is used to handle requests which use pages for input or output. The related flags are added to fuse_args. New FR_ALLOC_PAGES flags is added to indicate whether the page arrays in fuse_req need to be freed by fuse_put_request() or not. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 41 +++++++++++++++++++++++++++++++---------- fs/fuse/fuse_i.h | 12 ++++++++++++ 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9f1549166a5d..3ef85c957122 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -81,6 +81,7 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) kmem_cache_free(fuse_req_cachep, req); return NULL; } + __set_bit(FR_ALLOC_PAGES, &req->flags); } else if (npages) { pages = req->inline_pages; page_descs = req->inline_page_descs; @@ -104,7 +105,7 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages) static void fuse_req_pages_free(struct fuse_req *req) { - if (req->pages != req->inline_pages) + if (test_bit(FR_ALLOC_PAGES, &req->flags)) kfree(req->pages); } @@ -127,6 +128,7 @@ bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, memcpy(page_descs, req->page_descs, sizeof(struct fuse_page_desc) * req->max_pages); fuse_req_pages_free(req); + __set_bit(FR_ALLOC_PAGES, &req->flags); req->pages = pages; req->page_descs = page_descs; req->max_pages = npages; @@ -544,6 +546,33 @@ static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } +void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) +{ + struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); + + req->in.h.opcode = args->opcode; + req->in.h.nodeid = args->nodeid; + req->in.numargs = args->in_numargs; + memcpy(req->in.args, args->in_args, + args->in_numargs * sizeof(struct fuse_in_arg)); + req->out.argvar = args->out_argvar; + req->out.numargs = args->out_numargs; + memcpy(req->out.args, args->out_args, + args->out_numargs * sizeof(struct fuse_arg)); + + if (args->in_pages || args->out_pages) { + req->in.argpages = args->in_pages; + req->out.argpages = args->out_pages; + req->out.page_zeroing = args->page_zeroing; + req->out.page_replace = args->page_replace; + + req->pages = ap->pages; + req->page_descs = ap->descs; + req->num_pages = ap->num_pages; + } + +} + ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) { struct fuse_req *req; @@ -567,16 +596,8 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) /* Needs to be done after fuse_get_req() so that fc->minor is valid */ fuse_adjust_compat(fc, args); + fuse_args_to_req(req, args); - req->in.h.opcode = args->opcode; - req->in.h.nodeid = args->nodeid; - req->in.numargs = args->in_numargs; - memcpy(req->in.args, args->in_args, - args->in_numargs * sizeof(struct fuse_in_arg)); - req->out.argvar = args->out_argvar; - req->out.numargs = args->out_numargs; - memcpy(req->out.args, args->out_args, - args->out_numargs * sizeof(struct fuse_arg)); if (!args->noreply) __set_bit(FR_ISREPLY, &req->flags); __fuse_request_send(fc, req); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 73f70f3872e7..1998d6ab4025 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -294,11 +294,22 @@ struct fuse_args { bool force:1; bool noreply:1; bool nocreds:1; + bool in_pages:1; + bool out_pages:1; bool out_argvar:1; + bool page_zeroing:1; + bool page_replace:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; }; +struct fuse_args_pages { + struct fuse_args args; + struct page **pages; + struct fuse_page_desc *descs; + unsigned int num_pages; +}; + #define FUSE_ARGS(args) struct fuse_args args = {} /** The request IO state (for asynchronous processing) */ @@ -352,6 +363,7 @@ enum fuse_req_flag { FR_SENT, FR_FINISHED, FR_PRIVATE, + FR_ALLOC_PAGES, }; /** From 4c29afece8729c6f6c1dd4865b6b7c972b7b3bbd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 0404/2880] fuse: convert readlink to simple api Also turn BUG_ON into gracefully recovered WARN_ON. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 54 +++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 30a96098e64a..11334ee01dc9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1152,38 +1152,36 @@ static int fuse_permission(struct inode *inode, int mask) static int fuse_readlink_page(struct inode *inode, struct page *page) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - int err; + struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_args_pages ap = { + .num_pages = 1, + .pages = &page, + .descs = &desc, + }; + char *link; + ssize_t res; - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + ap.args.opcode = FUSE_READLINK; + ap.args.nodeid = get_node_id(inode); + ap.args.out_pages = true; + ap.args.out_argvar = true; + ap.args.page_zeroing = true; + ap.args.out_numargs = 1; + ap.args.out_args[0].size = desc.length; + res = fuse_simple_request(fc, &ap.args); - req->out.page_zeroing = 1; - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE - 1; - req->in.h.opcode = FUSE_READLINK; - req->in.h.nodeid = get_node_id(inode); - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = PAGE_SIZE - 1; - fuse_request_send(fc, req); - err = req->out.h.error; - - if (!err) { - char *link = page_address(page); - size_t len = req->out.args[0].size; - - BUG_ON(len >= PAGE_SIZE); - link[len] = '\0'; - } - - fuse_put_request(fc, req); fuse_invalidate_atime(inode); - return err; + if (res < 0) + return res; + + if (WARN_ON(res >= PAGE_SIZE)) + return -EIO; + + link = page_address(page); + link[res] = '\0'; + + return 0; } static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, From 4c4f03f78ca9ce41a158710b87ad7e6d363e881a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 0405/2880] fuse: move page alloc fuse_req_pages_alloc() is moved to file.c, since its internal use by the device code will eventually be removed. Rename to fuse_pages_alloc() to signify that it's not only usable for fuse_req page array. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 17 ++--------------- fs/fuse/file.c | 12 ++++++++++++ fs/fuse/fuse_i.h | 2 ++ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3ef85c957122..0a6624aeced9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -54,18 +54,6 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, __set_bit(FR_PENDING, &req->flags); } -static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) -{ - struct page **pages; - - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) pages + npages * sizeof(struct page *); - - return pages; -} - static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); @@ -75,8 +63,7 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) WARN_ON(npages > FUSE_MAX_MAX_PAGES); if (npages > FUSE_REQ_INLINE_PAGES) { - pages = fuse_req_pages_alloc(npages, flags, - &page_descs); + pages = fuse_pages_alloc(npages, flags, &page_descs); if (!pages) { kmem_cache_free(fuse_req_cachep, req); return NULL; @@ -120,7 +107,7 @@ bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, fc->max_pages); WARN_ON(npages <= req->max_pages); - pages = fuse_req_pages_alloc(npages, flags, &page_descs); + pages = fuse_pages_alloc(npages, flags, &page_descs); if (!pages) return false; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 53f2cc6970f1..c3e95002f489 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,6 +19,18 @@ #include #include +struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) (pages + npages); + + return pages; +} + static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1998d6ab4025..b62a3e37ea4c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -909,6 +909,8 @@ struct fuse_req *fuse_request_alloc(unsigned npages); struct fuse_req *fuse_request_alloc_nofs(unsigned npages); +struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc); bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, gfp_t flags); From 093f38a2c1a81405f261ef1ac681a9003b061db5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 0406/2880] fuse: convert ioctl to simple api fuse_simple_request() is converted to return length of last (instead of single) out arg, since FUSE_IOCTL_OUT has two out args, the second of which is variable length. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 +-- fs/fuse/file.c | 97 ++++++++++++++++++++++---------------------------- 2 files changed, 45 insertions(+), 56 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a6624aeced9..29723e143666 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -590,8 +590,8 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) __fuse_request_send(fc, req); ret = req->out.h.error; if (!ret && args->out_argvar) { - BUG_ON(args->out_numargs != 1); - ret = req->out.args[0].size; + BUG_ON(args->out_numargs == 0); + ret = req->out.args[args->out_numargs - 1].size; } fuse_put_request(fc, req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c3e95002f489..1847cc53c416 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1267,14 +1267,14 @@ out: return written ? written : err; } -static inline void fuse_page_descs_length_init(struct fuse_req *req, - unsigned index, unsigned nr_pages) +static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, + unsigned int index, + unsigned int nr_pages) { int i; for (i = index; i < index + nr_pages; i++) - req->page_descs[i].length = PAGE_SIZE - - req->page_descs[i].offset; + descs[i].length = PAGE_SIZE - descs[i].offset; } static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) @@ -1326,7 +1326,8 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; req->page_descs[req->num_pages].offset = start; - fuse_page_descs_length_init(req, req->num_pages, npages); + fuse_page_descs_length_init(req->page_descs, req->num_pages, + npages); req->num_pages += npages; req->page_descs[req->num_pages - 1].length -= @@ -2582,14 +2583,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, .flags = flags }; struct fuse_ioctl_out outarg; - struct fuse_req *req = NULL; - struct page **pages = NULL; struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; - unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; - size_t in_size, out_size, transferred, c; + unsigned int in_iovs = 0, out_iovs = 0, max_pages; + size_t in_size, out_size, c; + ssize_t transferred; int err, i; struct iov_iter ii; + struct fuse_args_pages ap = {}; #if BITS_PER_LONG == 32 inarg.flags |= FUSE_IOCTL_32BIT; @@ -2607,11 +2608,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); + ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!pages || !iov_page) + if (!ap.pages || !iov_page) goto out; + fuse_page_descs_length_init(ap.descs, 0, fc->max_pages); + /* * If restricted, initialize IO parameters as encoded in @cmd. * RETRY from server is not allowed. @@ -2648,56 +2651,44 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -ENOMEM; if (max_pages > fc->max_pages) goto out; - while (num_pages < max_pages) { - pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!pages[num_pages]) + while (ap.num_pages < max_pages) { + ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!ap.pages[ap.num_pages]) goto out; - num_pages++; + ap.num_pages++; } - req = fuse_get_req(fc, num_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); - req = NULL; - goto out; - } - memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); - req->num_pages = num_pages; - fuse_page_descs_length_init(req, 0, req->num_pages); /* okay, let's send it to the client */ - req->in.h.opcode = FUSE_IOCTL; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; + ap.args.opcode = FUSE_IOCTL; + ap.args.nodeid = ff->nodeid; + ap.args.in_numargs = 1; + ap.args.in_args[0].size = sizeof(inarg); + ap.args.in_args[0].value = &inarg; if (in_size) { - req->in.numargs++; - req->in.args[1].size = in_size; - req->in.argpages = 1; + ap.args.in_numargs++; + ap.args.in_args[1].size = in_size; + ap.args.in_pages = true; err = -EFAULT; iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } } - req->out.numargs = 2; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - req->out.args[1].size = out_size; - req->out.argpages = 1; - req->out.argvar = 1; + ap.args.out_numargs = 2; + ap.args.out_args[0].size = sizeof(outarg); + ap.args.out_args[0].value = &outarg; + ap.args.out_args[1].size = out_size; + ap.args.out_pages = true; + ap.args.out_argvar = true; - fuse_request_send(fc, req); - err = req->out.h.error; - transferred = req->out.args[1].size; - fuse_put_request(fc, req); - req = NULL; - if (err) + transferred = fuse_simple_request(fc, &ap.args); + err = transferred; + if (transferred < 0) goto out; /* did it ask for retry? */ @@ -2722,7 +2713,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_atomic(pages[0]); + vaddr = kmap_atomic(ap.pages[0]); err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); @@ -2750,19 +2741,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } err = 0; out: - if (req) - fuse_put_request(fc, req); free_page((unsigned long) iov_page); - while (num_pages) - __free_page(pages[--num_pages]); - kfree(pages); + while (ap.num_pages) + __free_page(ap.pages[--ap.num_pages]); + kfree(ap.pages); return err ? err : outarg.result; } From a0d45d84f4c9d119e0eaede4778211a789457a7a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 0407/2880] fuse: fuse_short_read(): don't take fuse_req as argument This will allow the use of this function when converting to the simple api (which doesn't use fuse_req). Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1847cc53c416..8e67add0f37f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -705,10 +705,9 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, spin_unlock(&fi->lock); } -static void fuse_short_read(struct fuse_req *req, struct inode *inode, - u64 attr_ver) +static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, + struct page **pages, unsigned int num_pages) { - size_t num_read = req->out.args[0].size; struct fuse_conn *fc = get_fuse_conn(inode); if (fc->writeback_cache) { @@ -721,12 +720,12 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, int start_idx = num_read >> PAGE_SHIFT; size_t off = num_read & (PAGE_SIZE - 1); - for (i = start_idx; i < req->num_pages; i++) { - zero_user_segment(req->pages[i], off, PAGE_SIZE); + for (i = start_idx; i < num_pages; i++) { + zero_user_segment(pages[i], off, PAGE_SIZE); off = 0; } } else { - loff_t pos = page_offset(req->pages[0]) + num_read; + loff_t pos = page_offset(pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } @@ -772,7 +771,8 @@ static int fuse_do_readpage(struct file *file, struct page *page) * Short read means EOF. If file size is larger, truncate it */ if (num_read < count) - fuse_short_read(req, inode, attr_ver); + fuse_short_read(inode, attr_ver, num_read, req->pages, + req->num_pages); SetPageUptodate(page); } @@ -815,7 +815,8 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) * Short read means EOF. If file size is larger, truncate it */ if (!req->out.h.error && num_read < count) - fuse_short_read(req, inode, req->misc.read.attr_ver); + fuse_short_read(inode, req->misc.read.attr_ver, + num_read, req->pages, req->num_pages); fuse_invalidate_atime(inode); } From 00793ca5d4439a22b18c0939522c81fa8661a362 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 0408/2880] fuse: covert readpage to simple api Derive fuse_io_args from struct fuse_args_pages. This will be used for both synchronous and asynchronous read/write requests. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 80 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8e67add0f37f..d927c336683a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -552,6 +552,33 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } +struct fuse_io_args { + struct { + struct fuse_read_in in; + } read; + struct fuse_args_pages ap; +}; + +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode) +{ + struct fuse_file *ff = file->private_data; + struct fuse_args *args = &ia->ap.args; + + ia->read.in.fh = ff->fh; + ia->read.in.offset = pos; + ia->read.in.size = count; + ia->read.in.flags = file->f_flags; + args->opcode = opcode; + args->nodeid = ff->nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(ia->read.in); + args->in_args[0].value = &ia->read.in; + args->out_argvar = true; + args->out_numargs = 1; + args->out_args[0].size = count; +} + static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) { unsigned i; @@ -732,16 +759,19 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, static int fuse_do_readpage(struct file *file, struct page *page) { - struct kiocb iocb; - struct fuse_io_priv io; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - size_t num_read; loff_t pos = page_offset(page); - size_t count = PAGE_SIZE; + struct fuse_page_desc desc = { .length = PAGE_SIZE }; + struct fuse_io_args ia = { + .ap.args.page_zeroing = true, + .ap.args.out_pages = true, + .ap.num_pages = 1, + .ap.pages = &page, + .ap.descs = &desc, + }; + ssize_t res; u64 attr_ver; - int err; /* * Page writeback can extend beyond the lifetime of the @@ -750,36 +780,22 @@ static int fuse_do_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - attr_ver = fuse_get_attr_version(fc); - req->out.page_zeroing = 1; - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = count; - init_sync_kiocb(&iocb, file); - io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb); - num_read = fuse_send_read(req, &io, pos, count, NULL); - err = req->out.h.error; + fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); + res = fuse_simple_request(fc, &ia.ap.args); + if (res < 0) + return res; + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (res < desc.length) + fuse_short_read(inode, attr_ver, res, ia.ap.pages, + ia.ap.num_pages); - if (!err) { - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (num_read < count) - fuse_short_read(inode, attr_ver, num_read, req->pages, - req->num_pages); + SetPageUptodate(page); - SetPageUptodate(page); - } - - fuse_put_request(fc, req); - - return err; + return 0; } static int fuse_readpage(struct file *file, struct page *page) From 338f2e3f3341a9a844331b1bcb159e68638d5eef Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 0409/2880] fuse: convert sync write to simple api Extract a fuse_write_flags() helper that converts ki_flags relevant write to open flags. The other parts of fuse_send_write() aren't used in the fuse_perform_write() case. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 130 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d927c336683a..1dc499ad8606 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -553,9 +553,15 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, } struct fuse_io_args { - struct { - struct fuse_read_in in; - } read; + union { + struct { + struct fuse_read_in in; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + } write; + }; struct fuse_args_pages ap; }; @@ -1001,6 +1007,40 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, req->out.args[0].value = outarg; } +static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, + loff_t pos, size_t count) +{ + struct fuse_args *args = &ia->ap.args; + + ia->write.in.fh = ff->fh; + ia->write.in.offset = pos; + ia->write.in.size = count; + args->opcode = FUSE_WRITE; + args->nodeid = ff->nodeid; + args->in_numargs = 2; + if (ff->fc->minor < 9) + args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + else + args->in_args[0].size = sizeof(ia->write.in); + args->in_args[0].value = &ia->write.in; + args->in_args[1].size = count; + args->out_numargs = 1; + args->out_args[0].size = sizeof(ia->write.out); + args->out_args[0].value = &ia->write.out; +} + +static unsigned int fuse_write_flags(struct kiocb *iocb) +{ + unsigned int flags = iocb->ki_filp->f_flags; + + if (iocb->ki_flags & IOCB_DSYNC) + flags |= O_DSYNC; + if (iocb->ki_flags & IOCB_SYNC) + flags |= O_SYNC; + + return flags; +} + static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { @@ -1011,11 +1051,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, struct fuse_write_in *inarg = &req->misc.write.in; fuse_write_fill(req, ff, pos, count); - inarg->flags = file->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) - inarg->flags |= O_DSYNC; - if (iocb->ki_flags & IOCB_SYNC) - inarg->flags |= O_SYNC; + inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); @@ -1045,26 +1081,31 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) return ret; } -static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, - struct inode *inode, loff_t pos, - size_t count) +static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, + struct kiocb *iocb, struct inode *inode, + loff_t pos, size_t count) { - size_t res; - unsigned offset; - unsigned i; - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + struct fuse_args_pages *ap = &ia->ap; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + unsigned int offset, i; + int err; - for (i = 0; i < req->num_pages; i++) - fuse_wait_on_page_writeback(inode, req->pages[i]->index); + for (i = 0; i < ap->num_pages; i++) + fuse_wait_on_page_writeback(inode, ap->pages[i]->index); - res = fuse_send_write(req, &io, pos, count, NULL); + fuse_write_args_fill(ia, ff, pos, count); + ia->write.in.flags = fuse_write_flags(iocb); - offset = req->page_descs[0].offset; - count = res; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + err = fuse_simple_request(fc, &ap->args); - if (!req->out.h.error && !offset && count >= PAGE_SIZE) + offset = ap->descs[0].offset; + count = ia->write.out.size; + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (!err && !offset && count >= PAGE_SIZE) SetPageUptodate(page); if (count > PAGE_SIZE - offset) @@ -1077,20 +1118,21 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, put_page(page); } - return res; + return err; } -static ssize_t fuse_fill_write_pages(struct fuse_req *req, - struct address_space *mapping, - struct iov_iter *ii, loff_t pos) +static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos, + unsigned int max_pages) { struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; int err; - req->in.argpages = 1; - req->page_descs[0].offset = offset; + ap->args.in_pages = true; + ap->descs[0].offset = offset; do { size_t tmp; @@ -1126,9 +1168,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, } err = 0; - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = tmp; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].length = tmp; + ap->num_pages++; count += tmp; pos += tmp; @@ -1139,7 +1181,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - req->num_pages < req->max_pages && offset == 0); + ap->num_pages < max_pages && offset == 0); return count > 0 ? count : err; } @@ -1167,27 +1209,27 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); do { - struct fuse_req *req; ssize_t count; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - req = fuse_get_req(fc, nr_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); + ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->pages) { + err = -ENOMEM; break; } - count = fuse_fill_write_pages(req, mapping, ii, pos); + count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { - size_t num_written; - - num_written = fuse_send_write_pages(req, iocb, inode, - pos, count); - err = req->out.h.error; + err = fuse_send_write_pages(&ia, iocb, inode, + pos, count); if (!err) { + size_t num_written = ia.write.out.size; + res += num_written; pos += num_written; @@ -1196,7 +1238,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, err = -EIO; } } - fuse_put_request(fc, req); + kfree(ap->pages); } while (!err && iov_iter_count(ii)); if (res > 0) From 1259728731a7902c4965952c55fa16014b4fd0e7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 0410/2880] fuse: add simple background helper Create a helper named fuse_simple_background() that is similar to fuse_simple_request(). Unlike the latter, it returns immediately and calls the supplied 'end' callback when the reply is received. The supplied 'args' pointer is stored in 'fuse_req' which allows the callback to interpret the output arguments decoded from the reply. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 6 ++++++ 2 files changed, 51 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 29723e143666..a52a7380baa4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -626,6 +626,51 @@ bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) return queued; } +static void fuse_simple_end(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_args *args = req->args; + int err = req->out.h.error; + + if (!err && args->out_argvar) { + BUG_ON(args->out_numargs == 0); + args->out_args[args->out_numargs - 1].size = + req->out.args[args->out_numargs - 1].size; + } + req->args->end(fc, req->args, req->out.h.error); +} + +int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, + gfp_t gfp_flags) +{ + struct fuse_req *req; + + if (args->force) { + WARN_ON(!args->nocreds); + req = __fuse_request_alloc(0, gfp_flags); + if (!req) + return -ENOMEM; + __set_bit(FR_BACKGROUND, &req->flags); + } else { + WARN_ON(args->nocreds); + req = __fuse_get_req(fc, 0, true); + if (IS_ERR(req)) + return PTR_ERR(req); + } + + fuse_args_to_req(req, args); + + req->args = args; + req->end = fuse_simple_end; + + if (!fuse_request_queue_background(fc, req)) { + fuse_put_request(fc, req); + return -ENOTCONN; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_simple_background); + void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { WARN_ON(!req->end); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b62a3e37ea4c..8f46c5549e57 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -301,6 +301,7 @@ struct fuse_args { bool page_replace:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; + void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); }; struct fuse_args_pages { @@ -381,6 +382,9 @@ struct fuse_req { /** Entry on the interrupts list */ struct list_head intr_entry; + /* Input/output arguments */ + struct fuse_args *args; + /** refcount */ refcount_t count; @@ -948,6 +952,8 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); * Simple request sending that does request allocation and freeing */ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); +int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, + gfp_t gfp_flags); /** * Send a request in the background From 45ac96ed7c369112039289cc7c325a5682ef7c03 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 0411/2880] fuse: convert direct_io to simple api Change of semantics in fuse_async_req_send/fuse_send_(read|write): these can now return error, in which case the 'end' callback isn't called, so the fuse_io_args object needs to be freed. Added verification that the return value is sane (less than or equal to the requested read/write size). Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 227 ++++++++++++++++++++++++++++--------------------- 1 file changed, 128 insertions(+), 99 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1dc499ad8606..ee0817a3d5f1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -563,6 +563,7 @@ struct fuse_io_args { } write; }; struct fuse_args_pages ap; + struct fuse_io_priv *io; }; void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, @@ -585,15 +586,15 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) +static void fuse_release_user_pages(struct fuse_args_pages *ap, + bool should_dirty) { - unsigned i; + unsigned int i; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + for (i = 0; i < ap->num_pages; i++) { if (should_dirty) - set_page_dirty_lock(page); - put_page(page); + set_page_dirty_lock(ap->pages[i]); + put_page(ap->pages[i]); } } @@ -663,64 +664,94 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) kref_put(&io->refcnt, fuse_io_release); } -static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) +static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, + unsigned int npages) { - struct fuse_io_priv *io = req->io; - ssize_t pos = -1; + struct fuse_io_args *ia; - fuse_release_user_pages(req, io->should_dirty); - - if (io->write) { - if (req->misc.write.in.size != req->misc.write.out.size) - pos = req->misc.write.in.offset - io->offset + - req->misc.write.out.size; - } else { - if (req->misc.read.in.size != req->out.args[0].size) - pos = req->misc.read.in.offset - io->offset + - req->out.args[0].size; + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (ia) { + ia->io = io; + ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.pages) { + kfree(ia); + ia = NULL; + } } - - fuse_aio_complete(io, req->out.h.error, pos); + return ia; } -static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, - size_t num_bytes, struct fuse_io_priv *io) +static void fuse_io_free(struct fuse_io_args *ia) { + kfree(ia->ap.pages); + kfree(ia); +} + +static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, + int err) +{ + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_io_priv *io = ia->io; + ssize_t pos = -1; + + fuse_release_user_pages(&ia->ap, io->should_dirty); + + if (err) { + /* Nothing */ + } else if (io->write) { + if (ia->write.out.size > ia->write.in.size) { + err = -EIO; + } else if (ia->write.in.size != ia->write.out.size) { + pos = ia->write.in.offset - io->offset + + ia->write.out.size; + } + } else { + u32 outsize = args->out_args[0].size; + + if (ia->read.in.size != outsize) + pos = ia->read.in.offset - io->offset + outsize; + } + + fuse_aio_complete(io, err, pos); + fuse_io_free(ia); +} + +static ssize_t fuse_async_req_send(struct fuse_conn *fc, + struct fuse_io_args *ia, size_t num_bytes) +{ + ssize_t err; + struct fuse_io_priv *io = ia->io; + spin_lock(&io->lock); kref_get(&io->refcnt); io->size += num_bytes; io->reqs++; spin_unlock(&io->lock); - req->io = io; - req->end = fuse_aio_complete_req; + ia->ap.args.end = fuse_aio_complete_req; + err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); - __fuse_get_request(req); - fuse_request_send_background(fc, req); - - return num_bytes; + return err ?: num_bytes; } -static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, + fl_owner_t owner) { - struct file *file = io->iocb->ki_filp; + struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_read_fill(req, file, pos, count, FUSE_READ); + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { - struct fuse_read_in *inarg = &req->misc.read.in; - - inarg->read_flags |= FUSE_READ_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; + ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fc, ia, count); - fuse_request_send(fc, req); - return req->out.args[0].size; + return fuse_simple_request(fc, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -1041,27 +1072,31 @@ static unsigned int fuse_write_flags(struct kiocb *iocb) return flags; } -static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, + size_t count, fl_owner_t owner) { - struct kiocb *iocb = io->iocb; + struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_write_in *inarg = &ia->write.in; + ssize_t err; - fuse_write_fill(req, ff, pos, count); + fuse_write_args_fill(ia, ff, pos, count); inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fc, ia, count); - fuse_request_send(fc, req); - return req->misc.write.out.size; + err = fuse_simple_request(fc, &ia->ap.args); + if (!err && ia->write.out.size > count) + err = -EIO; + + return err ?: ia->write.out.size; } bool fuse_write_update_size(struct inode *inode, loff_t pos) @@ -1347,8 +1382,9 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, return min(iov_iter_single_seg_count(ii), max_size); } -static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, - size_t *nbytesp, int write) +static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, + size_t *nbytesp, int write, + unsigned int max_pages) { size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; @@ -1359,21 +1395,21 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t frag_size = fuse_get_frag_size(ii, *nbytesp); if (write) - req->in.args[1].value = (void *) user_addr; + ap->args.in_args[1].value = (void *) user_addr; else - req->out.args[0].value = (void *) user_addr; + ap->args.out_args[0].value = (void *) user_addr; iov_iter_advance(ii, frag_size); *nbytesp = frag_size; return 0; } - while (nbytes < *nbytesp && req->num_pages < req->max_pages) { + while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], + ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages], *nbytesp - nbytes, - req->max_pages - req->num_pages, + max_pages - ap->num_pages, &start); if (ret < 0) break; @@ -1384,19 +1420,18 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, ret += start; npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; - req->page_descs[req->num_pages].offset = start; - fuse_page_descs_length_init(req->page_descs, req->num_pages, - npages); + ap->descs[ap->num_pages].offset = start; + fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); - req->num_pages += npages; - req->page_descs[req->num_pages - 1].length -= + ap->num_pages += npages; + ap->descs[ap->num_pages - 1].length -= (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } if (write) - req->in.argpages = 1; + ap->args.in_pages = 1; else - req->out.argpages = 1; + ap->args.out_pages = 1; *nbytesp = nbytes; @@ -1418,17 +1453,16 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, pgoff_t idx_from = pos >> PAGE_SHIFT; pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; ssize_t res = 0; - struct fuse_req *req; int err = 0; + struct fuse_io_args *ia; + unsigned int max_pages; - if (io->async) - req = fuse_get_req_for_background(fc, iov_iter_npages(iter, - fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages)); - if (IS_ERR(req)) - return PTR_ERR(req); + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) + return -ENOMEM; + ia->io = io; if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); @@ -1439,54 +1473,49 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, io->should_dirty = !write && iter_is_iovec(iter); while (count) { - size_t nres; + ssize_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - err = fuse_get_user_pages(req, iter, &nbytes, write); + + err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, + max_pages); if (err && !nbytes) break; if (write) { - if (!capable(CAP_FSETID)) { - struct fuse_write_in *inarg; + if (!capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV; - inarg = &req->misc.write.in; - inarg->write_flags |= FUSE_WRITE_KILL_PRIV; - } - nres = fuse_send_write(req, io, pos, nbytes, owner); + nres = fuse_send_write(ia, pos, nbytes, owner); } else { - nres = fuse_send_read(req, io, pos, nbytes, owner); + nres = fuse_send_read(ia, pos, nbytes, owner); } - if (!io->async) - fuse_release_user_pages(req, io->should_dirty); - if (req->out.h.error) { - err = req->out.h.error; - break; - } else if (nres > nbytes) { - res = 0; - err = -EIO; + if (!io->async || nres < 0) { + fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_io_free(ia); + } + ia = NULL; + if (nres < 0) { + err = nres; break; } + WARN_ON(nres > nbytes); + count -= nres; res += nres; pos += nres; if (nres != nbytes) break; if (count) { - fuse_put_request(fc, req); - if (io->async) - req = fuse_get_req_for_background(fc, - iov_iter_npages(iter, fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, - fc->max_pages)); - if (IS_ERR(req)) + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) break; } } - if (!IS_ERR(req)) - fuse_put_request(fc, req); + if (ia) + fuse_io_free(ia); if (res > 0) *ppos = pos; From 134831e36bbdd6ce6c3ef623ddf6d9238f244ed3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 0412/2880] fuse: convert readpages to simple api Need to extend fuse_io_args with 'attr_ver' and 'ff' members, that take the functionality of the same named members in fuse_req. fuse_short_read() can now take struct fuse_args_pages. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 137 +++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 66 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ee0817a3d5f1..ac2323443c09 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -556,6 +556,7 @@ struct fuse_io_args { union { struct { struct fuse_read_in in; + u64 attr_ver; } read; struct { struct fuse_write_in in; @@ -564,6 +565,7 @@ struct fuse_io_args { }; struct fuse_args_pages ap; struct fuse_io_priv *io; + struct fuse_file *ff; }; void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, @@ -770,7 +772,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, } static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, - struct page **pages, unsigned int num_pages) + struct fuse_args_pages *ap) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -784,12 +786,12 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, int start_idx = num_read >> PAGE_SHIFT; size_t off = num_read & (PAGE_SIZE - 1); - for (i = start_idx; i < num_pages; i++) { - zero_user_segment(pages[i], off, PAGE_SIZE); + for (i = start_idx; i < ap->num_pages; i++) { + zero_user_segment(ap->pages[i], off, PAGE_SIZE); off = 0; } } else { - loff_t pos = page_offset(pages[0]) + num_read; + loff_t pos = page_offset(ap->pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } @@ -827,8 +829,7 @@ static int fuse_do_readpage(struct file *file, struct page *page) * Short read means EOF. If file size is larger, truncate it */ if (res < desc.length) - fuse_short_read(inode, attr_ver, res, ia.ap.pages, - ia.ap.num_pages); + fuse_short_read(inode, attr_ver, res, &ia.ap); SetPageUptodate(page); @@ -851,15 +852,18 @@ static int fuse_readpage(struct file *file, struct page *page) return err; } -static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, + int err) { int i; - size_t count = req->misc.read.in.size; - size_t num_read = req->out.args[0].size; + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; + size_t count = ia->read.in.size; + size_t num_read = args->out_args[0].size; struct address_space *mapping = NULL; - for (i = 0; mapping == NULL && i < req->num_pages; i++) - mapping = req->pages[i]->mapping; + for (i = 0; mapping == NULL && i < ap->num_pages; i++) + mapping = ap->pages[i]->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -867,94 +871,97 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) /* * Short read means EOF. If file size is larger, truncate it */ - if (!req->out.h.error && num_read < count) - fuse_short_read(inode, req->misc.read.attr_ver, - num_read, req->pages, req->num_pages); + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); fuse_invalidate_atime(inode); } - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; - if (!req->out.h.error) + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (!err) SetPageUptodate(page); else SetPageError(page); unlock_page(page); put_page(page); } - if (req->ff) - fuse_file_put(req->ff, false, false); + if (ia->ff) + fuse_file_put(ia->ff, false, false); + + fuse_io_free(ia); } -static void fuse_send_readpages(struct fuse_req *req, struct file *file) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - loff_t pos = page_offset(req->pages[0]); - size_t count = req->num_pages << PAGE_SHIFT; + struct fuse_args_pages *ap = &ia->ap; + loff_t pos = page_offset(ap->pages[0]); + size_t count = ap->num_pages << PAGE_SHIFT; + int err; - req->out.argpages = 1; - req->out.page_zeroing = 1; - req->out.page_replace = 1; - fuse_read_fill(req, file, pos, count, FUSE_READ); - req->misc.read.attr_ver = fuse_get_attr_version(fc); + ap->args.out_pages = true; + ap->args.page_zeroing = true; + ap->args.page_replace = true; + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); + ia->read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { - req->ff = fuse_file_get(ff); - req->end = fuse_readpages_end; - fuse_request_send_background(fc, req); + ia->ff = fuse_file_get(ff); + ap->args.end = fuse_readpages_end; + err = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + if (!err) + return; } else { - fuse_request_send(fc, req); - fuse_readpages_end(fc, req); - fuse_put_request(fc, req); + err = fuse_simple_request(fc, &ap->args); } + fuse_readpages_end(fc, &ap->args, err); } struct fuse_fill_data { - struct fuse_req *req; + struct fuse_io_args *ia; struct file *file; struct inode *inode; - unsigned nr_pages; + unsigned int nr_pages; + unsigned int max_pages; }; static int fuse_readpages_fill(void *_data, struct page *page) { struct fuse_fill_data *data = _data; - struct fuse_req *req = data->req; + struct fuse_io_args *ia = data->ia; + struct fuse_args_pages *ap = &ia->ap; struct inode *inode = data->inode; struct fuse_conn *fc = get_fuse_conn(inode); fuse_wait_on_page_writeback(inode, page->index); - if (req->num_pages && - (req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_read || - req->pages[req->num_pages - 1]->index + 1 != page->index)) { - unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(req, data->file); - if (fc->async_read) - req = fuse_get_req_for_background(fc, nr_alloc); - else - req = fuse_get_req(fc, nr_alloc); - - data->req = req; - if (IS_ERR(req)) { + if (ap->num_pages && + (ap->num_pages == fc->max_pages || + (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || + ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { + data->max_pages = min_t(unsigned int, data->nr_pages, + fc->max_pages); + fuse_send_readpages(ia, data->file); + data->ia = ia = fuse_io_alloc(NULL, data->max_pages); + if (!ia) { unlock_page(page); - return PTR_ERR(req); + return -ENOMEM; } + ap = &ia->ap; } - if (WARN_ON(req->num_pages >= req->max_pages)) { + if (WARN_ON(ap->num_pages >= data->max_pages)) { unlock_page(page); - fuse_put_request(fc, req); + fuse_io_free(ia); return -EIO; } get_page(page); - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = PAGE_SIZE; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].length = PAGE_SIZE; + ap->num_pages++; data->nr_pages--; return 0; } @@ -966,7 +973,6 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; - unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages); err = -EIO; if (is_bad_inode(inode)) @@ -974,21 +980,20 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - if (fc->async_read) - data.req = fuse_get_req_for_background(fc, nr_alloc); - else - data.req = fuse_get_req(fc, nr_alloc); data.nr_pages = nr_pages; - err = PTR_ERR(data.req); - if (IS_ERR(data.req)) + data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); +; + data.ia = fuse_io_alloc(NULL, data.max_pages); + err = -ENOMEM; + if (!data.ia) goto out; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { - if (data.req->num_pages) - fuse_send_readpages(data.req, file); + if (data.ia->ap.num_pages) + fuse_send_readpages(data.ia, file); else - fuse_put_request(fc, data.req); + fuse_io_free(data.ia); } out: return err; From 43f5098eb82b1dbf3988cab0a26e729e88a004fc Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 0413/2880] fuse: convert readdir to simple api The old fuse_read_fill() helper can be deleted, now that the last user is gone. The fuse_io_args struct is moved to fuse_i.h so it can be shared between readdir/read code. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 36 ------------------------------------ fs/fuse/fuse_i.h | 23 ++++++++++++++++++++--- fs/fuse/readdir.c | 47 ++++++++++++++++++++--------------------------- 3 files changed, 40 insertions(+), 66 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ac2323443c09..3cb98ff267cf 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -532,42 +532,6 @@ out: return err; } -void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, - size_t count, int opcode) -{ - struct fuse_read_in *inarg = &req->misc.read.in; - struct fuse_file *ff = file->private_data; - - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - inarg->flags = file->f_flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_read_in); - req->in.args[0].value = inarg; - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = count; -} - -struct fuse_io_args { - union { - struct { - struct fuse_read_in in; - u64 attr_ver; - } read; - struct { - struct fuse_write_in in; - struct fuse_write_out out; - } write; - }; - struct fuse_args_pages ap; - struct fuse_io_priv *io; - struct fuse_file *ff; -}; - void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8f46c5549e57..4d7cd20967c2 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -830,11 +830,28 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -/** +/* * Initialize READ or READDIR request */ -void fuse_read_fill(struct fuse_req *req, struct file *file, - loff_t pos, size_t count, int opcode); +struct fuse_io_args { + union { + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + } write; + }; + struct fuse_args_pages ap; + struct fuse_io_priv *io; + struct fuse_file *ff; +}; + +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode); + /** * Send OPEN or OPENDIR request diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 56a19faa855d..fae025db06f4 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -316,62 +316,55 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { - int plus, err; - size_t nbytes; + int plus; + ssize_t res; struct page *page; struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; + struct fuse_page_desc desc = { .length = PAGE_SIZE }; u64 attr_version = 0; bool locked; - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - page = alloc_page(GFP_KERNEL); - if (!page) { - fuse_put_request(fc, req); + if (!page) return -ENOMEM; - } plus = fuse_use_readdirplus(inode, ctx); - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE; + ap->args.out_pages = 1; + ap->num_pages = 1; + ap->pages = &page; + ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); } else { - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); } locked = fuse_lock_inode(inode); - fuse_request_send(fc, req); + res = fuse_simple_request(fc, &ap->args); fuse_unlock_inode(inode, locked); - nbytes = req->out.args[0].size; - err = req->out.h.error; - fuse_put_request(fc, req); - if (!err) { - if (!nbytes) { + if (res >= 0) { + if (!res) { struct fuse_file *ff = file->private_data; if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - err = parse_dirplusfile(page_address(page), nbytes, + res = parse_dirplusfile(page_address(page), res, file, ctx, attr_version); } else { - err = parse_dirfile(page_address(page), nbytes, file, + res = parse_dirfile(page_address(page), res, file, ctx); } } __free_page(page); fuse_invalidate_atime(inode); - return err; + return res; } enum fuse_parse_result { From 33826ebbbe4b45ccecf2f5a08b3457f5d59c6282 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 0414/2880] fuse: convert writepages to simple api Derive fuse_writepage_args from fuse_io_args. Sending the request is tricky since it was done with fi->lock held, hence we must either use atomic allocation or release the lock. Both are possible so try atomic first and if it fails, release the lock and do the regular allocation with GFP_NOFS and __GFP_NOFAIL. Both flags are necessary for correct operation. Move the page realloc function from dev.c to file.c and convert to using fuse_writepage_args. The last caller of fuse_write_fill() is gone, so get rid of it. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 27 ---- fs/fuse/file.c | 361 +++++++++++++++++++++++++++-------------------- fs/fuse/fuse_i.h | 3 - 3 files changed, 206 insertions(+), 185 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a52a7380baa4..d87accc9df9d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -96,33 +96,6 @@ static void fuse_req_pages_free(struct fuse_req *req) kfree(req->pages); } -bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, - gfp_t flags) -{ - struct page **pages; - struct fuse_page_desc *page_descs; - unsigned int npages = min_t(unsigned int, - max_t(unsigned int, req->max_pages * 2, - FUSE_DEFAULT_MAX_PAGES_PER_REQ), - fc->max_pages); - WARN_ON(npages <= req->max_pages); - - pages = fuse_pages_alloc(npages, flags, &page_descs); - if (!pages) - return false; - - memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages); - memcpy(page_descs, req->page_descs, - sizeof(struct fuse_page_desc) * req->max_pages); - fuse_req_pages_free(req); - __set_bit(FR_ALLOC_PAGES, &req->flags); - req->pages = pages; - req->page_descs = page_descs; - req->max_pages = npages; - - return true; -} - void fuse_request_free(struct fuse_req *req) { fuse_req_pages_free(req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3cb98ff267cf..399b89b29bb4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -347,19 +347,27 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } -static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi, +struct fuse_writepage_args { + struct fuse_io_args ia; + struct list_head writepages_entry; + struct list_head queue_entry; + struct fuse_writepage_args *next; + struct inode *inode; +}; + +static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_req *req; + struct fuse_writepage_args *wpa; - list_for_each_entry(req, &fi->writepages, writepages_entry) { + list_for_each_entry(wpa, &fi->writepages, writepages_entry) { pgoff_t curr_index; - WARN_ON(get_fuse_inode(req->inode) != fi); - curr_index = req->misc.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + req->num_pages && + WARN_ON(get_fuse_inode(wpa->inode) != fi); + curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; + if (idx_from < curr_index + wpa->ia.ap.num_pages && curr_index <= idx_to) { - return req; + return wpa; } } return NULL; @@ -984,29 +992,6 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } -static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, - loff_t pos, size_t count) -{ - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_write_out *outarg = &req->misc.write.out; - - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - req->in.h.opcode = FUSE_WRITE; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 2; - if (ff->fc->minor < 9) - req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; - else - req->in.args[0].size = sizeof(struct fuse_write_in); - req->in.args[0].value = inarg; - req->in.args[1].size = count; - req->out.numargs = 1; - req->out.args[0].size = sizeof(struct fuse_write_out); - req->out.args[0].value = outarg; -} - static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, loff_t pos, size_t count) { @@ -1576,45 +1561,53 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return fuse_direct_write_iter(iocb, from); } -static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_free(struct fuse_writepage_args *wpa) { + struct fuse_args_pages *ap = &wpa->ia.ap; int i; - for (i = 0; i < req->num_pages; i++) - __free_page(req->pages[i]); + for (i = 0; i < ap->num_pages; i++) + __free_page(ap->pages[i]); - if (req->ff) - fuse_file_put(req->ff, false, false); + if (wpa->ia.ff) + fuse_file_put(wpa->ia.ff, false, false); + + kfree(ap->pages); + kfree(wpa); } -static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_finish(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) { - struct inode *inode = req->inode; + struct fuse_args_pages *ap = &wpa->ia.ap; + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&req->writepages_entry); - for (i = 0; i < req->num_pages; i++) { + list_del(&wpa->writepages_entry); + for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP); + dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); } /* Called under fi->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, - loff_t size) +static void fuse_send_writepage(struct fuse_conn *fc, + struct fuse_writepage_args *wpa, loff_t size) __releases(fi->lock) __acquires(fi->lock) { - struct fuse_req *aux, *next; - struct fuse_inode *fi = get_fuse_inode(req->inode); - struct fuse_write_in *inarg = &req->misc.write.in; - __u64 data_size = req->num_pages * PAGE_SIZE; - bool queued; + struct fuse_writepage_args *aux, *next; + struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_args *args = &wpa->ia.ap.args; + __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + int err; + fi->writectr++; if (inarg->offset + data_size <= size) { inarg->size = data_size; } else if (inarg->offset < size) { @@ -1624,29 +1617,36 @@ __acquires(fi->lock) goto out_free; } - req->in.args[1].size = inarg->size; - queued = fuse_request_queue_background(fc, req); + args->in_args[1].size = inarg->size; + args->force = true; + args->nocreds = true; + + err = fuse_simple_background(fc, args, GFP_ATOMIC); + if (err == -ENOMEM) { + spin_unlock(&fi->lock); + err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL); + spin_lock(&fi->lock); + } + /* Fails on broken connection only */ - if (unlikely(!queued)) + if (unlikely(err)) goto out_free; - fi->writectr++; return; out_free: - fuse_writepage_finish(fc, req); + fi->writectr--; + fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); /* After fuse_writepage_finish() aux request list is private */ - for (aux = req->misc.write.next; aux; aux = next) { - next = aux->misc.write.next; - aux->misc.write.next = NULL; - fuse_writepage_free(fc, aux); - fuse_put_request(fc, aux); + for (aux = wpa->next; aux; aux = next) { + next = aux->next; + aux->next = NULL; + fuse_writepage_free(aux); } - fuse_writepage_free(fc, req); - fuse_put_request(fc, req); + fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1663,29 +1663,34 @@ __acquires(fi->lock) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t crop = i_size_read(inode); - struct fuse_req *req; + struct fuse_writepage_args *wpa; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { - req = list_entry(fi->queued_writes.next, struct fuse_req, list); - list_del_init(&req->list); - fuse_send_writepage(fc, req, crop); + wpa = list_entry(fi->queued_writes.next, + struct fuse_writepage_args, queue_entry); + list_del_init(&wpa->queue_entry); + fuse_send_writepage(fc, wpa, crop); } } -static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - struct inode *inode = req->inode; + struct fuse_writepage_args *wpa = + container_of(args, typeof(*wpa), ia.ap.args); + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - mapping_set_error(inode->i_mapping, req->out.h.error); + mapping_set_error(inode->i_mapping, error); spin_lock(&fi->lock); - while (req->misc.write.next) { + while (wpa->next) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_req *next = req->misc.write.next; - req->misc.write.next = next->misc.write.next; - next->misc.write.next = NULL; - next->ff = fuse_file_get(req->ff); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_writepage_args *next = wpa->next; + + wpa->next = next->next; + next->next = NULL; + next->ia.ff = fuse_file_get(wpa->ia.ff); list_add(&next->writepages_entry, &fi->writepages); /* @@ -1714,9 +1719,9 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_send_writepage(fc, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fc, req); + fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); - fuse_writepage_free(fc, req); + fuse_writepage_free(wpa); } static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, @@ -1758,52 +1763,71 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } +static struct fuse_writepage_args *fuse_writepage_args_alloc(void) +{ + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; + + wpa = kzalloc(sizeof(*wpa), GFP_NOFS); + if (wpa) { + ap = &wpa->ia.ap; + ap->num_pages = 0; + ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->pages) { + kfree(wpa); + wpa = NULL; + } + } + return wpa; + +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; struct page *tmp_page; int error = -ENOMEM; set_page_writeback(page); - req = fuse_request_alloc_nofs(1); - if (!req) + wpa = fuse_writepage_args_alloc(); + if (!wpa) goto err; + ap = &wpa->ia.ap; - /* writeback always goes to bg_queue */ - __set_bit(FR_BACKGROUND, &req->flags); tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; error = -EIO; - req->ff = fuse_write_file_get(fc, fi); - if (!req->ff) + wpa->ia.ff = fuse_write_file_get(fc, fi); + if (!wpa->ia.ff) goto err_nofile; - fuse_write_fill(req, req->ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); copy_highpage(tmp_page, page); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - req->num_pages = 1; - req->pages[0] = tmp_page; - req->page_descs[0].offset = 0; - req->page_descs[0].length = PAGE_SIZE; - req->end = fuse_writepage_end; - req->inode = inode; + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->num_pages = 1; + ap->pages[0] = tmp_page; + ap->descs[0].offset = 0; + ap->descs[0].length = PAGE_SIZE; + ap->args.end = fuse_writepage_end; + wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - list_add(&req->writepages_entry, &fi->writepages); - list_add_tail(&req->list, &fi->queued_writes); + list_add(&wpa->writepages_entry, &fi->writepages); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1814,7 +1838,7 @@ static int fuse_writepage_locked(struct page *page) err_nofile: __free_page(tmp_page); err_free: - fuse_request_free(req); + kfree(wpa); err: mapping_set_error(page->mapping, error); end_page_writeback(page); @@ -1844,23 +1868,50 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) } struct fuse_fill_wb_data { - struct fuse_req *req; + struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; struct page **orig_pages; + unsigned int max_pages; }; +static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) +{ + struct fuse_args_pages *ap = &data->wpa->ia.ap; + struct fuse_conn *fc = get_fuse_conn(data->inode); + struct page **pages; + struct fuse_page_desc *descs; + unsigned int npages = min_t(unsigned int, + max_t(unsigned int, data->max_pages * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + fc->max_pages); + WARN_ON(npages <= data->max_pages); + + pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); + if (!pages) + return false; + + memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); + memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); + kfree(ap->pages); + ap->pages = pages; + ap->descs = descs; + data->max_pages = npages; + + return true; +} + static void fuse_writepages_send(struct fuse_fill_wb_data *data) { - struct fuse_req *req = data->req; + struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = req->num_pages; + int num_pages = wpa->ia.ap.num_pages; int i; - req->ff = fuse_file_get(data->ff); + wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); - list_add_tail(&req->list, &fi->queued_writes); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1875,54 +1926,52 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) * this new request onto the auxiliary list, otherwise reuse the existing one by * copying the new page contents over to the old temporary page. */ -static bool fuse_writepage_in_flight(struct fuse_req *new_req, +static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, struct page *page) { - struct fuse_conn *fc = get_fuse_conn(new_req->inode); - struct fuse_inode *fi = get_fuse_inode(new_req->inode); - struct fuse_req *tmp; - struct fuse_req *old_req; + struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); + struct fuse_writepage_args *tmp; + struct fuse_writepage_args *old_wpa; + struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - WARN_ON(new_req->num_pages != 0); + WARN_ON(new_ap->num_pages != 0); spin_lock(&fi->lock); - list_del(&new_req->writepages_entry); - old_req = fuse_find_writeback(fi, page->index, page->index); - if (!old_req) { - list_add(&new_req->writepages_entry, &fi->writepages); + list_del(&new_wpa->writepages_entry); + old_wpa = fuse_find_writeback(fi, page->index, page->index); + if (!old_wpa) { + list_add(&new_wpa->writepages_entry, &fi->writepages); spin_unlock(&fi->lock); return false; } - new_req->num_pages = 1; - for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) { + new_ap->num_pages = 1; + for (tmp = old_wpa->next; tmp; tmp = tmp->next) { pgoff_t curr_index; - WARN_ON(tmp->inode != new_req->inode); - curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; + WARN_ON(tmp->inode != new_wpa->inode); + curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; if (curr_index == page->index) { - WARN_ON(tmp->num_pages != 1); - WARN_ON(!test_bit(FR_PENDING, &tmp->flags)); - swap(tmp->pages[0], new_req->pages[0]); + WARN_ON(tmp->ia.ap.num_pages != 1); + swap(tmp->ia.ap.pages[0], new_ap->pages[0]); break; } } if (!tmp) { - new_req->misc.write.next = old_req->misc.write.next; - old_req->misc.write.next = new_req; + new_wpa->next = old_wpa->next; + old_wpa->next = new_wpa; } spin_unlock(&fi->lock); if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_req->inode); + struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); + dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); - fuse_writepage_free(fc, new_req); - fuse_request_free(new_req); + fuse_writepage_free(new_wpa); } return true; @@ -1932,7 +1981,8 @@ static int fuse_writepages_fill(struct page *page, struct writeback_control *wbc, void *_data) { struct fuse_fill_wb_data *data = _data; - struct fuse_req *req = data->req; + struct fuse_writepage_args *wpa = data->wpa; + struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1955,16 +2005,16 @@ static int fuse_writepages_fill(struct page *page, */ is_writeback = fuse_page_is_writeback(inode, page->index); - if (req && req->num_pages && - (is_writeback || req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_write || - data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { + if (wpa && ap->num_pages && + (is_writeback || ap->num_pages == fc->max_pages || + (ap->num_pages + 1) * PAGE_SIZE > fc->max_write || + data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); - data->req = NULL; - } else if (req && req->num_pages == req->max_pages) { - if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { + data->wpa = NULL; + } else if (wpa && ap->num_pages == data->max_pages) { + if (!fuse_pages_realloc(data)) { fuse_writepages_send(data); - req = data->req = NULL; + data->wpa = NULL; } } @@ -1982,59 +2032,60 @@ static int fuse_writepages_fill(struct page *page, * This is ensured by holding the page lock in page_mkwrite() while * checking fuse_page_is_writeback(). We already hold the page lock * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment req->num_pages. + * request to the fi->writepages list and increment ap->num_pages. * After this fuse_page_is_writeback() will indicate that the page is * under writeback, so we can release the page lock. */ - if (data->req == NULL) { + if (data->wpa == NULL) { struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); - if (!req) { + wpa = fuse_writepage_args_alloc(); + if (!wpa) { __free_page(tmp_page); goto out_unlock; } + data->max_pages = 1; - fuse_write_fill(req, data->ff, page_offset(page), 0); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - __set_bit(FR_BACKGROUND, &req->flags); - req->num_pages = 0; - req->end = fuse_writepage_end; - req->inode = inode; + ap = &wpa->ia.ap; + fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; + ap->num_pages = 0; + wpa->inode = inode; spin_lock(&fi->lock); - list_add(&req->writepages_entry, &fi->writepages); + list_add(&wpa->writepages_entry, &fi->writepages); spin_unlock(&fi->lock); - data->req = req; + data->wpa = wpa; } set_page_writeback(page); copy_highpage(tmp_page, page); - req->pages[req->num_pages] = tmp_page; - req->page_descs[req->num_pages].offset = 0; - req->page_descs[req->num_pages].length = PAGE_SIZE; + ap->pages[ap->num_pages] = tmp_page; + ap->descs[ap->num_pages].offset = 0; + ap->descs[ap->num_pages].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; - if (is_writeback && fuse_writepage_in_flight(req, page)) { + if (is_writeback && fuse_writepage_in_flight(wpa, page)) { end_page_writeback(page); - data->req = NULL; + data->wpa = NULL; goto out_unlock; } - data->orig_pages[req->num_pages] = page; + data->orig_pages[ap->num_pages] = page; /* * Protected by fi->lock against concurrent access by * fuse_page_is_writeback(). */ spin_lock(&fi->lock); - req->num_pages++; + ap->num_pages++; spin_unlock(&fi->lock); out_unlock: @@ -2056,7 +2107,7 @@ static int fuse_writepages(struct address_space *mapping, goto out; data.inode = inode; - data.req = NULL; + data.wpa = NULL; data.ff = NULL; err = -ENOMEM; @@ -2067,9 +2118,9 @@ static int fuse_writepages(struct address_space *mapping, goto out; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); - if (data.req) { + if (data.wpa) { /* Ignore errors if we can write at least one page */ - BUG_ON(!data.req->num_pages); + WARN_ON(!data.wpa->ia.ap.num_pages); fuse_writepages_send(&data); err = 0; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4d7cd20967c2..a9d707bf2e69 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -932,9 +932,6 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages); struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, struct fuse_page_desc **desc); -bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, - gfp_t flags); - /** * Free a request From 615047eff10886bb5ef8e97232b431348bad06d1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 0415/2880] fuse: convert init to simple api Bypass the fc->initialized check by setting the force flag. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 67 ++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 127984642a71..2cec25ad54b7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -874,11 +874,19 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) spin_unlock(&fc->bg_lock); } -static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) -{ - struct fuse_init_out *arg = &req->misc.init_out; +struct fuse_init_args { + struct fuse_args args; + struct fuse_init_in in; + struct fuse_init_out out; +}; - if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) +static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, + int error) +{ + struct fuse_init_args *ia = container_of(args, typeof(*ia), args); + struct fuse_init_out *arg = &ia->out; + + if (error || arg->major != FUSE_KERNEL_VERSION) fc->conn_error = 1; else { unsigned long ra_pages; @@ -955,18 +963,23 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } + kfree(ia); + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_send_init(struct fuse_conn *fc) { - struct fuse_init_in *arg = &req->misc.init_in; + struct fuse_init_args *ia; - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; - arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); + + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; + ia->in.flags |= + FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | @@ -975,19 +988,23 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; - req->in.h.opcode = FUSE_INIT; - req->in.numargs = 1; - req->in.args[0].size = sizeof(*arg); - req->in.args[0].value = arg; - req->out.numargs = 1; + ia->args.opcode = FUSE_INIT; + ia->args.in_numargs = 1; + ia->args.in_args[0].size = sizeof(ia->in); + ia->args.in_args[0].value = &ia->in; + ia->args.out_numargs = 1; /* Variable length argument used for backward compatibility with interface version < 7.5. Rest of init_out is zeroed by do_get_request(), so a short reply is not a problem */ - req->out.argvar = 1; - req->out.args[0].size = sizeof(struct fuse_init_out); - req->out.args[0].value = &req->misc.init_out; - req->end = process_init_reply; - fuse_request_send_background(fc, req); + ia->args.out_argvar = 1; + ia->args.out_args[0].size = sizeof(ia->out); + ia->args.out_args[0].value = &ia->out; + ia->args.force = true; + ia->args.nocreds = true; + ia->args.end = process_init_reply; + + if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) + process_init_reply(fc, &ia->args, -ENOTCONN); } static void fuse_free_conn(struct fuse_conn *fc) @@ -1087,7 +1104,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) struct inode *root; struct file *file; struct dentry *root_dentry; - struct fuse_req *init_req; int err; int is_bdev = sb->s_bdev != NULL; @@ -1182,11 +1198,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) /* Root dentry doesn't have .d_revalidate */ sb->s_d_op = &fuse_dentry_operations; - init_req = fuse_request_alloc(0); - if (!init_req) - goto err_put_root; - __set_bit(FR_BACKGROUND, &init_req->flags); - mutex_lock(&fuse_mutex); err = -EINVAL; if (file->private_data) @@ -1207,14 +1218,12 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) */ fput(file); - fuse_send_init(fc, init_req); + fuse_send_init(fc); return 0; err_unlock: mutex_unlock(&fuse_mutex); - fuse_request_free(init_req); - err_put_root: dput(root_dentry); err_dev_free: fuse_dev_free(fud); From b50ef7c52ad7c954b743cd1cb181e9bf03537bab Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 0416/2880] cuse: convert init to simple api This is a straightforward conversion. Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 91 ++++++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index d011a1ad1d4f..2332e7f960a8 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -298,6 +298,14 @@ static void cuse_gendev_release(struct device *dev) kfree(dev); } +struct cuse_init_args { + struct fuse_args_pages ap; + struct cuse_init_in in; + struct cuse_init_out out; + struct page *page; + struct fuse_page_desc desc; +}; + /** * cuse_process_init_reply - finish initializing CUSE channel * @@ -305,21 +313,22 @@ static void cuse_gendev_release(struct device *dev) * required data structures for it. Please read the comment at the * top of this file for high level overview. */ -static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +static void cuse_process_init_reply(struct fuse_conn *fc, + struct fuse_args *args, int error) { + struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; - struct cuse_init_out *arg = req->out.args[0].value; - struct page *page = req->pages[0]; + struct cuse_init_out *arg = &ia->out; + struct page *page = ap->pages[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; dev_t devt; int rc, i; - if (req->out.h.error || - arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { + if (error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) goto err; - } fc->minor = arg->minor; fc->max_read = max_t(unsigned, arg->max_read, 4096); @@ -328,7 +337,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) /* parse init reply */ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; - rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, + rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size, &devinfo); if (rc) goto err; @@ -395,7 +404,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); out: - kfree(arg); + kfree(ia); __free_page(page); return; @@ -414,55 +423,49 @@ err: static int cuse_send_init(struct cuse_conn *cc) { int rc; - struct fuse_req *req; struct page *page; struct fuse_conn *fc = &cc->fc; - struct cuse_init_in *arg; - void *outarg; + struct cuse_init_args *ia; + struct fuse_args_pages *ap; BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req_for_background(fc, 1); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - goto err; - } - rc = -ENOMEM; page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) - goto err_put_req; + goto err; - outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); - if (!outarg) + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (!ia) goto err_free_page; - arg = &req->misc.cuse_init_in; - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->flags |= CUSE_UNRESTRICTED_IOCTL; - req->in.h.opcode = CUSE_INIT; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct cuse_init_in); - req->in.args[0].value = arg; - req->out.numargs = 2; - req->out.args[0].size = sizeof(struct cuse_init_out); - req->out.args[0].value = outarg; - req->out.args[1].size = CUSE_INIT_INFO_MAX; - req->out.argvar = 1; - req->out.argpages = 1; - req->pages[0] = page; - req->page_descs[0].length = req->out.args[1].size; - req->num_pages = 1; - req->end = cuse_process_init_reply; - fuse_request_send_background(fc, req); - - return 0; + ap = &ia->ap; + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.flags |= CUSE_UNRESTRICTED_IOCTL; + ap->args.opcode = CUSE_INIT; + ap->args.in_numargs = 1; + ap->args.in_args[0].size = sizeof(ia->in); + ap->args.in_args[0].value = &ia->in; + ap->args.out_numargs = 2; + ap->args.out_args[0].size = sizeof(ia->out); + ap->args.out_args[0].value = &ia->out; + ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; + ap->args.out_argvar = 1; + ap->args.out_pages = 1; + ap->num_pages = 1; + ap->pages = &ia->page; + ap->descs = &ia->desc; + ia->page = page; + ia->desc.length = ap->args.out_args[1].size; + ap->args.end = cuse_process_init_reply; + rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + if (rc) { + kfree(ia); err_free_page: - __free_page(page); -err_put_req: - fuse_put_request(fc, req); + __free_page(page); + } err: return rc; } From 4cb548666e4c13699904a063f6909b169d3f900c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 0417/2880] fuse: convert release to simple api Since we cannot reserve the request structure up-front, make sure that the request allocation doesn't fail using __GFP_NOFAIL. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 75 +++++++++++++++++++++++++----------------------- fs/fuse/fuse_i.h | 8 ++---- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 399b89b29bb4..56ed45ef3058 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -53,6 +53,12 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, return fuse_simple_request(fc, &args); } +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; @@ -62,8 +68,8 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(0); - if (unlikely(!ff->reserved_req)) { + ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL); + if (!ff->release_args) { kfree(ff); return NULL; } @@ -81,7 +87,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) void fuse_file_free(struct fuse_file *ff) { - fuse_request_free(ff->reserved_req); + kfree(ff->release_args); mutex_destroy(&ff->readdir.lock); kfree(ff); } @@ -92,34 +98,31 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - iput(req->misc.release.inode); + struct fuse_release_args *ra = container_of(args, typeof(*ra), args); + + iput(ra->inode); + kfree(ra); } static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_req *req = ff->reserved_req; + struct fuse_args *args = &ff->release_args->args; if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { - /* - * Drop the release request when client does not - * implement 'open' - */ - __clear_bit(FR_BACKGROUND, &req->flags); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + /* Do nothing when client does not implement 'open' */ + fuse_release_end(ff->fc, args, 0); } else if (sync) { - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(ff->fc, req); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + fuse_simple_request(ff->fc, args); + fuse_release_end(ff->fc, args, 0); } else { - req->end = fuse_release_end; - __set_bit(FR_BACKGROUND, &req->flags); - fuse_request_send_background(ff->fc, req); + args->end = fuse_release_end; + if (fuse_simple_background(ff->fc, args, + GFP_KERNEL | __GFP_NOFAIL)) + fuse_release_end(ff->fc, args, -ENOTCONN); } kfree(ff); } @@ -239,8 +242,7 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, int flags, int opcode) { struct fuse_conn *fc = ff->fc; - struct fuse_req *req = ff->reserved_req; - struct fuse_release_in *inarg = &req->misc.release.in; + struct fuse_release_args *ra = ff->release_args; /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -255,32 +257,33 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); - inarg->fh = ff->fh; - inarg->flags = flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_release_in); - req->in.args[0].value = inarg; + ra->inarg.fh = ff->fh; + ra->inarg.flags = flags; + ra->args.in_numargs = 1; + ra->args.in_args[0].size = sizeof(struct fuse_release_in); + ra->args.in_args[0].value = &ra->inarg; + ra->args.opcode = opcode; + ra->args.nodeid = ff->nodeid; + ra->args.force = true; + ra->args.nocreds = true; } void fuse_release_common(struct file *file, bool isdir) { struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; - struct fuse_req *req = ff->reserved_req; + struct fuse_release_args *ra = ff->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; fuse_prepare_release(fi, ff, file->f_flags, opcode); if (ff->flock) { - struct fuse_release_in *inarg = &req->misc.release.in; - inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - inarg->lock_owner = fuse_lock_owner_id(ff->fc, - (fl_owner_t) file); + ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc, + (fl_owner_t) file); } /* Hold inode until release is finished */ - req->misc.release.inode = igrab(file_inode(file)); + ra->inode = igrab(file_inode(file)); /* * Normally this will send the RELEASE request, however if diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a9d707bf2e69..a055a45361ef 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -164,17 +164,15 @@ enum { }; struct fuse_conn; +struct fuse_release_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ struct fuse_conn *fc; - /* - * Request reserved for flush and release. - * Modified under relative fuse_inode::lock. - */ - struct fuse_req *reserved_req; + /* Argument space reserved for release */ + struct fuse_release_args *release_args; /** Kernel file handle guaranteed to be unique */ u64 kh; From 75b399dda5be1e9ad6952ffcec4c1a057ffae860 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 0418/2880] fuse: convert retrieve to simple api Rename fuse_request_send_notify_reply() to fuse_simple_notify_reply() and convert to passing fuse_args instead of fuse_req. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 92 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 30 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d87accc9df9d..42db40750a2a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -655,18 +655,32 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_request_send_background); -static int fuse_request_send_notify_reply(struct fuse_conn *fc, - struct fuse_req *req, u64 unique) +static int fuse_simple_notify_reply(struct fuse_conn *fc, + struct fuse_args *args, u64 unique) { - int err = -ENODEV; + struct fuse_req *req; struct fuse_iqueue *fiq = &fc->iq; + int err = 0; + + req = fuse_get_req(fc, 0); + if (IS_ERR(req)) + return PTR_ERR(req); __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; + + fuse_args_to_req(req, args); + req->args = args; + req->end = fuse_simple_end; + spin_lock(&fiq->lock); if (fiq->connected) { queue_request(fiq, req); - err = 0; + spin_unlock(&fiq->lock); + } else { + err = -ENODEV; + spin_unlock(&fiq->lock); + fuse_put_request(fc, req); } spin_unlock(&fiq->lock); @@ -1691,9 +1705,19 @@ out_finish: return err; } -static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +struct fuse_retrieve_args { + struct fuse_args_pages ap; + struct fuse_notify_retrieve_in inarg; +}; + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - release_pages(req->pages, req->num_pages); + struct fuse_retrieve_args *ra = + container_of(args, typeof(*ra), ap.args); + + release_pages(ra->ap.pages, ra->ap.num_pages); + kfree(ra); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, @@ -1701,13 +1725,16 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, { int err; struct address_space *mapping = inode->i_mapping; - struct fuse_req *req; pgoff_t index; loff_t file_size; unsigned int num; unsigned int offset; size_t total_len = 0; unsigned int num_pages; + struct fuse_retrieve_args *ra; + size_t args_size = sizeof(*ra); + struct fuse_args_pages *ap; + struct fuse_args *args; offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); @@ -1721,19 +1748,26 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); - req = fuse_get_req(fc, num_pages); - if (IS_ERR(req)) - return PTR_ERR(req); + args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0])); - req->in.h.opcode = FUSE_NOTIFY_REPLY; - req->in.h.nodeid = outarg->nodeid; - req->in.numargs = 2; - req->in.argpages = 1; - req->end = fuse_retrieve_end; + ra = kzalloc(args_size, GFP_KERNEL); + if (!ra) + return -ENOMEM; + + ap = &ra->ap; + ap->pages = (void *) (ra + 1); + ap->descs = (void *) (ap->pages + num_pages); + + args = &ap->args; + args->nodeid = outarg->nodeid; + args->opcode = FUSE_NOTIFY_REPLY; + args->in_numargs = 2; + args->in_pages = true; + args->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; - while (num && req->num_pages < num_pages) { + while (num && ap->num_pages < num_pages) { struct page *page; unsigned int this_num; @@ -1742,27 +1776,25 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].offset = offset; - req->page_descs[req->num_pages].length = this_num; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].offset = offset; + ap->descs[ap->num_pages].length = this_num; + ap->num_pages++; offset = 0; num -= this_num; total_len += this_num; index++; } - req->misc.retrieve_in.offset = outarg->offset; - req->misc.retrieve_in.size = total_len; - req->in.args[0].size = sizeof(req->misc.retrieve_in); - req->in.args[0].value = &req->misc.retrieve_in; - req->in.args[1].size = total_len; + ra->inarg.offset = outarg->offset; + ra->inarg.size = total_len; + args->in_args[0].size = sizeof(ra->inarg); + args->in_args[0].value = &ra->inarg; + args->in_args[1].size = total_len; - err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); - if (err) { - fuse_retrieve_end(fc, req); - fuse_put_request(fc, req); - } + err = fuse_simple_notify_reply(fc, args, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, args, err); return err; } From 66abc3599c3c8795861470f21ae149520a57153d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 0419/2880] fuse: unexport request ops All requests are now sent with one of the fuse_simple_... helpers. Get rid of the old api from the fuse internal header. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 51 +++++++++--------------------------------------- fs/fuse/fuse_i.h | 39 ------------------------------------ 2 files changed, 9 insertions(+), 81 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 42db40750a2a..1ff18d319949 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -79,16 +79,10 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) return req; } -struct fuse_req *fuse_request_alloc(unsigned npages) +static struct fuse_req *fuse_request_alloc(unsigned int npages) { return __fuse_request_alloc(npages, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(fuse_request_alloc); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages) -{ - return __fuse_request_alloc(npages, GFP_NOFS); -} static void fuse_req_pages_free(struct fuse_req *req) { @@ -96,13 +90,13 @@ static void fuse_req_pages_free(struct fuse_req *req) kfree(req->pages); } -void fuse_request_free(struct fuse_req *req) +static void fuse_request_free(struct fuse_req *req) { fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } -void __fuse_get_request(struct fuse_req *req) +static void __fuse_get_request(struct fuse_req *req) { refcount_inc(&req->count); } @@ -139,6 +133,8 @@ static void fuse_drop_waiting(struct fuse_conn *fc) } } +static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); + static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, bool for_background) { @@ -191,20 +187,12 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, return ERR_PTR(err); } -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) +static struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned int npages) { return __fuse_get_req(fc, npages, false); } -EXPORT_SYMBOL_GPL(fuse_get_req); -struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, - unsigned npages) -{ - return __fuse_get_req(fc, npages, true); -} -EXPORT_SYMBOL_GPL(fuse_get_req_for_background); - -void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { @@ -455,17 +443,6 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) } } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) -{ - __set_bit(FR_ISREPLY, &req->flags); - if (!test_bit(FR_WAITING, &req->flags)) { - __set_bit(FR_WAITING, &req->flags); - atomic_inc(&fc->num_waiting); - } - __fuse_request_send(fc, req); -} -EXPORT_SYMBOL_GPL(fuse_request_send); - static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) { if (fc->minor < 4 && args->opcode == FUSE_STATFS) @@ -571,7 +548,8 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) +static bool fuse_request_queue_background(struct fuse_conn *fc, + struct fuse_req *req) { bool queued = false; @@ -644,17 +622,6 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, } EXPORT_SYMBOL_GPL(fuse_simple_background); -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) -{ - WARN_ON(!req->end); - if (!fuse_request_queue_background(fc, req)) { - req->out.h.error = -ENOTCONN; - req->end(fc, req); - fuse_put_request(fc, req); - } -} -EXPORT_SYMBOL_GPL(fuse_request_send_background); - static int fuse_simple_notify_reply(struct fuse_conn *fc, struct fuse_args *args, u64 unique) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a055a45361ef..ffbc2cd649e3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -924,42 +924,9 @@ void __exit fuse_ctl_cleanup(void); /** * Allocate a request */ -struct fuse_req *fuse_request_alloc(unsigned npages); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages); - struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, struct fuse_page_desc **desc); -/** - * Free a request - */ -void fuse_request_free(struct fuse_req *req); - -/** - * Get a request, may fail with -ENOMEM, - * caller should specify # elements in req->pages[] explicitly - */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); -struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, - unsigned npages); - -/* - * Increment reference count on request - */ -void __fuse_get_request(struct fuse_req *req); - -/** - * Decrement reference count of a request. If count goes to zero free - * the request. - */ -void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); - -/** - * Send a request (synchronous) - */ -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); - /** * Simple request sending that does request allocation and freeing */ @@ -967,12 +934,6 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, gfp_t gfp_flags); -/** - * Send a request in the background - */ -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); -bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); - /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); From 7213394c4e184b002d8011c13d916e7ac6d17520 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 0420/2880] fuse: simplify request allocation Page arrays are not allocated together with the request anymore. Get rid of the dead code Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 61 +++++++++--------------------------------------- fs/fuse/file.c | 4 ++-- fs/fuse/fuse_i.h | 7 ------ 3 files changed, 13 insertions(+), 59 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1ff18d319949..f240d541edfc 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -40,59 +40,26 @@ static struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } -static void fuse_request_init(struct fuse_req *req, struct page **pages, - struct fuse_page_desc *page_descs, - unsigned npages) +static void fuse_request_init(struct fuse_req *req) { INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); refcount_set(&req->count, 1); - req->pages = pages; - req->page_descs = page_descs; - req->max_pages = npages; __set_bit(FR_PENDING, &req->flags); } -static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) +static struct fuse_req *fuse_request_alloc(gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); - if (req) { - struct page **pages = NULL; - struct fuse_page_desc *page_descs = NULL; + if (req) + fuse_request_init(req); - WARN_ON(npages > FUSE_MAX_MAX_PAGES); - if (npages > FUSE_REQ_INLINE_PAGES) { - pages = fuse_pages_alloc(npages, flags, &page_descs); - if (!pages) { - kmem_cache_free(fuse_req_cachep, req); - return NULL; - } - __set_bit(FR_ALLOC_PAGES, &req->flags); - } else if (npages) { - pages = req->inline_pages; - page_descs = req->inline_page_descs; - } - - fuse_request_init(req, pages, page_descs, npages); - } return req; } -static struct fuse_req *fuse_request_alloc(unsigned int npages) -{ - return __fuse_request_alloc(npages, GFP_KERNEL); -} - -static void fuse_req_pages_free(struct fuse_req *req) -{ - if (test_bit(FR_ALLOC_PAGES, &req->flags)) - kfree(req->pages); -} - static void fuse_request_free(struct fuse_req *req) { - fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } @@ -135,8 +102,7 @@ static void fuse_drop_waiting(struct fuse_conn *fc) static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); -static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, - bool for_background) +static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) { struct fuse_req *req; int err; @@ -159,7 +125,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (fc->conn_error) goto out; - req = fuse_request_alloc(npages); + req = fuse_request_alloc(GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -187,11 +153,6 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, return ERR_PTR(err); } -static struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned int npages) -{ - return __fuse_get_req(fc, npages, false); -} - static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (refcount_dec_and_test(&req->count)) { @@ -517,7 +478,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) if (args->force) { atomic_inc(&fc->num_waiting); - req = __fuse_request_alloc(0, GFP_KERNEL | __GFP_NOFAIL); + req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL); if (!args->nocreds) fuse_force_creds(fc, req); @@ -526,7 +487,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, 0); + req = fuse_get_req(fc, false); if (IS_ERR(req)) return PTR_ERR(req); } @@ -597,13 +558,13 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, if (args->force) { WARN_ON(!args->nocreds); - req = __fuse_request_alloc(0, gfp_flags); + req = fuse_request_alloc(gfp_flags); if (!req) return -ENOMEM; __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = __fuse_get_req(fc, 0, true); + req = fuse_get_req(fc, true); if (IS_ERR(req)) return PTR_ERR(req); } @@ -629,7 +590,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, struct fuse_iqueue *fiq = &fc->iq; int err = 0; - req = fuse_get_req(fc, 0); + req = fuse_get_req(fc, false); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 56ed45ef3058..a2ea347c4d2c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,8 +19,8 @@ #include #include -struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) +static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) { struct page **pages; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ffbc2cd649e3..90cb82d5d62d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -362,7 +362,6 @@ enum fuse_req_flag { FR_SENT, FR_FINISHED, FR_PRIVATE, - FR_ALLOC_PAGES, }; /** @@ -921,12 +920,6 @@ void fuse_dev_cleanup(void); int fuse_ctl_init(void); void __exit fuse_ctl_cleanup(void); -/** - * Allocate a request - */ -struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc); - /** * Simple request sending that does request allocation and freeing */ From 145b673bd208af97b2f98572f286111ab8e7bc59 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 0421/2880] fuse: clean up fuse_req Get rid of request specific fields in fuse_req that are not used anymore. Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 45 --------------------------------------------- 1 file changed, 45 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 90cb82d5d62d..a81fdbe62df9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -47,9 +47,6 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 -/** Number of page pointers embedded in fuse_req */ -#define FUSE_REQ_INLINE_PAGES 1 - /** List of active connections */ extern struct list_head fuse_conn_list; @@ -397,57 +394,15 @@ struct fuse_req { /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; - /** Data for asynchronous requests */ - union { - struct { - struct fuse_release_in in; - struct inode *inode; - } release; - struct fuse_init_in init_in; - struct fuse_init_out init_out; - struct cuse_init_in cuse_init_in; - struct { - struct fuse_read_in in; - u64 attr_ver; - } read; - struct { - struct fuse_write_in in; - struct fuse_write_out out; - struct fuse_req *next; - } write; - struct fuse_notify_retrieve_in retrieve_in; - } misc; - /** page vector */ struct page **pages; /** page-descriptor vector */ struct fuse_page_desc *page_descs; - /** size of the 'pages' array */ - unsigned max_pages; - - /** inline page vector */ - struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; - - /** inline page-descriptor vector */ - struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; - /** number of pages in vector */ unsigned num_pages; - /** File used in the request (or NULL) */ - struct fuse_file *ff; - - /** Inode used in the request or NULL */ - struct inode *inode; - - /** AIO control block */ - struct fuse_io_priv *io; - - /** Link on fi->writepages */ - struct list_head writepages_entry; - /** Request completion callback */ void (*end)(struct fuse_conn *, struct fuse_req *); From d49937749fef2597f6bcaf2a0ed67e88e347b7fb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 0422/2880] fuse: stop copying args to fuse_req No need to duplicate the argument arrays in fuse_req, so just dereference req->args instead of copying to the fuse_req internal ones. This allows further cleanup of the fuse_req structure. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 80 ++++++++++++++++-------------------------------- fs/fuse/fuse_i.h | 60 +++++------------------------------- 2 files changed, 34 insertions(+), 106 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f240d541edfc..dfc0658d990b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -202,7 +202,8 @@ static unsigned int fuse_req_hash(u64 unique) static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + - len_args(req->in.numargs, (struct fuse_arg *) req->in.args); + len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); @@ -257,6 +258,7 @@ static void flush_bg_queue(struct fuse_conn *fc) static void request_end(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_iqueue *fiq = &fc->iq; + bool async = req->args->end; if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; @@ -302,8 +304,8 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); } - if (req->end) - req->end(fc, req); + if (async) + req->args->end(fc, req->args, req->out.h.error); put_request: fuse_put_request(fc, req); } @@ -450,25 +452,12 @@ void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; - req->in.numargs = args->in_numargs; - memcpy(req->in.args, args->in_args, - args->in_numargs * sizeof(struct fuse_in_arg)); - req->out.argvar = args->out_argvar; - req->out.numargs = args->out_numargs; - memcpy(req->out.args, args->out_args, - args->out_numargs * sizeof(struct fuse_arg)); - if (args->in_pages || args->out_pages) { - req->in.argpages = args->in_pages; - req->out.argpages = args->out_pages; - req->out.page_zeroing = args->page_zeroing; - req->out.page_replace = args->page_replace; - req->pages = ap->pages; req->page_descs = ap->descs; req->num_pages = ap->num_pages; } - + req->args = args; } ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) @@ -502,7 +491,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) ret = req->out.h.error; if (!ret && args->out_argvar) { BUG_ON(args->out_numargs == 0); - ret = req->out.args[args->out_numargs - 1].size; + ret = args->out_args[args->out_numargs - 1].size; } fuse_put_request(fc, req); @@ -538,19 +527,6 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, return queued; } -static void fuse_simple_end(struct fuse_conn *fc, struct fuse_req *req) -{ - struct fuse_args *args = req->args; - int err = req->out.h.error; - - if (!err && args->out_argvar) { - BUG_ON(args->out_numargs == 0); - args->out_args[args->out_numargs - 1].size = - req->out.args[args->out_numargs - 1].size; - } - req->args->end(fc, req->args, req->out.h.error); -} - int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, gfp_t gfp_flags) { @@ -571,9 +547,6 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, fuse_args_to_req(req, args); - req->args = args; - req->end = fuse_simple_end; - if (!fuse_request_queue_background(fc, req)) { fuse_put_request(fc, req); return -ENOTCONN; @@ -598,8 +571,6 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, req->in.h.unique = unique; fuse_args_to_req(req, args); - req->args = args; - req->end = fuse_simple_end; spin_lock(&fiq->lock); if (fiq->connected) { @@ -1197,7 +1168,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_iqueue *fiq = &fc->iq; struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; - struct fuse_in *in; + struct fuse_args *args; unsigned reqsize; unsigned int hash; @@ -1258,14 +1229,14 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, list_del_init(&req->list); spin_unlock(&fiq->lock); - in = &req->in; - reqsize = in->h.len; + args = req->args; + reqsize = req->in.h.len; /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; /* SETXATTR is special, since it may contain too large data */ - if (in->h.opcode == FUSE_SETXATTR) + if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; request_end(fc, req); goto restart; @@ -1274,10 +1245,10 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; - err = fuse_copy_one(cs, &in->h, sizeof(in->h)); + err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h)); if (!err) - err = fuse_copy_args(cs, in->numargs, in->argpages, - (struct fuse_arg *) in->args, 0); + err = fuse_copy_args(cs, args->in_numargs, args->in_pages, + (struct fuse_arg *) args->in_args, 0); fuse_copy_finish(cs); spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); @@ -1808,27 +1779,25 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) return NULL; } -static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, +static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned nbytes) { unsigned reqsize = sizeof(struct fuse_out_header); - if (out->h.error) - return nbytes != reqsize ? -EINVAL : 0; + reqsize += len_args(args->out_numargs, args->out_args); - reqsize += len_args(out->numargs, out->args); - - if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar)) return -EINVAL; else if (reqsize > nbytes) { - struct fuse_arg *lastarg = &out->args[out->numargs-1]; + struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1]; unsigned diffsize = reqsize - nbytes; + if (diffsize > lastarg->size) return -EINVAL; lastarg->size -= diffsize; } - return fuse_copy_args(cs, out->numargs, out->argpages, out->args, - out->page_zeroing); + return fuse_copy_args(cs, args->out_numargs, args->out_pages, + args->out_args, args->page_zeroing); } /* @@ -1907,10 +1876,13 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, set_bit(FR_LOCKED, &req->flags); spin_unlock(&fpq->lock); cs->req = req; - if (!req->out.page_replace) + if (!req->args->page_replace) cs->move_pages = 0; - err = copy_out_args(cs, &req->out, nbytes); + if (oh.error) + err = nbytes != sizeof(oh) ? -EINVAL : 0; + else + err = copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a81fdbe62df9..9454f2328bd0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -224,57 +224,12 @@ struct fuse_in_arg { const void *value; }; -/** The request input */ -struct fuse_in { - /** The request header */ - struct fuse_in_header h; - - /** True if the data for the last argument is in req->pages */ - unsigned argpages:1; - - /** Number of arguments */ - unsigned numargs; - - /** Array of arguments */ - struct fuse_in_arg args[3]; -}; - /** One output argument of a request */ struct fuse_arg { unsigned size; void *value; }; -/** The request output */ -struct fuse_out { - /** Header returned from userspace */ - struct fuse_out_header h; - - /* - * The following bitfields are not changed during the request - * processing - */ - - /** Last argument is variable length (can be shorter than - arg->size) */ - unsigned argvar:1; - - /** Last argument is a list of pages to copy data to */ - unsigned argpages:1; - - /** Zero partially or not copied pages */ - unsigned page_zeroing:1; - - /** Pages may be replaced with new ones */ - unsigned page_replace:1; - - /** Number or arguments */ - unsigned numargs; - - /** Array of arguments */ - struct fuse_arg args[2]; -}; - /** FUSE page descriptor */ struct fuse_page_desc { unsigned int length; @@ -385,11 +340,15 @@ struct fuse_req { /* Request flags, updated with test/set/clear_bit() */ unsigned long flags; - /** The request input */ - struct fuse_in in; + /* The request input header */ + struct { + struct fuse_in_header h; + } in; - /** The request output */ - struct fuse_out out; + /* The request output header */ + struct { + struct fuse_out_header h; + } out; /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; @@ -403,9 +362,6 @@ struct fuse_req { /** number of pages in vector */ unsigned num_pages; - /** Request completion callback */ - void (*end)(struct fuse_conn *, struct fuse_req *); - }; struct fuse_iqueue { From 05ea48cc2b098c533193bb058b82aa016a8361bc Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 0423/2880] fuse: stop copying pages to fuse_req The page array pointers are also duplicated across fuse_args_pages and fuse_req. Get rid of the fuse_req ones. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 18 ++++++------------ fs/fuse/fuse_i.h | 9 --------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index dfc0658d990b..d7cae9001ca7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -448,15 +448,8 @@ static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) { - struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); - req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; - if (args->in_pages || args->out_pages) { - req->pages = ap->pages; - req->page_descs = ap->descs; - req->num_pages = ap->num_pages; - } req->args = args; } @@ -939,14 +932,15 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, { unsigned i; struct fuse_req *req = cs->req; + struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); - for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { + + for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { int err; - unsigned offset = req->page_descs[i].offset; - unsigned count = min(nbytes, req->page_descs[i].length); + unsigned int offset = ap->descs[i].offset; + unsigned int count = min(nbytes, ap->descs[i].length); - err = fuse_copy_page(cs, &req->pages[i], offset, count, - zeroing); + err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); if (err) return err; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 9454f2328bd0..378f1fe69d07 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -353,15 +353,6 @@ struct fuse_req { /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; - /** page vector */ - struct page **pages; - - /** page-descriptor vector */ - struct fuse_page_desc *page_descs; - - /** number of pages in vector */ - unsigned num_pages; - }; struct fuse_iqueue { From 38c7a30a9d5ff8dd0c7a9ab1441aa5a96cf60afa Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 10 Sep 2019 10:26:46 -0700 Subject: [PATCH 0424/2880] Documentation/process: Volunteer as the ambassador for Intel Cc: Jonathan Corbet Cc: Greg Kroah-Hartman Cc: Sasha Levin Cc: Ben Hutchings Cc: Thomas Gleixner Cc: Laura Abbott Cc: Andrew Cooper Cc: Trilok Soni Cc: Kees Cook Cc: Tony Luck Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Dan Williams Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Link: https://lore.kernel.org/r/20190910172646.25BFCE7B@viggo.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index 402636356fbe..e57b9f39c69f 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -216,7 +216,7 @@ an involved disclosed party. The current ambassadors list: ARM AMD IBM - Intel + Intel Tony Luck Qualcomm Trilok Soni Microsoft Sasha Levin From aac60f1a867773de9eb164013d89c99f3ea1f009 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 11 Sep 2019 02:33:35 +0000 Subject: [PATCH 0425/2880] KVM: arm/arm64: vgic: Use the appropriate TRACE_INCLUDE_PATH Commit 49dfe94fe5ad ("KVM: arm/arm64: Fix TRACE_INCLUDE_PATH") fixes TRACE_INCLUDE_PATH to the correct relative path to the define_trace.h and explains why did the old one work. The same fix should be applied to virt/kvm/arm/vgic/trace.h. Reviewed-by: Masahiro Yamada Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h index 55fed77a9f73..4fd4f6db181b 100644 --- a/virt/kvm/arm/vgic/trace.h +++ b/virt/kvm/arm/vgic/trace.h @@ -30,7 +30,7 @@ TRACE_EVENT(vgic_update_irq_pending, #endif /* _TRACE_VGIC_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm/vgic +#define TRACE_INCLUDE_PATH ../../virt/kvm/arm/vgic #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace From f22f812d5ce75a18b56073a7a63862e6ea764070 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 12 Sep 2019 14:28:13 +0200 Subject: [PATCH 0426/2880] fuse: fix request limit The size of struct fuse_req was reduced from 392B to 144B on a non-debug config, thus the sanitize_global_limit() helper was setting a larger default limit. This doesn't really reflect reduction in the memory used by requests, since the fields removed from fuse_req were added to fuse_args derived structs; e.g. sizeof(struct fuse_writepages_args) is 248B, thus resulting in slightly more memory being used for writepage requests overalll (due to using 256B slabs). Make the calculatation ignore the size of fuse_req and use the old 392B value. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2cec25ad54b7..4404d21649ff 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -826,9 +826,12 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { + /* + * The default maximum number of async requests is calculated to consume + * 1/2^13 of the total memory, assuming 392 bytes per request. + */ if (*limit == 0) - *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / - sizeof(struct fuse_req); + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392; if (*limit >= 1 << 16) *limit = (1 << 16) - 1; From 04ec5af0776e9baefed59891f12adbcb5fa71a23 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 21 Jun 2018 09:33:40 +0100 Subject: [PATCH 0427/2880] fuse: export fuse_end_request() virtio-fs will need to complete requests from outside fs/fuse/dev.c. Make the symbol visible. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 19 ++++++++++--------- fs/fuse/fuse_i.h | 5 +++++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d7cae9001ca7..2696f000df67 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -255,7 +255,7 @@ static void flush_bg_queue(struct fuse_conn *fc) * the 'end' callback is called if given, else the reference to the * request is released */ -static void request_end(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_iqueue *fiq = &fc->iq; bool async = req->args->end; @@ -309,6 +309,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) put_request: fuse_put_request(fc, req); } +EXPORT_SYMBOL_GPL(fuse_request_end); static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { @@ -396,12 +397,12 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) req->in.h.unique = fuse_get_unique(fiq); queue_request(fiq, req); /* acquire extra reference, since request is still needed - after request_end() */ + after fuse_request_end() */ __fuse_get_request(req); spin_unlock(&fiq->lock); request_wait_answer(fc, req); - /* Pairs with smp_wmb() in request_end() */ + /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); } } @@ -1151,7 +1152,7 @@ __releases(fiq->lock) * the pending list and copies request data to userspace buffer. If * no reply is needed (FORGET) or request has been aborted or there * was an error during the copying then it's finished by calling - * request_end(). Otherwise add it to the processing list, and set + * fuse_request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, @@ -1232,7 +1233,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* SETXATTR is special, since it may contain too large data */ if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; - request_end(fc, req); + fuse_request_end(fc, req); goto restart; } spin_lock(&fpq->lock); @@ -1275,7 +1276,7 @@ out_end: if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); - request_end(fc, req); + fuse_request_end(fc, req); return err; err_unlock: @@ -1799,7 +1800,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, * the write buffer. The request is then searched on the processing * list by the unique ID found in the header. If found, then remove * it from the list and copy the rest of the buffer to the request. - * The request is finished by calling request_end() + * The request is finished by calling fuse_request_end(). */ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_copy_state *cs, size_t nbytes) @@ -1889,7 +1890,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, list_del_init(&req->list); spin_unlock(&fpq->lock); - request_end(fc, req); + fuse_request_end(fc, req); out: return err ? err : nbytes; @@ -2029,7 +2030,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req->out.h.error = -ECONNABORTED; clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - request_end(fc, req); + fuse_request_end(fc, req); } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 378f1fe69d07..5c50d3da1b0d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -829,6 +829,11 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, gfp_t gfp_flags); +/** + * End a finished request + */ +void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req); + /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); From 14d46d7abc3973a47e8eb0eb5eb87ee8d910a505 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 21 Jun 2018 09:34:25 +0100 Subject: [PATCH 0428/2880] fuse: export fuse_len_args() virtio-fs will need to query the length of fuse_arg lists. Make the symbol visible. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 9 +++++---- fs/fuse/fuse_i.h | 5 +++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2696f000df67..8f36e6485bb2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -177,7 +177,7 @@ static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_put_request); -static unsigned len_args(unsigned numargs, struct fuse_arg *args) +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) { unsigned nbytes = 0; unsigned i; @@ -187,6 +187,7 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) return nbytes; } +EXPORT_SYMBOL_GPL(fuse_len_args); static u64 fuse_get_unique(struct fuse_iqueue *fiq) { @@ -202,8 +203,8 @@ static unsigned int fuse_req_hash(u64 unique) static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + - len_args(req->args->in_numargs, - (struct fuse_arg *) req->args->in_args); + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); @@ -1779,7 +1780,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, { unsigned reqsize = sizeof(struct fuse_out_header); - reqsize += len_args(args->out_numargs, args->out_args); + reqsize += fuse_len_args(args->out_numargs, args->out_args); if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar)) return -EINVAL; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5c50d3da1b0d..39ef981a618c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -971,4 +971,9 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); /* readdir.c */ int fuse_readdir(struct file *file, struct dir_context *ctx); +/** + * Return the number of bytes in an arguments list + */ +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); + #endif /* _FS_FUSE_I_H */ From 95a84cdb11c26315a6d34664846f82c438c961a1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 6 Mar 2019 16:51:39 -0500 Subject: [PATCH 0429/2880] fuse: export fuse_send_init_request() This will be used by virtio-fs to send init request to fuse server after initialization of virt queues. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 39ef981a618c..7192080a06d0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -867,6 +867,7 @@ void fuse_conn_put(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); +void fuse_send_init(struct fuse_conn *fc); /** * Add connection to control filesystem diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4404d21649ff..5d455f4d6195 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -972,7 +972,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc) +void fuse_send_init(struct fuse_conn *fc) { struct fuse_init_args *ia; @@ -1009,6 +1009,7 @@ static void fuse_send_init(struct fuse_conn *fc) if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) process_init_reply(fc, &ia->args, -ENOTCONN); } +EXPORT_SYMBOL_GPL(fuse_send_init); static void fuse_free_conn(struct fuse_conn *fc) { From 79d96efffda7597b41968d5d8813b39fc2965f1b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 22 Jun 2018 13:48:30 +0100 Subject: [PATCH 0430/2880] fuse: export fuse_get_unique() virtio-fs will need unique IDs for FORGET requests from outside fs/fuse/dev.c. Make the symbol visible. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 ++- fs/fuse/fuse_i.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8f36e6485bb2..151176a99dc2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -189,11 +189,12 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) } EXPORT_SYMBOL_GPL(fuse_len_args); -static u64 fuse_get_unique(struct fuse_iqueue *fiq) +u64 fuse_get_unique(struct fuse_iqueue *fiq) { fiq->reqctr += FUSE_REQ_ID_STEP; return fiq->reqctr; } +EXPORT_SYMBOL_GPL(fuse_get_unique); static unsigned int fuse_req_hash(u64 unique) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7192080a06d0..b868b71d47d9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -977,4 +977,9 @@ int fuse_readdir(struct file *file, struct dir_context *ctx); */ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); +/** + * Get the next unique ID for a request + */ +u64 fuse_get_unique(struct fuse_iqueue *fiq); + #endif /* _FS_FUSE_I_H */ From 4388c5aac4bae5c83a2c66882043942002ba09a2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 5 Jun 2019 15:50:43 -0400 Subject: [PATCH 0431/2880] fuse: export fuse_dequeue_forget() function File systems like virtio-fs need to do not have to play directly with forget list data structures. There is a helper function use that instead. Rename dequeue_forget() to fuse_dequeue_forget() and export it so that stacked filesystems can use it. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 13 +++++++------ fs/fuse/fuse_i.h | 4 ++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 151176a99dc2..943bc5cf941a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1033,9 +1033,9 @@ __releases(fiq->lock) return err ? err : reqsize; } -static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, - unsigned max, - unsigned *countp) +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp) { struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; @@ -1054,6 +1054,7 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, return head; } +EXPORT_SYMBOL(fuse_dequeue_forget); static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, @@ -1061,7 +1062,7 @@ static int fuse_read_single_forget(struct fuse_iqueue *fiq, __releases(fiq->lock) { int err; - struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); + struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL); struct fuse_forget_in arg = { .nlookup = forget->forget_one.nlookup, }; @@ -1109,7 +1110,7 @@ __releases(fiq->lock) } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); - head = dequeue_forget(fiq, max_forgets, &count); + head = fuse_dequeue_forget(fiq, max_forgets, &count); spin_unlock(&fiq->lock); arg.count = count; @@ -2119,7 +2120,7 @@ void fuse_abort_conn(struct fuse_conn *fc) clear_bit(FR_PENDING, &req->flags); list_splice_tail_init(&fiq->pending, &to_end); while (forget_pending(fiq)) - kfree(dequeue_forget(fiq, 1, NULL)); + kfree(fuse_dequeue_forget(fiq, 1, NULL)); wake_up_all(&fiq->waitq); spin_unlock(&fiq->lock); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b868b71d47d9..5f910c99e8dd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -729,6 +729,10 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp); + /* * Initialize READ or READDIR request */ From 0cc2656cdb0b1f234e6d29378cb061e29d7522bc Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 13 Jun 2018 10:23:04 +0100 Subject: [PATCH 0432/2880] fuse: extract fuse_fill_super_common() fuse_fill_super() includes code to process the fd= option and link the struct fuse_dev to the fd's struct file. In virtio-fs there is no file descriptor because /dev/fuse is not used. This patch extracts fuse_fill_super_common() so that both classic fuse and virtio-fs can share the code to initialize a mount. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 27 +++++++++++ fs/fuse/inode.c | 114 ++++++++++++++++++++++------------------------- 2 files changed, 81 insertions(+), 60 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5f910c99e8dd..1902148281cc 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -416,6 +416,26 @@ struct fuse_dev { struct list_head entry; }; +struct fuse_fs_context { + int fd; + unsigned int rootmode; + kuid_t user_id; + kgid_t group_id; + bool is_bdev:1; + bool fd_present:1; + bool rootmode_present:1; + bool user_id_present:1; + bool group_id_present:1; + bool default_permissions:1; + bool allow_other:1; + unsigned int max_read; + unsigned int blksize; + const char *subtype; + + /* fuse_dev pointer to fill in, should contain NULL on entry */ + void **fudptr; +}; + /** * A Fuse connection. * @@ -873,6 +893,13 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); void fuse_send_init(struct fuse_conn *fc); +/** + * Fill in superblock and initialize fuse connection + * @sb: partially-initialized superblock to fill in + * @ctx: mount context + */ +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); + /** * Add connection to control filesystem */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5d455f4d6195..30d92e633ece 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -64,23 +64,6 @@ MODULE_PARM_DESC(max_user_congthresh, static struct file_system_type fuseblk_fs_type; #endif -struct fuse_fs_context { - const char *subtype; - bool is_bdev; - int fd; - unsigned rootmode; - kuid_t user_id; - kgid_t group_id; - unsigned fd_present:1; - unsigned rootmode_present:1; - unsigned user_id_present:1; - unsigned group_id_present:1; - unsigned default_permissions:1; - unsigned allow_other:1; - unsigned max_read; - unsigned blksize; -}; - struct fuse_forget_link *fuse_alloc_forget(void) { return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); @@ -1100,16 +1083,13 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); -static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { - struct fuse_fs_context *ctx = fsc->fs_private; struct fuse_dev *fud; - struct fuse_conn *fc; + struct fuse_conn *fc = get_fuse_conn_super(sb); struct inode *root; - struct file *file; struct dentry *root_dentry; int err; - int is_bdev = sb->s_bdev != NULL; err = -EINVAL; if (sb->s_flags & SB_MANDLOCK) @@ -1117,7 +1097,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - if (is_bdev) { + if (ctx->is_bdev) { #ifdef CONFIG_BLOCK err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) @@ -1140,19 +1120,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - file = fget(ctx->fd); - err = -EINVAL; - if (!file) - goto err; - - /* - * Require mount to happen from the same user namespace which - * opened /dev/fuse to prevent potential attacks. - */ - if (file->f_op != &fuse_dev_operations || - file->f_cred->user_ns != sb->s_user_ns) - goto err_fput; - /* * If we are not in the initial user namespace posix * acls must be translated. @@ -1160,17 +1127,9 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) if (sb->s_user_ns != &init_user_ns) sb->s_xattr = fuse_no_acl_xattr_handlers; - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) - goto err_fput; - - fuse_conn_init(fc, sb->s_user_ns); - fc->release = fuse_free_conn; - fud = fuse_dev_alloc(fc); if (!fud) - goto err_put_conn; + goto err; fc->dev = sb->s_dev; fc->sb = sb; @@ -1188,10 +1147,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; fc->max_read = max_t(unsigned, 4096, ctx->max_read); - fc->destroy = is_bdev; - - /* Used by get_root_inode() */ - sb->s_fs_info = fc; + fc->destroy = ctx->is_bdev; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); @@ -1204,7 +1160,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) mutex_lock(&fuse_mutex); err = -EINVAL; - if (file->private_data) + if (*ctx->fudptr) goto err_unlock; err = fuse_ctl_add_conn(fc); @@ -1213,17 +1169,8 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - file->private_data = fud; + *ctx->fudptr = fud; mutex_unlock(&fuse_mutex); - /* - * atomic_dec_and_test() in fput() provides the necessary - * memory barrier for file->private_data to be visible on all - * CPUs after this - */ - fput(file); - - fuse_send_init(fc); - return 0; err_unlock: @@ -1231,6 +1178,53 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) dput(root_dentry); err_dev_free: fuse_dev_free(fud); + err: + return err; +} +EXPORT_SYMBOL_GPL(fuse_fill_super_common); + +static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + struct file *file; + int err; + struct fuse_conn *fc; + + err = -EINVAL; + file = fget(ctx->fd); + if (!file) + goto err; + + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. + */ + if ((file->f_op != &fuse_dev_operations) || + (file->f_cred->user_ns != sb->s_user_ns)) + goto err_fput; + ctx->fudptr = &file->private_data; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) + goto err_fput; + + fuse_conn_init(fc, sb->s_user_ns); + fc->release = fuse_free_conn; + sb->s_fs_info = fc; + + err = fuse_fill_super_common(sb, ctx); + if (err) + goto err_put_conn; + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); + fuse_send_init(get_fuse_conn_super(sb)); + return 0; + err_put_conn: fuse_conn_put(fc); sb->s_fs_info = NULL; From ae3aad77f46fbba56eff7141b2fc49870b60827e Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 18 Jun 2018 15:53:19 +0100 Subject: [PATCH 0433/2880] fuse: add fuse_iqueue_ops callbacks The /dev/fuse device uses fiq->waitq and fasync to signal that requests are available. These mechanisms do not apply to virtio-fs. This patch introduces callbacks so alternative behavior can be used. Note that queue_interrupt() changes along these lines: spin_lock(&fiq->waitq.lock); wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); spin_unlock(&fiq->waitq.lock); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); Since queue_request() and queue_forget() also call kill_fasync() inside the spinlock this should be safe. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 2 +- fs/fuse/dev.c | 46 ++++++++++++++++++++++++++++++---------------- fs/fuse/fuse_i.h | 42 +++++++++++++++++++++++++++++++++++++++++- fs/fuse/inode.c | 13 +++++++++---- 4 files changed, 81 insertions(+), 22 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 2332e7f960a8..6a0de0ce4403 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -506,7 +506,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) * Limit the cuse channel to requests that can * be represented in file->f_cred->user_ns. */ - fuse_conn_init(&cc->fc, file->f_cred->user_ns); + fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); fud = fuse_dev_alloc(&cc->fc); if (!fud) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 943bc5cf941a..358a01435058 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -201,14 +201,33 @@ static unsigned int fuse_req_hash(u64 unique) return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } -static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) +/** + * A new request is available, wake fiq->waitq + */ +static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + wake_up(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + spin_unlock(&fiq->lock); +} + +const struct fuse_iqueue_ops fuse_dev_fiq_ops = { + .wake_forget_and_unlock = fuse_dev_wake_and_unlock, + .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, + .wake_pending_and_unlock = fuse_dev_wake_and_unlock, +}; +EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); + +static void queue_request_and_unlock(struct fuse_iqueue *fiq, + struct fuse_req *req) +__releases(fiq->lock) { req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); - wake_up(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_pending_and_unlock(fiq); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, @@ -223,12 +242,11 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, if (fiq->connected) { fiq->forget_list_tail->next = forget; fiq->forget_list_tail = forget; - wake_up(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_forget_and_unlock(fiq); } else { kfree(forget); + spin_unlock(&fiq->lock); } - spin_unlock(&fiq->lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -244,8 +262,7 @@ static void flush_bg_queue(struct fuse_conn *fc) fc->active_background++; spin_lock(&fiq->lock); req->in.h.unique = fuse_get_unique(fiq); - queue_request(fiq, req); - spin_unlock(&fiq->lock); + queue_request_and_unlock(fiq, req); } } @@ -334,10 +351,10 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) spin_unlock(&fiq->lock); return 0; } - wake_up(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_interrupt_and_unlock(fiq); + } else { + spin_unlock(&fiq->lock); } - spin_unlock(&fiq->lock); return 0; } @@ -397,11 +414,10 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) req->out.h.error = -ENOTCONN; } else { req->in.h.unique = fuse_get_unique(fiq); - queue_request(fiq, req); /* acquire extra reference, since request is still needed after fuse_request_end() */ __fuse_get_request(req); - spin_unlock(&fiq->lock); + queue_request_and_unlock(fiq, req); request_wait_answer(fc, req); /* Pairs with smp_wmb() in fuse_request_end() */ @@ -570,14 +586,12 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, spin_lock(&fiq->lock); if (fiq->connected) { - queue_request(fiq, req); - spin_unlock(&fiq->lock); + queue_request_and_unlock(fiq, req); } else { err = -ENODEV; spin_unlock(&fiq->lock); fuse_put_request(fc, req); } - spin_unlock(&fiq->lock); return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1902148281cc..8c13865955d4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -355,6 +355,39 @@ struct fuse_req { }; +struct fuse_iqueue; + +/** + * Input queue callbacks + * + * Input queue signalling is device-specific. For example, the /dev/fuse file + * uses fiq->waitq and fasync to wake processes that are waiting on queue + * readiness. These callbacks allow other device types to respond to input + * queue activity. + */ +struct fuse_iqueue_ops { + /** + * Signal that a forget has been queued + */ + void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); + + /** + * Signal that an INTERRUPT request has been queued + */ + void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); + + /** + * Signal that a request has been queued + */ + void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); +}; + +/** /dev/fuse input queue operations */ +extern const struct fuse_iqueue_ops fuse_dev_fiq_ops; + struct fuse_iqueue { /** Connection established */ unsigned connected; @@ -383,6 +416,12 @@ struct fuse_iqueue { /** O_ASYNC requests */ struct fasync_struct *fasync; + + /** Device-specific callbacks */ + const struct fuse_iqueue_ops *ops; + + /** Device-specific state */ + void *priv; }; #define FUSE_PQ_HASH_BITS 8 @@ -882,7 +921,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns); +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); /** * Release reference to fuse_conn diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 30d92e633ece..734fdd597c3e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -568,7 +568,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void fuse_iqueue_init(struct fuse_iqueue *fiq) +static void fuse_iqueue_init(struct fuse_iqueue *fiq, + const struct fuse_iqueue_ops *ops, + void *priv) { memset(fiq, 0, sizeof(struct fuse_iqueue)); spin_lock_init(&fiq->lock); @@ -577,6 +579,8 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq) INIT_LIST_HEAD(&fiq->interrupts); fiq->forget_list_tail = &fiq->forget_list_head; fiq->connected = 1; + fiq->ops = ops; + fiq->priv = priv; } static void fuse_pqueue_init(struct fuse_pqueue *fpq) @@ -590,7 +594,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); @@ -599,7 +604,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); - fuse_iqueue_init(&fc->iq); + fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); INIT_LIST_HEAD(&fc->devices); @@ -1209,7 +1214,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) if (!fc) goto err_fput; - fuse_conn_init(fc, sb->s_user_ns); + fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); fc->release = fuse_free_conn; sb->s_fs_info = fc; From 0cd1eb9a4160a96e0ec9b93b2e7b489f449bf22d Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 6 Mar 2019 16:51:40 -0500 Subject: [PATCH 0434/2880] fuse: separate fuse device allocation and installation in fuse_conn As of now fuse_dev_alloc() both allocates a fuse device and installs it in fuse_conn list. fuse_dev_alloc() can fail if fuse_device allocation fails. virtio-fs needs to initialize multiple fuse devices (one per virtio queue). It initializes one fuse device as part of call to fuse_fill_super_common() and rest of the devices are allocated and installed after that. But, we can't afford to fail after calling fuse_fill_super_common() as we don't have a way to undo all the actions done by fuse_fill_super_common(). So to avoid failures after the call to fuse_fill_super_common(), pre-allocate all fuse devices early and install them into fuse connection later. This patch provides two separate helpers for fuse device allocation and fuse device installation in fuse_conn. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 2 +- fs/fuse/dev.c | 2 +- fs/fuse/fuse_i.h | 4 +++- fs/fuse/inode.c | 31 ++++++++++++++++++++++++------- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 6a0de0ce4403..45762bb7a934 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -508,7 +508,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); - fud = fuse_dev_alloc(&cc->fc); + fud = fuse_dev_alloc_install(&cc->fc); if (!fud) { kfree(cc); return -ENOMEM; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 358a01435058..b2e18861c59c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2203,7 +2203,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) if (new->private_data) return -EINVAL; - fud = fuse_dev_alloc(fc); + fud = fuse_dev_alloc_install(fc); if (!fud) return -ENOMEM; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8c13865955d4..242d47752e78 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -929,7 +929,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, */ void fuse_conn_put(struct fuse_conn *fc); -struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(void); +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); void fuse_send_init(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 734fdd597c3e..6a0f31faaeaa 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1045,7 +1045,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) return 0; } -struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +struct fuse_dev *fuse_dev_alloc(void) { struct fuse_dev *fud; struct list_head *pq; @@ -1061,17 +1061,34 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) } fud->pq.processing = pq; - fud->fc = fuse_conn_get(fc); fuse_pqueue_init(&fud->pq); - spin_lock(&fc->lock); - list_add_tail(&fud->entry, &fc->devices); - spin_unlock(&fc->lock); - return fud; } EXPORT_SYMBOL_GPL(fuse_dev_alloc); +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) +{ + fud->fc = fuse_conn_get(fc); + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_dev_install); + +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + fud = fuse_dev_alloc(); + if (!fud) + return NULL; + + fuse_dev_install(fud, fc); + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); + void fuse_dev_free(struct fuse_dev *fud) { struct fuse_conn *fc = fud->fc; @@ -1132,7 +1149,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_user_ns != &init_user_ns) sb->s_xattr = fuse_no_acl_xattr_handlers; - fud = fuse_dev_alloc(fc); + fud = fuse_dev_alloc_install(fc); if (!fud) goto err; From 8fab010644363f8f80194322aa7a81e38c867af3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2018 17:42:34 +0200 Subject: [PATCH 0435/2880] fuse: delete dentry if timeout is zero Don't hold onto dentry in lru list if need to re-lookup it anyway at next access. Only do this if explicitly enabled, otherwise it could result in performance regression. More advanced version of this patch would periodically flush out dentries from the lru which have gone stale. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 28 +++++++++++++++++++++++++--- fs/fuse/fuse_i.h | 3 +++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 11334ee01dc9..ba0a175d7578 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -29,12 +29,28 @@ union fuse_dentry { struct rcu_head rcu; }; -static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +static void fuse_dentry_settime(struct dentry *dentry, u64 time) { - ((union fuse_dentry *) entry->d_fsdata)->time = time; + struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); + bool delete = !time && fc->delete_stale; + /* + * Mess with DCACHE_OP_DELETE because dput() will be faster without it. + * Don't care about races, either way it's just an optimization + */ + if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) || + (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) { + spin_lock(&dentry->d_lock); + if (!delete) + dentry->d_flags &= ~DCACHE_OP_DELETE; + else + dentry->d_flags |= DCACHE_OP_DELETE; + spin_unlock(&dentry->d_lock); + } + + ((union fuse_dentry *) dentry->d_fsdata)->time = time; } -static inline u64 fuse_dentry_time(struct dentry *entry) +static inline u64 fuse_dentry_time(const struct dentry *entry) { return ((union fuse_dentry *) entry->d_fsdata)->time; } @@ -255,8 +271,14 @@ static void fuse_dentry_release(struct dentry *dentry) kfree_rcu(fd, rcu); } +static int fuse_dentry_delete(const struct dentry *dentry) +{ + return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_delete = fuse_dentry_delete, .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 242d47752e78..fed68a427a4c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -692,6 +692,9 @@ struct fuse_conn { /* Send DESTROY request */ unsigned int destroy:1; + /* Delete dentries that have gone stale */ + unsigned int delete_stale:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; From 783863d6476ce9f27fa87227f76ae9134caf43fa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 29 Aug 2019 11:01:20 +0200 Subject: [PATCH 0436/2880] fuse: dissociate DESTROY from fuseblk Allow virtio-fs to also send DESTROY request. Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 9 +++++++++ fs/fuse/inode.c | 12 ++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fed68a427a4c..48d214df9172 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -467,6 +467,7 @@ struct fuse_fs_context { bool group_id_present:1; bool default_permissions:1; bool allow_other:1; + bool destroy:1; unsigned int max_read; unsigned int blksize; const char *subtype; @@ -945,6 +946,13 @@ void fuse_send_init(struct fuse_conn *fc); */ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); +/** + * Disassociate fuse connection from superblock and kill the superblock + * + * Calls kill_anon_super(), do not use with bdev mounts. + */ +void fuse_kill_sb_anon(struct super_block *sb); + /** * Add connection to control filesystem */ @@ -1057,5 +1065,6 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); * Get the next unique ID for a request */ u64 fuse_get_unique(struct fuse_iqueue *fiq); +void fuse_free_conn(struct fuse_conn *fc); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6a0f31faaeaa..cd6ffb3fe69e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -999,11 +999,12 @@ void fuse_send_init(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_send_init); -static void fuse_free_conn(struct fuse_conn *fc) +void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); kfree_rcu(fc, rcu); } +EXPORT_SYMBOL_GPL(fuse_free_conn); static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { @@ -1169,7 +1170,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; fc->max_read = max_t(unsigned, 4096, ctx->max_read); - fc->destroy = ctx->is_bdev; + fc->destroy = ctx->destroy; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); @@ -1293,8 +1294,10 @@ static int fuse_init_fs_context(struct fs_context *fc) ctx->blksize = FUSE_DEFAULT_BLKSIZE; #ifdef CONFIG_BLOCK - if (fc->fs_type == &fuseblk_fs_type) + if (fc->fs_type == &fuseblk_fs_type) { ctx->is_bdev = true; + ctx->destroy = true; + } #endif fc->fs_private = ctx; @@ -1319,11 +1322,12 @@ static void fuse_sb_destroy(struct super_block *sb) } } -static void fuse_kill_sb_anon(struct super_block *sb) +void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); } +EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, From 15c8e72e88e0b707ffefd524ca33f28cdb3db487 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 6 May 2019 15:35:43 -0400 Subject: [PATCH 0437/2880] fuse: allow skipping control interface and forced unmount virtio-fs does not support aborting requests which are being processed. That is requests which have been sent to fuse daemon on host. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 8 ++++++++ fs/fuse/inode.c | 7 ++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 48d214df9172..fc89cb40e874 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -468,6 +468,8 @@ struct fuse_fs_context { bool default_permissions:1; bool allow_other:1; bool destroy:1; + bool no_control:1; + bool no_force_umount:1; unsigned int max_read; unsigned int blksize; const char *subtype; @@ -696,6 +698,12 @@ struct fuse_conn { /* Delete dentries that have gone stale */ unsigned int delete_stale:1; + /** Do not create entry in fusectl fs */ + unsigned int no_control:1; + + /** Do not allow MNT_FORCE umount */ + unsigned int no_force_umount:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cd6ffb3fe69e..10d193b24fb8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -364,7 +364,10 @@ void fuse_unlock_inode(struct inode *inode, bool locked) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb)); + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (!fc->no_force_umount) + fuse_abort_conn(fc); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -1171,6 +1174,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->group_id = ctx->group_id; fc->max_read = max_t(unsigned, 4096, ctx->max_read); fc->destroy = ctx->destroy; + fc->no_control = ctx->no_control; + fc->no_force_umount = ctx->no_force_umount; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); From c4bb667eaf520f21b3a3db0489682becc9c49bcc Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 2 Aug 2019 18:15:19 +0100 Subject: [PATCH 0438/2880] fuse: reserve values for mapping protocol SETUPMAPPING is a command for use with 'virtiofsd', a fuse-over-virtio implementation; it may find use in other fuse impelementations as well in which the kernel does not have access to the address space of the daemon directly. A SETUPMAPPING operation causes a section of a file to be mapped into a memory window visible to the kernel. The offsets in the file and the window are defined by the kernel performing the operation. The daemon may reject the request, for reasons including permissions and limited resources. When a request perfectly overlaps a previous mapping, the previous mapping is replaced. When a mapping partially overlaps a previous mapping, the previous mapping is split into one or two smaller mappings. REMOVEMAPPING is the complement to SETUPMAPPING; it unmaps a range of mapped files from the window visible to the kernel. The map_alignment field communicates the alignment constraint for FUSE_SETUPMAPPING/FUSE_REMOVEMAPPING and allows the daemon to constrain the addresses and file offsets chosen by the kernel. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Vivek Goyal Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index df2e12fb3381..802b0377a49e 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -133,6 +133,8 @@ * * 7.31 * - add FUSE_WRITE_KILL_PRIV flag + * - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING + * - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag */ #ifndef _LINUX_FUSE_H @@ -274,6 +276,7 @@ struct fuse_file_lock { * FUSE_CACHE_SYMLINKS: cache READLINK responses * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request + * FUSE_MAP_ALIGNMENT: map_alignment field is valid */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -301,6 +304,7 @@ struct fuse_file_lock { #define FUSE_CACHE_SYMLINKS (1 << 23) #define FUSE_NO_OPENDIR_SUPPORT (1 << 24) #define FUSE_EXPLICIT_INVAL_DATA (1 << 25) +#define FUSE_MAP_ALIGNMENT (1 << 26) /** * CUSE INIT request/reply flags @@ -422,6 +426,8 @@ enum fuse_opcode { FUSE_RENAME2 = 45, FUSE_LSEEK = 46, FUSE_COPY_FILE_RANGE = 47, + FUSE_SETUPMAPPING = 48, + FUSE_REMOVEMAPPING = 49, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -656,7 +662,7 @@ struct fuse_init_out { uint32_t max_write; uint32_t time_gran; uint16_t max_pages; - uint16_t padding; + uint16_t map_alignment; uint32_t unused[8]; }; From 501ae8ecae2ba5122774dee4445003505a7fd01b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 4 Sep 2019 08:36:33 -0400 Subject: [PATCH 0439/2880] fuse: reserve byteswapped init opcodes virtio fs tunnels fuse over a virtio channel. One issue is two sides might be speaking different endian-ness. To detects this, host side looks at the opcode value in the FUSE_INIT command. Works fine at the moment but might fail if a future version of fuse will use such an opcode for initialization. Let's reserve this opcode so we remember and don't do this. Same for CUSE_INIT. Signed-off-by: Michael S. Tsirkin Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 2971d29a42e4..df2e12fb3381 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -425,6 +425,10 @@ enum fuse_opcode { /* CUSE specific operations */ CUSE_INIT = 4096, + + /* Reserved opcodes: helpful to detect structure endian-ness */ + CUSE_INIT_BSWAP_RESERVED = 1048576, /* CUSE_INIT << 8 */ + FUSE_INIT_BSWAP_RESERVED = 436207616, /* FUSE_INIT << 24 */ }; enum fuse_notify_code { From a4098bc6eed5e31e0391bcc068e61804c98138df Mon Sep 17 00:00:00 2001 From: Igor Druzhinin Date: Thu, 12 Sep 2019 19:31:51 +0100 Subject: [PATCH 0440/2880] xen/pci: reserve MCFG areas earlier If MCFG area is not reserved in E820, Xen by default will defer its usage until Dom0 registers it explicitly after ACPI parser recognizes it as a reserved resource in DSDT. Having it reserved in E820 is not mandatory according to "PCI Firmware Specification, rev 3.2" (par. 4.1.2) and firmware is free to keep a hole in E820 in that place. Xen doesn't know what exactly is inside this hole since it lacks full ACPI view of the platform therefore it's potentially harmful to access MCFG region without additional checks as some machines are known to provide inconsistent information on the size of the region. Now xen_mcfg_late() runs after acpi_init() which is too late as some basic PCI enumeration starts exactly there as well. Trying to register a device prior to MCFG reservation causes multiple problems with PCIe extended capability initializations in Xen (e.g. SR-IOV VF BAR sizing). There are no convenient hooks for us to subscribe to so register MCFG areas earlier upon the first invocation of xen_add_device(). It should be safe to do once since all the boot time buses must have their MCFG areas in MCFG table already and we don't support PCI bus hot-plug. Signed-off-by: Igor Druzhinin Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/pci.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 3eeb9bea7630..224df03ce42e 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -17,6 +17,8 @@ #include "../pci/pci.h" #ifdef CONFIG_PCI_MMCONFIG #include + +static int xen_mcfg_late(void); #endif static bool __read_mostly pci_seg_supported = true; @@ -28,7 +30,18 @@ static int xen_add_device(struct device *dev) #ifdef CONFIG_PCI_IOV struct pci_dev *physfn = pci_dev->physfn; #endif - +#ifdef CONFIG_PCI_MMCONFIG + static bool pci_mcfg_reserved = false; + /* + * Reserve MCFG areas in Xen on first invocation due to this being + * potentially called from inside of acpi_init immediately after + * MCFG table has been finally parsed. + */ + if (!pci_mcfg_reserved) { + xen_mcfg_late(); + pci_mcfg_reserved = true; + } +#endif if (pci_seg_supported) { struct { struct physdev_pci_device_add add; @@ -201,7 +214,7 @@ static int __init register_xen_pci_notifier(void) arch_initcall(register_xen_pci_notifier); #ifdef CONFIG_PCI_MMCONFIG -static int __init xen_mcfg_late(void) +static int xen_mcfg_late(void) { struct pci_mmcfg_region *cfg; int rc; @@ -240,8 +253,4 @@ static int __init xen_mcfg_late(void) } return 0; } -/* - * Needs to be done after acpi_init which are subsys_initcall. - */ -subsys_initcall_sync(xen_mcfg_late); #endif From 2472518e44eee3a45bd436a4162046790b74767a Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 29 Aug 2019 11:08:35 -0700 Subject: [PATCH 0441/2880] Revert "drm/bridge: adv7511: Attach to DSI host at probe time" This reverts commit 83f35bc3a852f1c3892c7474998c5cec707c7ba3. There are at least two DSI controller drivers which relies on the old behaviour of adv7511 driver. To avoid platform breakage this patch should be reverted. This is a temporary solution, as it blocks adv7511 usage with other platforms. Assumption that DSI device driver (bridge/panel) should first expose drm_bridge/drm_panel object, then look for DSI bus is just incorrect - it can work with devices controlled via i2c but it cannot work with devices controlled via DSI - they will not be able to probe. To solve the issue following steps should be performed: - rework reverted patch allowing co-operation with broken DSI controller drivers - with simple/ugly workaround, - fix controller drivers and then remove workaround. Signed-off-by: Rob Clark [a.hajda: changed commit message] Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/20190829180836.14453-1-robdclark@gmail.com --- drivers/gpu/drm/bridge/adv7511/adv7511_drv.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c index 98bccace8c1c..f6d2681f6927 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c @@ -874,6 +874,9 @@ static int adv7511_bridge_attach(struct drm_bridge *bridge) &adv7511_connector_helper_funcs); drm_connector_attach_encoder(&adv->connector, bridge->encoder); + if (adv->type == ADV7533) + ret = adv7533_attach_dsi(adv); + if (adv->i2c_main->irq) regmap_write(adv->regmap, ADV7511_REG_INT_ENABLE(0), ADV7511_INT0_HPD); @@ -1219,17 +1222,8 @@ static int adv7511_probe(struct i2c_client *i2c, const struct i2c_device_id *id) drm_bridge_add(&adv7511->bridge); adv7511_audio_init(dev, adv7511); - - if (adv7511->type == ADV7533) { - ret = adv7533_attach_dsi(adv7511); - if (ret) - goto err_remove_bridge; - } - return 0; -err_remove_bridge: - drm_bridge_remove(&adv7511->bridge); err_unregister_cec: i2c_unregister_device(adv7511->i2c_cec); if (adv7511->cec_clk) From 716c6a228ec9635e4b524f9c01aac089c5dd6ed5 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 13 Sep 2019 12:33:07 +0100 Subject: [PATCH 0442/2880] MAINTAINERS: Add PCI native host/endpoint controllers designated reviewer Add Andrew Murray as designated reviewer for PCI native host and endpoint controller drivers. Signed-off-by: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..08d97988034d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12442,6 +12442,7 @@ F: arch/x86/kernel/early-quirks.c PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS M: Lorenzo Pieralisi +R: Andrew Murray L: linux-pci@vger.kernel.org Q: http://patchwork.ozlabs.org/project/linux-pci/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/ From 11330a9fef049db195f50300627dd972e19e0f8e Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Fri, 6 Sep 2019 15:53:19 +0800 Subject: [PATCH 0443/2880] i2c: imx: ACPI support for NXP i2c controller Enable NXP i2c controller to boot with ACPI. Signed-off-by: Meenakshi Aggarwal Signed-off-by: Udit Kumar Signed-off-by: Chuanhua Han Signed-off-by: Biwen Li Reviewed-by: Andy Shevchenko Acked-by: Oleksij Rempel Tested-by: Oleksij Rempel Acked-by: Rafael J. Wysocki Signed-off-by: Wolfram Sang --- drivers/acpi/acpi_apd.c | 7 +++++++ drivers/i2c/busses/i2c-imx.c | 17 +++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c index 7cd0c9ac71ea..71511ae2dfcd 100644 --- a/drivers/acpi/acpi_apd.c +++ b/drivers/acpi/acpi_apd.c @@ -160,11 +160,17 @@ static const struct apd_device_desc hip08_i2c_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 250000000, }; + static const struct apd_device_desc thunderx2_i2c_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 125000000, }; +static const struct apd_device_desc nxp_i2c_desc = { + .setup = acpi_apd_setup, + .fixed_clk_rate = 350000000, +}; + static const struct apd_device_desc hip08_spi_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 250000000, @@ -238,6 +244,7 @@ static const struct acpi_device_id acpi_apd_device_ids[] = { { "HISI02A1", APD_ADDR(hip07_i2c_desc) }, { "HISI02A2", APD_ADDR(hip08_i2c_desc) }, { "HISI0173", APD_ADDR(hip08_spi_desc) }, + { "NXP0001", APD_ADDR(nxp_i2c_desc) }, #endif { } }; diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index b1b8b938d7f4..40692fc70d92 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -20,6 +20,7 @@ * */ +#include #include #include #include @@ -255,6 +256,12 @@ static const struct of_device_id i2c_imx_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, i2c_imx_dt_ids); +static const struct acpi_device_id i2c_imx_acpi_ids[] = { + {"NXP0001", .driver_data = (kernel_ulong_t)&vf610_i2c_hwdata}, + { } +}; +MODULE_DEVICE_TABLE(acpi, i2c_imx_acpi_ids); + static inline int is_imx1_i2c(struct imx_i2c_struct *i2c_imx) { return i2c_imx->hwdata->devtype == IMX1_I2C; @@ -1050,14 +1057,13 @@ static const struct i2c_algorithm i2c_imx_algo = { static int i2c_imx_probe(struct platform_device *pdev) { - const struct of_device_id *of_id = of_match_device(i2c_imx_dt_ids, - &pdev->dev); struct imx_i2c_struct *i2c_imx; struct resource *res; struct imxi2c_platform_data *pdata = dev_get_platdata(&pdev->dev); void __iomem *base; int irq, ret; dma_addr_t phy_addr; + const struct imx_i2c_hwdata *match; dev_dbg(&pdev->dev, "<%s>\n", __func__); @@ -1077,8 +1083,9 @@ static int i2c_imx_probe(struct platform_device *pdev) if (!i2c_imx) return -ENOMEM; - if (of_id) - i2c_imx->hwdata = of_id->data; + match = device_get_match_data(&pdev->dev); + if (match) + i2c_imx->hwdata = match; else i2c_imx->hwdata = (struct imx_i2c_hwdata *) platform_get_device_id(pdev)->driver_data; @@ -1091,6 +1098,7 @@ static int i2c_imx_probe(struct platform_device *pdev) i2c_imx->adapter.nr = pdev->id; i2c_imx->adapter.dev.of_node = pdev->dev.of_node; i2c_imx->base = base; + ACPI_COMPANION_SET(&i2c_imx->adapter.dev, ACPI_COMPANION(&pdev->dev)); /* Get I2C clock */ i2c_imx->clk = devm_clk_get(&pdev->dev, NULL); @@ -1253,6 +1261,7 @@ static struct platform_driver i2c_imx_driver = { .name = DRIVER_NAME, .pm = &i2c_imx_pm_ops, .of_match_table = i2c_imx_dt_ids, + .acpi_match_table = i2c_imx_acpi_ids, }, .id_table = imx_i2c_devtype, }; From 8ebf15e9c869e764b3aab4928938ee68c0e2bd6d Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 10 Sep 2019 10:29:17 +0100 Subject: [PATCH 0444/2880] i2c: tegra: Move suspend handling to NOIRQ phase Commit acc8abcb2a9c ("i2c: tegra: Add suspend-resume support") added suspend support for the Tegra I2C driver and following this change on Tegra30 the following WARNING is seen on entering suspend ... WARNING: CPU: 2 PID: 689 at /dvs/git/dirty/git-master_l4t-upstream/kernel/drivers/i2c/i2c-core.h:54 __i2c_transfer+0x35c/0x70c i2c i2c-4: Transfer while suspended Modules linked in: brcmfmac brcmutil CPU: 2 PID: 689 Comm: rtcwake Not tainted 5.3.0-rc7-g089cf7f6ecb2 #1 Hardware name: NVIDIA Tegra SoC (Flattened Device Tree) [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0xb4/0xc8) [] (dump_stack) from [] (__warn+0xe0/0xf8) [] (__warn) from [] (warn_slowpath_fmt+0x48/0x6c) [] (warn_slowpath_fmt) from [] (__i2c_transfer+0x35c/0x70c) [] (__i2c_transfer) from [] (i2c_transfer+0x58/0xf4) [] (i2c_transfer) from [] (i2c_transfer_buffer_flags+0x4c/0x70) [] (i2c_transfer_buffer_flags) from [] (regmap_i2c_write+0x14/0x30) [] (regmap_i2c_write) from [] (_regmap_raw_write_impl+0x35c/0x868) [] (_regmap_raw_write_impl) from [] (_regmap_update_bits+0xe4/0xec) [] (_regmap_update_bits) from [] (regmap_update_bits_base+0x50/0x74) [] (regmap_update_bits_base) from [] (regulator_disable_regmap+0x44/0x54) [] (regulator_disable_regmap) from [] (_regulator_do_disable+0xf8/0x268) [] (_regulator_do_disable) from [] (_regulator_disable+0xf4/0x19c) [] (_regulator_disable) from [] (regulator_disable+0x34/0x64) [] (regulator_disable) from [] (regulator_bulk_disable+0x28/0xb4) [] (regulator_bulk_disable) from [] (tegra_pcie_power_off+0x64/0xa8) [] (tegra_pcie_power_off) from [] (tegra_pcie_pm_suspend+0x25c/0x3f4) [] (tegra_pcie_pm_suspend) from [] (dpm_run_callback+0x38/0x1d4) [] (dpm_run_callback) from [] (__device_suspend_noirq+0xc0/0x2b8) [] (__device_suspend_noirq) from [] (dpm_noirq_suspend_devices+0x100/0x37c) [] (dpm_noirq_suspend_devices) from [] (dpm_suspend_noirq+0x1c/0x48) [] (dpm_suspend_noirq) from [] (suspend_devices_and_enter+0x1d0/0xa00) [] (suspend_devices_and_enter) from [] (pm_suspend+0x220/0x74c) [] (pm_suspend) from [] (state_store+0x6c/0xc8) [] (state_store) from [] (kernfs_fop_write+0xe8/0x1c4) [] (kernfs_fop_write) from [] (__vfs_write+0x2c/0x1c4) [] (__vfs_write) from [] (vfs_write+0xa4/0x184) [] (vfs_write) from [] (ksys_write+0x9c/0xdc) [] (ksys_write) from [] (ret_fast_syscall+0x0/0x54) Exception stack(0xe9f21fa8 to 0xe9f21ff0) 1fa0: 0000006c 004b2438 00000004 004b2438 00000004 00000000 1fc0: 0000006c 004b2438 004b1228 00000004 00000004 00000004 0049e78c 004b1228 1fe0: 00000004 be9809b8 b6f0bc0b b6e96206 The problem is that the Tegra PCIe driver indirectly uses I2C for controlling some regulators and the I2C driver is now being suspended before the PCIe driver causing the PCIe suspend to fail. The Tegra PCIe driver is suspended during the NOIRQ phase and this cannot be changed due to other dependencies. Therefore, we also need to move the suspend handling for the Tegra I2C driver to the NOIRQ phase as well. In order to move the I2C suspend handling to the NOIRQ phase we also need to avoid calling pm_runtime_get/put() because per commit 1e2ef05bb8cf ("PM: Limit race conditions between runtime PM and system sleep (v2)") these cannot be called early in resume. The function tegra_i2c_init(), called during resume, calls pm_runtime_get/put() and so move these calls outside of tegra_i2c_init(), so this function can be used during the NOIRQ resume phase. Fixes: acc8abcb2a9c ("i2c: tegra: Add suspend-resume support") Signed-off-by: Jon Hunter Acked-by: Thierry Reding Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-tegra.c | 40 +++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 18f0ceed9f1b..c1683f9338b4 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -713,12 +713,6 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev, bool clk_reinit) u32 tsu_thd; u8 tlow, thigh; - err = pm_runtime_get_sync(i2c_dev->dev); - if (err < 0) { - dev_err(i2c_dev->dev, "runtime resume failed %d\n", err); - return err; - } - reset_control_assert(i2c_dev->rst); udelay(2); reset_control_deassert(i2c_dev->rst); @@ -772,7 +766,7 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev, bool clk_reinit) if (err) { dev_err(i2c_dev->dev, "failed changing clock rate: %d\n", err); - goto err; + return err; } } @@ -787,23 +781,21 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev, bool clk_reinit) err = tegra_i2c_flush_fifos(i2c_dev); if (err) - goto err; + return err; if (i2c_dev->is_multimaster_mode && i2c_dev->hw->has_slcg_override_reg) i2c_writel(i2c_dev, I2C_MST_CORE_CLKEN_OVR, I2C_CLKEN_OVERRIDE); err = tegra_i2c_wait_for_config_load(i2c_dev); if (err) - goto err; + return err; if (i2c_dev->irq_disabled) { i2c_dev->irq_disabled = false; enable_irq(i2c_dev->irq); } -err: - pm_runtime_put(i2c_dev->dev); - return err; + return 0; } static int tegra_i2c_disable_packet_mode(struct tegra_i2c_dev *i2c_dev) @@ -1616,12 +1608,14 @@ static int tegra_i2c_probe(struct platform_device *pdev) } pm_runtime_enable(&pdev->dev); - if (!pm_runtime_enabled(&pdev->dev)) { + if (!pm_runtime_enabled(&pdev->dev)) ret = tegra_i2c_runtime_resume(&pdev->dev); - if (ret < 0) { - dev_err(&pdev->dev, "runtime resume failed\n"); - goto unprepare_div_clk; - } + else + ret = pm_runtime_get_sync(i2c_dev->dev); + + if (ret < 0) { + dev_err(&pdev->dev, "runtime resume failed\n"); + goto unprepare_div_clk; } if (i2c_dev->is_multimaster_mode) { @@ -1666,6 +1660,8 @@ static int tegra_i2c_probe(struct platform_device *pdev) if (ret) goto release_dma; + pm_runtime_put(&pdev->dev); + return 0; release_dma: @@ -1725,17 +1721,25 @@ static int __maybe_unused tegra_i2c_resume(struct device *dev) struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); int err; + err = tegra_i2c_runtime_resume(dev); + if (err) + return err; + err = tegra_i2c_init(i2c_dev, false); if (err) return err; + err = tegra_i2c_runtime_suspend(dev); + if (err) + return err; + i2c_mark_adapter_resumed(&i2c_dev->adapter); return 0; } static const struct dev_pm_ops tegra_i2c_pm = { - SET_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume) + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume) SET_RUNTIME_PM_OPS(tegra_i2c_runtime_suspend, tegra_i2c_runtime_resume, NULL) }; From 85624f90c8fc7214cd37e064635d17541d657732 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sun, 15 Sep 2019 16:46:23 +0300 Subject: [PATCH 0445/2880] platform/x86: asus-wmi: Make it depend on ACPI battery API When driver has been switched to use ACPI battery API in the commit 7973353e92ee ("Refactor charge threshold to use the battery hooking API") it makes it implicitly dependent to a corresponding kernel configuration option. Make this dependency explicit in Kconfig. Fixes: 7973353e92ee ("Refactor charge threshold to use the battery hooking API") Reported-by: kbuild test robot Cc: Kristian Klausen Signed-off-by: Andy Shevchenko --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 1b67bb578f9f..ae21d08c65e8 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -674,6 +674,7 @@ config EEEPC_LAPTOP config ASUS_WMI tristate "ASUS WMI Driver" depends on ACPI_WMI + depends on ACPI_BATTERY depends on INPUT depends on HWMON depends on BACKLIGHT_CLASS_DEVICE From 473ef57ad8edc25efd083a583a5f6604b47d3822 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Sep 2019 12:19:48 -0400 Subject: [PATCH 0446/2880] afs dynroot: switch to simple_dir_operations no point reinventing it (with wrong ->read(), BTW). Signed-off-by: Al Viro --- fs/afs/dynroot.c | 7 ------- fs/afs/inode.c | 2 +- fs/afs/internal.h | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index bcd1bafb0278..4150280509ff 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -10,13 +10,6 @@ #include #include "internal.h" -const struct file_operations afs_dynroot_file_operations = { - .open = dcache_dir_open, - .release = dcache_dir_close, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, -}; - /* * Probe to see if a cell may exist. This prevents positive dentries from * being created unnecessarily. diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 7b1c18c32f48..46d2d7cb461d 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -443,7 +443,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { inode->i_op = &afs_dynroot_inode_operations; - inode->i_fop = &afs_dynroot_file_operations; + inode->i_fop = &simple_dir_operations; } else { inode->i_op = &afs_autocell_inode_operations; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index f66a3be12fd6..9b2a2a21e3dc 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -910,7 +910,6 @@ extern int afs_silly_iput(struct dentry *, struct inode *); /* * dynroot.c */ -extern const struct file_operations afs_dynroot_file_operations; extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; From 750670341a24cb714e624e0fd7da30900ad93752 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 23 Jul 2019 16:50:20 +0100 Subject: [PATCH 0447/2880] ceph: fix directories inode i_blkbits initialization When filling an inode with info from the MDS, i_blkbits is being initialized using fl_stripe_unit, which contains the stripe unit in bytes. Unfortunately, this doesn't make sense for directories as they have fl_stripe_unit set to '0'. This means that i_blkbits will be set to 0xff, causing an UBSAN undefined behaviour in i_blocksize(): UBSAN: Undefined behaviour in ./include/linux/fs.h:731:12 shift exponent 255 is too large for 32-bit type 'int' Fix this by initializing i_blkbits to CEPH_BLOCK_SHIFT if fl_stripe_unit is zero. Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 18500edefc56..3b537e7038c7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -801,7 +801,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* update inode */ inode->i_rdev = le32_to_cpu(info->rdev); - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + /* directories have fl_stripe_unit set to zero */ + if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); From e1e44602021358336fb524affe53ce3c09b34beb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 24 Jul 2019 07:59:51 -0400 Subject: [PATCH 0448/2880] ceph: allow copy_file_range when src and dst inode are same There is no reason to prevent this. The OSD should be able to handle this as long as the objects are different, and the existing code falls back when the offset into the object is different. Signed-off-by: Jeff Layton Acked-by: Luis Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 685a03cc4b77..f657fabcb3ee 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1913,8 +1913,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, int src_got = 0, dst_got = 0, err, dirty; bool do_final_copy = false; - if (src_inode == dst_inode) - return -EINVAL; if (src_inode->i_sb != dst_inode->i_sb) return -EXDEV; if (ceph_snap(dst_inode) != CEPH_NOSNAP) From e09580b343aa117fd07c1bb7f7dfc5bc630a2953 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 24 Jul 2019 12:46:20 -0400 Subject: [PATCH 0449/2880] ceph: don't list vxattrs in listxattr() Most filesystems that provide virtual xattrs (e.g. CIFS) don't display them via listxattr(). Ceph does, and that causes some of the tests in xfstests to fail. Have cephfs stop listing vxattrs in listxattr. Userspace can always query them directly when the name is known. Signed-off-by: Jeff Layton Acked-by: David Disseldorp Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 939eab7aa219..2fba06b50f25 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -903,11 +903,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); bool len_only = (size == 0); u32 namelen; int err; - int i; spin_lock(&ci->i_ceph_lock); dout("listxattr %p ver=%lld index_ver=%lld\n", inode, @@ -936,33 +934,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) names = __copy_xattr_names(ci, names); size -= namelen; } - - - /* virtual xattr names, too */ - if (vxattrs) { - for (i = 0; vxattrs[i].name; i++) { - size_t this_len; - - if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN) - continue; - if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci)) - continue; - - this_len = strlen(vxattrs[i].name) + 1; - namelen += this_len; - if (len_only) - continue; - - if (this_len > size) { - err = -ERANGE; - goto out; - } - - memcpy(names, vxattrs[i].name, this_len); - names += this_len; - size -= this_len; - } - } err = namelen; out: spin_unlock(&ci->i_ceph_lock); From 120a75ea9f4ba02f852171e75d44f29139b9c83e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:39 +0800 Subject: [PATCH 0450/2880] libceph: add function that reset client's entity addr This function also re-open connections to OSD/MON, and re-send in-flight OSD requests after re-opening connections to OSD. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 1 + include/linux/ceph/messenger.h | 1 + include/linux/ceph/mon_client.h | 1 + include/linux/ceph/osd_client.h | 1 + net/ceph/ceph_common.c | 8 ++++++++ net/ceph/messenger.c | 6 ++++++ net/ceph/mon_client.c | 7 +++++++ net/ceph/osd_client.c | 18 ++++++++++++++++++ 8 files changed, 43 insertions(+) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 82156da3c650..b9dbda1c26aa 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -293,6 +293,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); +extern void ceph_reset_client_addr(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, unsigned long started); extern int ceph_open_session(struct ceph_client *client); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 23895d178149..c4458dc6a757 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void); extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr); extern void ceph_messenger_fini(struct ceph_messenger *msgr); +extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr); extern void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index b4d134d3312a..dbb8a6959a73 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -109,6 +109,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); +extern void ceph_monc_reopen_session(struct ceph_mon_client *monc); enum { CEPH_SUB_MONMAP = 0, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ad7fe5d10dcd..d1c3e829f30d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void); extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); +extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4eeea4d5c3ef..b412a3ccc4fc 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -694,6 +694,14 @@ void ceph_destroy_client(struct ceph_client *client) } EXPORT_SYMBOL(ceph_destroy_client); +void ceph_reset_client_addr(struct ceph_client *client) +{ + ceph_messenger_reset_nonce(&client->msgr); + ceph_monc_reopen_session(&client->monc); + ceph_osdc_reopen_osds(&client->osdc); +} +EXPORT_SYMBOL(ceph_reset_client_addr); + /* * true if we have the mon map (and have thus joined the cluster) */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 962f521c863e..e4cb3db2ee77 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con) } +void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) +{ + u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; + msgr->inst.addr.nonce = cpu_to_le32(nonce); + encode_my_addr(msgr); +} /* * initialize a new messenger instance diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 0520bf9825aa..7256c402ebaa 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc) __open_session(monc); } +void ceph_monc_reopen_session(struct ceph_mon_client *monc) +{ + mutex_lock(&monc->mutex); + reopen_session(monc); + mutex_unlock(&monc->mutex); +} + static void un_backoff(struct ceph_mon_client *monc) { monc->hunt_mult /= 2; /* reduce by 50% */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 78ae6e8c953d..a252ba80dd82 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5086,6 +5086,24 @@ out_put_req: } EXPORT_SYMBOL(ceph_osdc_call); +/* + * reset all osd connections + */ +void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + down_write(&osdc->lock); + for (n = rb_first(&osdc->osds); n; ) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + n = rb_next(n); + if (!reopen_osd(osd)) + kick_osd_requests(osd); + } + up_write(&osdc->lock); +} + /* * init, shutdown */ From 2cef0ba8032ce0df3f29621921fe7d6372f68d3e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:40 +0800 Subject: [PATCH 0451/2880] libceph: add function that clears osd client's abort_err Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d1c3e829f30d..eaffbdddf89a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -389,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); #define osd_req_op_data(oreq, whch, typ, fld) \ ({ \ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a252ba80dd82..6f49c59e5e58 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2476,6 +2476,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) } EXPORT_SYMBOL(ceph_osdc_abort_requests); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc) +{ + down_write(&osdc->lock); + osdc->abort_err = 0; + up_write(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_clear_abort_err); + static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { if (likely(eb > osdc->epoch_barrier)) { From 7e6906c1e670f0b40554f42ba74cc043d094877c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:41 +0800 Subject: [PATCH 0452/2880] ceph: allow closing session in restarting/reconnect state CEPH_MDS_SESSION_{RESTARTING,RECONNECTING} are for for mds failover, they are sub-states of CEPH_MDS_SESSION_OPEN. So __close_session() should send close request for session in these two state. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f7c8603484fe..810fd8689dff 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -148,9 +148,9 @@ enum { CEPH_MDS_SESSION_OPENING = 2, CEPH_MDS_SESSION_OPEN = 3, CEPH_MDS_SESSION_HUNG = 4, - CEPH_MDS_SESSION_CLOSING = 5, - CEPH_MDS_SESSION_RESTARTING = 6, - CEPH_MDS_SESSION_RECONNECTING = 7, + CEPH_MDS_SESSION_RESTARTING = 5, + CEPH_MDS_SESSION_RECONNECTING = 6, + CEPH_MDS_SESSION_CLOSING = 7, CEPH_MDS_SESSION_REJECTED = 8, }; From f4b97866223b8dddd1bcb9d2a9546c5a5e430249 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:42 +0800 Subject: [PATCH 0453/2880] ceph: track and report error of async metadata operation Use errseq_t to track and report errors of async metadata operations, similar to how kernel handles errors during writeback. If any dirty caps or any unsafe request gets dropped during session eviction, record -EIO in corresponding inode's i_meta_err. The error will be reported by subsequent fsync, Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 24 +++++++++++++++++------- fs/ceph/file.c | 6 ++++-- fs/ceph/inode.c | 2 ++ fs/ceph/mds_client.c | 40 +++++++++++++++++++++++++++------------- fs/ceph/super.h | 4 ++++ 5 files changed, 54 insertions(+), 22 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ce0f5658720a..321ba9b30968 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2261,35 +2261,45 @@ static int unsafe_request_wait(struct inode *inode) int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { + struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); u64 flush_tid; - int ret; + int ret, err; int dirty; dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); ret = file_write_and_wait_range(file, start, end); - if (ret < 0) - goto out; - if (datasync) goto out; dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - ret = unsafe_request_wait(inode); + err = unsafe_request_wait(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - ret = wait_event_interruptible(ci->i_cap_wq, + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } + + if (err < 0) + ret = err; + + if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { + spin_lock(&file->f_lock); + err = errseq_check_and_advance(&ci->i_meta_err, + &fi->meta_err); + spin_unlock(&file->f_lock); + if (err < 0) + ret = err; + } out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f657fabcb3ee..75a1d12ec46d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -201,6 +201,7 @@ out: static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -211,7 +212,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, struct ceph_dir_file_info *dfi = kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); if (!dfi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -222,7 +223,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, } else { fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); if (!fi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -232,6 +233,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, fi->fmode = fmode; spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); + fi->meta_err = errseq_sample(&ci->i_meta_err); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 3b537e7038c7..332433b490f5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); + ci->i_meta_err = 0; + return &ci->vfs_inode; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 920e9f048bd8..df4bea231017 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1270,6 +1270,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req; struct rb_node *p; + struct ceph_inode_info *ci; dout("cleanup_session_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); @@ -1278,6 +1279,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); + if (req->r_target_inode) { + /* dropping unsafe change of inode's attributes */ + ci = ceph_inode(req->r_target_inode); + errseq_set(&ci->i_meta_err, -EIO); + } + if (req->r_unsafe_dir) { + /* dropping unsafe directory operation */ + ci = ceph_inode(req->r_unsafe_dir); + errseq_set(&ci->i_meta_err, -EIO); + } __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1370,7 +1381,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - bool drop = false; + bool dirty_dropped = false; bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", @@ -1405,7 +1416,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = true; + dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1415,10 +1426,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = true; + dirty_dropped = true; } spin_unlock(&mdsc->cap_dirty_lock); + if (dirty_dropped) { + errseq_set(&ci->i_meta_err, -EIO); + + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } + if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; @@ -1430,15 +1453,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } - - if (drop && - ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ceph_put_snap_context(ci->i_head_snapc); - ci->i_head_snapc = NULL; - } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -1452,7 +1466,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); - if (drop) + if (dirty_dropped) iput(inode); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6b9f1ee7de85..66dfe5ebf006 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -395,6 +395,8 @@ struct ceph_inode_info { struct fscache_cookie *fscache; u32 i_fscache_gen; #endif + errseq_t i_meta_err; + struct inode vfs_inode; /* at end */ }; @@ -703,6 +705,8 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; + + errseq_t meta_err; }; struct ceph_dir_file_info { From 5e3ded1bb642f2d7a6ded6deeafb155d5b5312f2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:43 +0800 Subject: [PATCH 0454/2880] ceph: pass filp to ceph_get_caps() Also change several other functions' arguments, no logical changes. This is preparetion for later patch that checks filp error. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 15 +++++++++------ fs/ceph/caps.c | 32 +++++++++++++++++--------------- fs/ceph/file.c | 35 +++++++++++++++++------------------ fs/ceph/super.h | 6 +++--- 4 files changed, 46 insertions(+), 42 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b3c8b886bf64..2d6e23e32e30 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -323,7 +323,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, + true, &got); if (ret < 0) { dout("start_read %p, error getting cap\n", inode); } else if (!(got & want)) { @@ -1452,7 +1453,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (err < 0) goto out_restore; @@ -1568,7 +1570,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got, NULL); if (err < 0) goto out_free; @@ -1989,10 +1991,11 @@ out: return err; } -int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) +int ceph_pool_perm_check(struct inode *inode, int need) { - s64 pool; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_string *pool_ns; + s64 pool; int ret, flags; if (ci->i_vino.snap != CEPH_NOSNAP) { @@ -2004,7 +2007,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) return 0; } - if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), + if (ceph_test_mount_opt(ceph_inode_to_client(inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 321ba9b30968..bde81aaa3750 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2570,10 +2570,10 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * * FIXME: how does a 0 return differ from -EAGAIN? */ -static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, +static int try_get_cap_refs(struct inode *inode, int need, int want, loff_t endoff, bool nonblock, int *got) { - struct inode *inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; @@ -2741,18 +2741,18 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { int ret; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; - ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); + ret = try_get_cap_refs(inode, need, want, 0, nonblock, got); return ret == -EAGAIN ? 0 : ret; } @@ -2761,21 +2761,23 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); int _got, ret; - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; while (true) { if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); + check_max_size(inode, endoff); _got = 0; - ret = try_get_cap_refs(ci, need, want, endoff, + ret = try_get_cap_refs(inode, need, want, endoff, false, &_got); if (ret == -EAGAIN) continue; @@ -2783,8 +2785,8 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&ci->i_cap_wq, &wait); - while (!(ret = try_get_cap_refs(ci, need, want, endoff, - true, &_got))) { + while (!(ret = try_get_cap_refs(inode, need, want, + endoff, true, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -2799,7 +2801,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ret < 0) { if (ret == -ESTALE) { /* session was killed, try renew caps */ - ret = ceph_renew_caps(&ci->vfs_inode); + ret = ceph_renew_caps(inode); if (ret == 0) continue; } @@ -2808,9 +2810,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { + i_size_read(inode) > 0) { struct page *page = - find_get_page(ci->vfs_inode.i_mapping, 0); + find_get_page(inode->i_mapping, 0); if (page) { if (PageUptodate(page)) { *pinned_page = page; @@ -2829,7 +2831,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, * getattr request will bring inline data into * page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + ret = __ceph_do_getattr(inode, NULL, CEPH_STAT_CAP_INLINE_DATA, true); if (ret < 0) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 75a1d12ec46d..fb007b75fb17 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1262,7 +1262,8 @@ again: want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (ret < 0) return ret; @@ -1459,7 +1460,7 @@ retry_snap: else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got, NULL); if (err < 0) goto out; @@ -1783,7 +1784,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); if (ret < 0) goto unlock; @@ -1812,16 +1813,15 @@ unlock: * src_ci. Two attempts are made to obtain both caps, and an error is return if * this fails; zero is returned on success. */ -static int get_rd_wr_caps(struct ceph_inode_info *src_ci, - loff_t src_endoff, int *src_got, - struct ceph_inode_info *dst_ci, +static int get_rd_wr_caps(struct file *src_filp, int *src_got, + struct file *dst_filp, loff_t dst_endoff, int *dst_got) { int ret = 0; bool retrying = false; retry_caps: - ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, dst_endoff, dst_got, NULL); if (ret < 0) return ret; @@ -1831,24 +1831,24 @@ retry_caps: * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some * retry dance instead to try to get both capabilities. */ - ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + ret = ceph_try_get_caps(file_inode(src_filp), + CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, false, src_got); if (ret <= 0) { /* Start by dropping dst_ci caps and getting src_ci caps */ - ceph_put_cap_refs(dst_ci, *dst_got); + ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); if (retrying) { if (!ret) /* ceph_try_get_caps masks EAGAIN */ ret = -EAGAIN; return ret; } - ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, - CEPH_CAP_FILE_SHARED, src_endoff, - src_got, NULL); + ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, -1, src_got, NULL); if (ret < 0) return ret; /*... drop src_ci caps too, and retry */ - ceph_put_cap_refs(src_ci, *src_got); + ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); retrying = true; goto retry_caps; } @@ -1960,8 +1960,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * clients may have dirty data in their caches. And OSDs know nothing * about caps, so they can't safely do the remote object copies. */ - err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, - dst_ci, (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) { dout("get_rd_wr_caps returned %d\n", err); ret = -EOPNOTSUPP; @@ -2018,9 +2018,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, goto out; } len -= ret; - err = get_rd_wr_caps(src_ci, (src_off + len), - &src_got, dst_ci, - (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) goto out; err = is_file_size_ok(src_inode, dst_inode, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 66dfe5ebf006..1452f62445c3 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1062,9 +1062,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); -extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page); -extern int ceph_try_get_caps(struct ceph_inode_info *ci, +extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); /* for counting open files by mode */ @@ -1075,7 +1075,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_uninline_data(struct file *filp, struct page *locked_page); -extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); +extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ From d468e729b74eafdfc8306ca8f77e1f26478d67da Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:44 +0800 Subject: [PATCH 0455/2880] ceph: add helper function that forcibly reconnects to ceph cluster. It closes mds sessions, drop all caps and invalidates page caches, then use new entity address to reconnect to the cluster. After reconnect, all dirty data/metadata are dropped, file locks get lost sliently. Open files continue to work because client will try renewing caps on later read/write. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 15 ++++++++++++--- fs/ceph/super.c | 26 +++++++++++++++++++++++++- fs/ceph/super.h | 3 ++- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index df4bea231017..ed4d20a54af7 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1394,9 +1394,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = fsc->mdsc; - if (ci->i_wrbuffer_ref > 0 && - READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) - invalidate = true; + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (inode->i_data.nrpages > 0) + invalidate = true; + if (ci->i_wrbuffer_ref > 0) + mapping_set_error(&inode->i_data, -EIO); + } while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, @@ -4369,7 +4372,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) session = __ceph_lookup_mds_session(mdsc, mds); if (!session) continue; + + if (session->s_state == CEPH_MDS_SESSION_REJECTED) + __unregister_session(mdsc, session); + __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); __close_session(mdsc, session); if (session->s_state == CEPH_MDS_SESSION_CLOSING) { @@ -4378,6 +4386,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); kick_requests(mdsc, mds); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ab4868c7308e..a95dd13bb628 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -829,7 +829,6 @@ static void ceph_umount_begin(struct super_block *sb) fsc->mount_state = CEPH_MOUNT_SHUTDOWN; ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_mdsc_force_umount(fsc->mdsc); - return; } static int ceph_remount(struct super_block *sb, int *flags, char *data) @@ -1152,6 +1151,31 @@ static struct file_system_type ceph_fs_type = { }; MODULE_ALIAS_FS("ceph"); +int ceph_force_reconnect(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int err = 0; + + ceph_umount_begin(sb); + + /* Make sure all page caches get invalidated. + * see remove_session_caps_cb() */ + flush_workqueue(fsc->inode_wq); + + /* In case that we were blacklisted. This also reset + * all mon/osd connections */ + ceph_reset_client_addr(fsc->client); + + ceph_osdc_clear_abort_err(&fsc->client->osdc); + fsc->mount_state = CEPH_MOUNT_MOUNTED; + + if (sb->s_root) { + err = __ceph_do_getattr(d_inode(sb->s_root), NULL, + CEPH_STAT_CAP_INODE, true); + } + return err; +} + static int __init init_ceph(void) { int ret = init_caches(); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1452f62445c3..7af7f2ba8da5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -846,7 +846,8 @@ static inline int default_congestion_kb(void) } - +/* super.c */ +extern int ceph_force_reconnect(struct super_block *sb); /* snap.c */ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); From ff5d913dfc7142974eb1694d5fd6284658e46bc6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:45 +0800 Subject: [PATCH 0456/2880] ceph: return -EIO if read/write against filp that lost file locks After mds evicts session, file locks get lost sliently. It's not safe to let programs continue to do read/write. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 28 ++++++++++++++++++++++------ fs/ceph/locks.c | 8 ++++++-- fs/ceph/super.h | 1 + 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bde81aaa3750..102192c90edd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2570,8 +2570,13 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * * FIXME: how does a 0 return differ from -EAGAIN? */ +enum { + NON_BLOCKING = 1, + CHECK_FILELOCK = 2, +}; + static int try_get_cap_refs(struct inode *inode, int need, int want, - loff_t endoff, bool nonblock, int *got) + loff_t endoff, int flags, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; @@ -2586,6 +2591,13 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, again: spin_lock(&ci->i_ceph_lock); + if ((flags & CHECK_FILELOCK) && + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { + dout("try_get_cap_refs %p error filelock\n", inode); + ret = -EIO; + goto out_unlock; + } + /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); if ((file_wanted & need) != need) { @@ -2647,7 +2659,7 @@ again: * we can not call down_read() when * task isn't in TASK_RUNNING state */ - if (nonblock) { + if (flags & NON_BLOCKING) { ret = -EAGAIN; goto out_unlock; } @@ -2752,7 +2764,8 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, if (ret < 0) return ret; - ret = try_get_cap_refs(inode, need, want, 0, nonblock, got); + ret = try_get_cap_refs(inode, need, want, 0, + (nonblock ? NON_BLOCKING : 0), got); return ret == -EAGAIN ? 0 : ret; } @@ -2764,9 +2777,10 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { + struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - int _got, ret; + int ret, _got, flags; ret = ceph_pool_perm_check(inode, need); if (ret < 0) @@ -2776,17 +2790,19 @@ int ceph_get_caps(struct file *filp, int need, int want, if (endoff > 0) check_max_size(inode, endoff); + flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, - false, &_got); + flags, &_got); if (ret == -EAGAIN) continue; if (!ret) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&ci->i_cap_wq, &wait); + flags |= NON_BLOCKING; while (!(ret = try_get_cap_refs(inode, need, want, - endoff, true, &_got))) { + endoff, flags, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 5083e238ad15..544e9e85b120 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -32,14 +32,18 @@ void __init ceph_flock_init(void) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - struct inode *inode = file_inode(src->fl_file); + struct ceph_file_info *fi = dst->fl_file->private_data; + struct inode *inode = file_inode(dst->fl_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); + atomic_inc(&fi->num_locks); } static void ceph_fl_release_lock(struct file_lock *fl) { + struct ceph_file_info *fi = fl->fl_file->private_data; struct inode *inode = file_inode(fl->fl_file); struct ceph_inode_info *ci = ceph_inode(inode); + atomic_dec(&fi->num_locks); if (atomic_dec_and_test(&ci->i_filelock_ref)) { /* clear error when all locks are released */ spin_lock(&ci->i_ceph_lock); @@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, * window. Caller function will decrease the counter. */ fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ceph_inode(inode)->i_filelock_ref); + fl->fl_ops->fl_copy_lock(fl, NULL); } if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7af7f2ba8da5..736318210bc9 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -707,6 +707,7 @@ struct ceph_file_info { struct list_head rw_contexts; errseq_t meta_err; + atomic_t num_locks; }; struct ceph_dir_file_info { From 81f148a910045cd0a139f589a0b42764b172f8f5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:46 +0800 Subject: [PATCH 0457/2880] ceph: invalidate all write mode filp after reconnect Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 13 +++++++++++++ fs/ceph/file.c | 1 + fs/ceph/super.c | 2 ++ fs/ceph/super.h | 3 +++ 4 files changed, 19 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 102192c90edd..d17bde5d4f9a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2780,12 +2780,17 @@ int ceph_get_caps(struct file *filp, int need, int want, struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int ret, _got, flags; ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) + return -EBADF; + while (true) { if (endoff > 0) check_max_size(inode, endoff); @@ -2814,6 +2819,14 @@ int ceph_get_caps(struct file *filp, int need, int want, if (ret == -EAGAIN) continue; } + + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) { + if (ret >= 0 && _got) + ceph_put_cap_refs(ci, _got); + return -EBADF; + } + if (ret < 0) { if (ret == -ESTALE) { /* session was killed, try renew caps */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index fb007b75fb17..779bf68afd60 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -234,6 +234,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); fi->meta_err = errseq_sample(&ci->i_meta_err); + fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); return 0; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a95dd13bb628..630549ac4f48 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -664,6 +664,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->sb = NULL; fsc->mount_state = CEPH_MOUNT_MOUNTING; + fsc->filp_gen = 1; atomic_long_set(&fsc->writeback_count, 0); @@ -829,6 +830,7 @@ static void ceph_umount_begin(struct super_block *sb) fsc->mount_state = CEPH_MOUNT_SHUTDOWN; ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_mdsc_force_umount(fsc->mdsc); + fsc->filp_gen++; // invalidate open files } static int ceph_remount(struct super_block *sb, int *flags, char *data) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 736318210bc9..f5e5f6a6bfb8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -101,6 +101,8 @@ struct ceph_fs_client { struct ceph_client *client; unsigned long mount_state; + + u32 filp_gen; loff_t max_file_size; struct ceph_mds_client *mdsc; @@ -707,6 +709,7 @@ struct ceph_file_info { struct list_head rw_contexts; errseq_t meta_err; + u32 filp_gen; atomic_t num_locks; }; From 131d7eb4faa1fc06b08b633aff0b59ae85f1938e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:47 +0800 Subject: [PATCH 0458/2880] ceph: auto reconnect after blacklisted Make client use osd reply and session message to infer if itself is blacklisted. Client reconnect to cluster using new entity addr if it is blacklisted. Auto reconnect is limited to once every 30 minutes. Auto reconnect is disabled by default. It can be enabled/disabled by recover_session= mount option. In 'clean' mode, client drops any dirty data/metadata, invalidates page caches and invalidates all writable file handles. After reconnect, file locks become stale because MDS loses track of them. If an inode contains any stale file locks, read/write on the indoe are not allowed until applications release all stale file locks. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- Documentation/filesystems/ceph.txt | 14 ++++++++++++ fs/ceph/addr.c | 22 ++++++++++++++----- fs/ceph/file.c | 8 ++++++- fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++-- fs/ceph/super.c | 19 +++++++++++++++++ fs/ceph/super.h | 4 ++++ 6 files changed, 93 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index d2c6a5ccf0f5..b19b6a03f91c 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -158,6 +158,20 @@ Mount Options copies. Currently, it's only used in copy_file_range, which will revert to the default VFS implementation if this option is used. + recover_session= + Set auto reconnect mode in the case where the client is blacklisted. The + available modes are "no" and "clean". The default is "no". + + * no: never attempt to reconnect when client detects that it has been + blacklisted. Operations will generally fail after being blacklisted. + + * clean: client reconnects to the ceph cluster automatically when it + detects that it has been blacklisted. During reconnect, client drops + dirty data/metadata, invalidates page caches and writable file handles. + After reconnect, file locks become stale because the MDS loses track + of them. If an inode contains any stale file locks, read/write on the + inode is not allowed until applications release all stale file locks. + More Information ================ diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2d6e23e32e30..62602283557f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int err = 0; u64 off = page_offset(page); u64 len = PAGE_SIZE; @@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); - err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - off, &len, + err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); ceph_fscache_readpage_cancel(inode, page); + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out; } if (err < PAGE_SIZE) @@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req) int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + if (rc == -EBLACKLISTED) + ceph_inode_to_client(inode)->blacklisted = true; /* unlock all pages, zeroing any data we didn't read */ osd_data = osd_req_op_extent_osd_data(req, 0); @@ -641,6 +644,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) end_page_writeback(page); return err; } + if (err == -EBLACKLISTED) + fsc->blacklisted = true; dout("writepage setting page/mapping error %d %p\n", err, page); SetPageError(page); @@ -721,6 +726,8 @@ static void writepages_finish(struct ceph_osd_request *req) if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); + if (rc == -EBLACKLISTED) + fsc->blacklisted = true; } else { ceph_clear_error_write(ci); } @@ -1948,12 +1955,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, if (err >= 0 || err == -ENOENT) have |= POOL_READ; - else if (err != -EPERM) + else if (err != -EPERM) { + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out_unlock; + } if (err2 == 0 || err2 == -EEXIST) have |= POOL_WRITE; else if (err2 != -EPERM) { + if (err2 == -EBLACKLISTED) + fsc->blacklisted = true; err = err2; goto out_unlock; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 779bf68afd60..5182e1a49d6f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -698,7 +698,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ceph_release_page_vector(pages, num_pages); } - if (ret <= 0 || off >= i_size || !more) + if (ret < 0) { + if (ret == -EBLACKLISTED) + fsc->blacklisted = true; + break; + } + + if (off >= i_size || !more) break; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ed4d20a54af7..5bfbff8f2f64 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3032,18 +3032,23 @@ bad: pr_err("mdsc_handle_forward decode error err=%d\n", err); } -static int __decode_and_drop_session_metadata(void **p, void *end) +static int __decode_session_metadata(void **p, void *end, + bool *blacklisted) { /* map */ u32 n; + bool err_str; ceph_decode_32_safe(p, end, n, bad); while (n-- > 0) { u32 len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + err_str = !strncmp(*p, "error_string", len); *p += len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + if (err_str && strnstr(*p, "blacklisted", len)) + *blacklisted = true; *p += len; } return 0; @@ -3067,6 +3072,7 @@ static void handle_session(struct ceph_mds_session *session, u64 seq; unsigned long features = 0; int wake = 0; + bool blacklisted = false; /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); @@ -3079,7 +3085,7 @@ static void handle_session(struct ceph_mds_session *session, if (msg_version >= 3) { u32 len; /* version >= 2, metadata */ - if (__decode_and_drop_session_metadata(&p, end) < 0) + if (__decode_session_metadata(&p, end, &blacklisted) < 0) goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); @@ -3166,6 +3172,8 @@ static void handle_session(struct ceph_mds_session *session, session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); + if (blacklisted) + mdsc->fsc->blacklisted = true; wake = 2; /* for good measure */ break; @@ -4015,7 +4023,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); } +static void maybe_recover_session(struct ceph_mds_client *mdsc) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) + return; + + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) + return; + + if (!READ_ONCE(fsc->blacklisted)) + return; + + if (fsc->last_auto_reconnect && + time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) + return; + + pr_info("auto reconnect after blacklisted\n"); + fsc->last_auto_reconnect = jiffies; + ceph_force_reconnect(fsc->sb); +} /* * delayed work -- periodically trim expired leases, renew caps with mds @@ -4089,6 +4117,8 @@ static void delayed_work(struct work_struct *work) ceph_trim_snapid_map(mdsc); + maybe_recover_session(mdsc); + schedule_delayed(mdsc); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 630549ac4f48..03b63b1cd32c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -143,6 +143,7 @@ enum { Opt_snapdirname, Opt_mds_namespace, Opt_fscache_uniq, + Opt_recover_session, Opt_last_string, /* string args above */ Opt_dirstat, @@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = { /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, {Opt_mds_namespace, "mds_namespace=%s"}, + {Opt_recover_session, "recover_session=%s"}, {Opt_fscache_uniq, "fsc=%s"}, /* string args above */ {Opt_dirstat, "dirstat"}, @@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private) if (!fsopt->mds_namespace) return -ENOMEM; break; + case Opt_recover_session: + if (!strncmp(argstr[0].from, "no", + argstr[0].to - argstr[0].from)) { + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; + } else if (!strncmp(argstr[0].from, "clean", + argstr[0].to - argstr[0].from)) { + fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; + } else { + return -EINVAL; + } + break; case Opt_fscache_uniq: kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = kstrndup(argstr[0].from, @@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->mds_namespace) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) + seq_show_option(m, "recover_session", "clean"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -1169,6 +1186,8 @@ int ceph_force_reconnect(struct super_block *sb) ceph_reset_client_addr(fsc->client); ceph_osdc_clear_abort_err(&fsc->client->osdc); + + fsc->blacklisted = false; fsc->mount_state = CEPH_MOUNT_MOUNTED; if (sb->s_root) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f5e5f6a6bfb8..2105c2c7a2a5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -31,6 +31,7 @@ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ @@ -102,6 +103,9 @@ struct ceph_fs_client { unsigned long mount_state; + unsigned long last_auto_reconnect; + bool blacklisted; + u32 filp_gen; loff_t max_file_size; From b72b13eb203835a8a07c280939dd17584c6a688e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Jul 2019 12:35:52 -0400 Subject: [PATCH 0459/2880] ceph: don't SetPageError on writepage errors We already mark the mapping in that case, and doing this can cause false positives to occur at fsync time, as well as spurious read errors. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 62602283557f..90e8f8487aaf 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -573,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode, /* * Write a single page, but leave the page locked. * - * If we get a write error, set the page error bit, but still adjust the + * If we get a write error, mark the mapping for error, but still adjust the * dirty page accounting (i.e., page is no longer dirty). */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) @@ -648,7 +648,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) fsc->blacklisted = true; dout("writepage setting page/mapping error %d %p\n", err, page); - SetPageError(page); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { From 5de16b30d3121eca26c8c898cc0eff090e21dda6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 23 Jul 2019 15:09:50 -0400 Subject: [PATCH 0460/2880] ceph: remove ceph_get_cap_mds and __ceph_get_cap_mds Nothing calls these routines. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 31 ------------------------------- fs/ceph/super.h | 1 - 2 files changed, 32 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d17bde5d4f9a..4615f2501e15 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -457,37 +457,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) return cap; } -/* - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. - */ -static int __ceph_get_cap_mds(struct ceph_inode_info *ci) -{ - struct ceph_cap *cap; - int mds = -1; - struct rb_node *p; - - /* prefer mds with WR|BUFFER|EXCL caps */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - cap = rb_entry(p, struct ceph_cap, ci_node); - mds = cap->mds; - if (cap->issued & (CEPH_CAP_FILE_WR | - CEPH_CAP_FILE_BUFFER | - CEPH_CAP_FILE_EXCL)) - break; - } - return mds; -} - -int ceph_get_cap_mds(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int mds; - spin_lock(&ci->i_ceph_lock); - mds = __ceph_get_cap_mds(ceph_inode(inode)); - spin_unlock(&ci->i_ceph_lock); - return mds; -} - /* * Called under i_ceph_lock. */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2105c2c7a2a5..cf16308800fa 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1052,7 +1052,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); -extern int ceph_get_cap_mds(struct inode *inode); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, From 606d102327a45a49d293557527802ee7fbfd7af1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 22 Jul 2019 13:12:01 -0400 Subject: [PATCH 0461/2880] ceph: fetch cap_gen under spinlock in ceph_add_cap It's protected by the s_gen_ttl_lock, so we should fetch under it and ensure that we're using the same generation in both places. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 4615f2501e15..bdfec8978479 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -614,6 +614,7 @@ void ceph_add_cap(struct inode *inode, struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; + u32 gen; dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); @@ -625,6 +626,10 @@ void ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); + spin_lock(&session->s_gen_ttl_lock); + gen = session->s_cap_gen; + spin_unlock(&session->s_gen_ttl_lock); + cap = __get_cap_for_mds(ci, mds); if (!cap) { cap = *new_cap; @@ -650,7 +655,7 @@ void ceph_add_cap(struct inode *inode, list_move_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); - if (cap->cap_gen < session->s_cap_gen) + if (cap->cap_gen < gen) cap->issued = cap->implemented = CEPH_CAP_PIN; /* @@ -744,7 +749,7 @@ void ceph_add_cap(struct inode *inode, cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; - cap->cap_gen = session->s_cap_gen; + cap->cap_gen = gen; if (fmode >= 0) __ceph_get_fmode(ci, fmode); From 533a2818dd1a00cdd32d638fea0178e25a683053 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 19 Jul 2019 15:22:28 -0400 Subject: [PATCH 0462/2880] ceph: eliminate session->s_trim_caps It's only used to keep count of caps being trimmed, but that requires that we hold the session->s_mutex to prevent multiple trimming operations from running concurrently. We can achieve the same effect using an integer on the stack, which allows us to (eventually) not need the s_mutex. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 17 ++++++++--------- fs/ceph/mds_client.h | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5bfbff8f2f64..959dcf2ab0b8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -639,7 +639,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); s->s_nr_caps = 0; - s->s_trim_caps = 0; refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); @@ -1722,11 +1721,11 @@ out: */ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_mds_session *session = arg; + int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; - if (session->s_trim_caps <= 0) + if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); @@ -1763,7 +1762,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if (oissued) { /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); - session->s_trim_caps--; + (*remaining)--; } else { struct dentry *dentry; /* try dropping referring dentries */ @@ -1775,7 +1774,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) d_prune_aliases(inode); count = atomic_read(&inode->i_count); if (count == 1) - session->s_trim_caps--; + (*remaining)--; dout("trim_caps_cb %p cap %p pruned, count now %d\n", inode, cap, count); } else { @@ -1801,12 +1800,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, dout("trim_caps mds%d start: %d / %d, trim %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { - session->s_trim_caps = trim_caps; - ceph_iterate_session_caps(session, trim_caps_cb, session); + int remaining = trim_caps; + + ceph_iterate_session_caps(session, trim_caps_cb, &remaining); dout("trim_caps mds%d done: %d / %d, trimmed %d\n", session->s_mds, session->s_nr_caps, max_caps, - trim_caps - session->s_trim_caps); - session->s_trim_caps = 0; + trim_caps - remaining); } ceph_flush_cap_releases(mdsc, session); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 810fd8689dff..5cd131b41d84 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -176,7 +176,7 @@ struct ceph_mds_session { spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ struct ceph_cap *s_cap_iterator; - int s_nr_caps, s_trim_caps; + int s_nr_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; From 354c63a0033f1ad4bcb208e796f997bfaf946e6d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 19 Jul 2019 09:41:02 -0400 Subject: [PATCH 0463/2880] ceph: fix comments over ceph_add_cap We actually need the ci->i_ceph_lock here. The necessity of the s_mutex is less clear. Also add a lockdep assertion for the i_ceph_lock. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bdfec8978479..2563f42445e6 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -597,7 +597,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, /* * Add a capability under the given MDS session. * - * Caller should hold session snap_rwsem (read) and s_mutex. + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an @@ -616,6 +616,8 @@ void ceph_add_cap(struct inode *inode, int actual_wanted; u32 gen; + lockdep_assert_held(&ci->i_ceph_lock); + dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); From 9f3345d8ec5e226d116ddb7fb86ee71a08a3b9a7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Jul 2019 12:27:57 -0400 Subject: [PATCH 0464/2880] ceph: have __mark_caps_flushing return flush_tid Currently, this function returns ci->i_dirty_caps, but the callers have to check that that isn't 0 before calling this function. Have the callers grab that value directly out of the inode, and have __mark_caps_flushing return the flush_tid instead. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2563f42445e6..6b8300d72cac 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1722,11 +1722,11 @@ static bool __finish_cap_flush(struct ceph_mds_client *mdsc, * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_ceph_lock. + * Called under i_ceph_lock. Returns the flush tid. */ -static int __mark_caps_flushing(struct inode *inode, +static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, - u64 *flush_tid, u64 *oldest_flush_tid) + u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); @@ -1765,8 +1765,7 @@ static int __mark_caps_flushing(struct inode *inode, list_add_tail(&cf->i_list, &ci->i_cap_flush_list); - *flush_tid = cf->tid; - return flushing; + return cf->tid; } /* @@ -2056,9 +2055,9 @@ ack: } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { - flushing = __mark_caps_flushing(inode, session, false, - &flush_tid, - &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, false, + &oldest_flush_tid); } else { flushing = 0; flush_tid = 0; @@ -2137,8 +2136,9 @@ retry_locked: goto retry_locked; } - flushing = __mark_caps_flushing(inode, session, true, - &flush_tid, &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, true, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, From 27b0a392095d30e452772d4ae636150fd7f2172d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Jul 2019 13:26:29 -0400 Subject: [PATCH 0465/2880] ceph: remove unneeded test in try_flush_caps cap->session is always non-NULL, so we can just do a single test for equality w/o testing explicitly for a NULL pointer. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6b8300d72cac..bb91abaf7559 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2114,7 +2114,7 @@ retry_locked: struct ceph_cap *cap = ci->i_auth_cap; int delayed; - if (!session || session != cap->session) { + if (session != cap->session) { spin_unlock(&ci->i_ceph_lock); if (session) mutex_unlock(&session->s_mutex); From daca8bda95d868b528c2fcf01e3d74c3eaf03889 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Jul 2019 10:55:38 -0400 Subject: [PATCH 0466/2880] ceph: remove CEPH_I_NOFLUSH Nothing sets this flag. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 10 ---------- fs/ceph/super.h | 19 +++++++++---------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bb91abaf7559..b1c80d837d0d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2003,11 +2003,6 @@ retry_locked: } ack: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - dout(" skipping %p I_NOFLUSH set\n", inode); - continue; - } - if (session && session != cap->session) { dout("oops, wrong session %p mutex\n", session); mutex_unlock(&session->s_mutex); @@ -2105,11 +2100,6 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) retry: spin_lock(&ci->i_ceph_lock); retry_locked: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - spin_unlock(&ci->i_ceph_lock); - dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); - goto out; - } if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; int delayed; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cf16308800fa..03e4828c7635 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -507,16 +507,15 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ -#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ -#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ -#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ -#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ -#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ -#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ -#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ -#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ -#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ +#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ +#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ +#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ /* From 98cd281a76bdf482dd47b282e277c76840ba8eef Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Jul 2019 13:11:51 -0400 Subject: [PATCH 0467/2880] ceph: remove incorrect comment above __send_cap It doesn't do anything to invalidate the cache when dropping RD caps. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b1c80d837d0d..d3b9c9d5c1bd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1260,10 +1260,6 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. * - * Make half-hearted attempt ot to invalidate page cache if we are - * dropping RDCACHE. Note that this will leave behind locked pages - * that we'll then need to deal with elsewhere. - * * Return non-zero if delayed release, or we experienced an error * such that the caller should requeue + retry later. * From 0ed26f3693fbe0aec107e2e97a053b3eabebfc92 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 29 Jul 2019 11:33:08 +0200 Subject: [PATCH 0468/2880] ceph: fix indentation in __get_snap_name() Reported-by: kbuild test robot Reported-by: Julia Lawall Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 15ff1b09cfa2..020d39a85ecc 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name, if (err < 0) goto out; - rinfo = &req->r_reply_info; - for (i = 0; i < rinfo->dir_nr; i++) { - rde = rinfo->dir_entries + i; - BUG_ON(!rde->inode.in); - if (ceph_snap(inode) == - le64_to_cpu(rde->inode.in->snapid)) { - memcpy(name, rde->name, rde->name_len); - name[rde->name_len] = '\0'; - err = 0; - goto out; - } - } + rinfo = &req->r_reply_info; + for (i = 0; i < rinfo->dir_nr; i++) { + rde = rinfo->dir_entries + i; + BUG_ON(!rde->inode.in); + if (ceph_snap(inode) == + le64_to_cpu(rde->inode.in->snapid)) { + memcpy(name, rde->name, rde->name_len); + name[rde->name_len] = '\0'; + err = 0; + goto out; + } + } - if (rinfo->dir_end) - break; + if (rinfo->dir_end) + break; - BUG_ON(rinfo->dir_nr <= 0); - rde = rinfo->dir_entries + (rinfo->dir_nr - 1); - next_offset += rinfo->dir_nr; - last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); - if (!last_name) { - err = -ENOMEM; - goto out; - } + BUG_ON(rinfo->dir_nr <= 0); + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); + next_offset += rinfo->dir_nr; + last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); + if (!last_name) { + err = -ENOMEM; + goto out; + } - ceph_mdsc_put_request(req); - req = NULL; + ceph_mdsc_put_request(req); + req = NULL; } err = -ENOENT; out: From c62498d7f9d37d5e60d61ca2a4e1f88211af7645 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 25 Jul 2019 13:03:32 -0400 Subject: [PATCH 0469/2880] ceph: update the mtime when truncating up If we have Fx caps, and the we're truncating the size to be larger, then we'll cache the size attribute change, but the mtime won't be updated. Move the size handling before the mtime, and add ATTR_MTIME to ia_valid in that case to make sure the mtime also gets updated. This fixes xfstest generic/313. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 332433b490f5..9f135624ae47 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1989,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = { int __ceph_setattr(struct inode *inode, struct iattr *attr) { struct ceph_inode_info *ci = ceph_inode(inode); - const unsigned int ia_valid = attr->ia_valid; + unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf; @@ -2094,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + i_size_write(inode, attr->ia_size); + inode->i_blocks = calc_inode_blocks(attr->ia_size); + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + ia_valid |= ATTR_MTIME; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + } + } if (ia_valid & ATTR_MTIME) { dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, @@ -2116,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } - if (ia_valid & ATTR_SIZE) { - dout("setattr %p size %lld -> %lld\n", inode, - inode->i_size, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && - attr->ia_size > inode->i_size) { - i_size_write(inode, attr->ia_size); - inode->i_blocks = calc_inode_blocks(attr->ia_size); - ci->i_reported_size = attr->ia_size; - dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - attr->ia_size != inode->i_size) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = - cpu_to_le64(inode->i_size); - mask |= CEPH_SETATTR_SIZE; - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; - } - } /* these do nothing */ if (ia_valid & ATTR_CTIME) { From 249c1df59a508969f5263ac7e3aedfce7f52c556 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 1 Aug 2019 10:06:40 -0400 Subject: [PATCH 0470/2880] ceph: don't freeze during write page faults Prevent freezing operations during write page faults. This is good practice for most filesystems, but especially for ceph since we're monkeying with the signal table here. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 90e8f8487aaf..9efd51926792 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1548,6 +1548,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) if (!prealloc_cf) return VM_FAULT_OOM; + sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); if (ci->i_inline_version != CEPH_INLINE_NONE) { @@ -1622,6 +1623,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_cap_refs(ci, got); out_free: ceph_restore_sigs(&oldset); + sb_end_pagefault(inode->i_sb); ceph_free_cap_flush(prealloc_cf); if (err < 0) ret = vmf_error(err); From 3e8730fac951bf89b3adbb040445bb688af1fd22 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 1 Aug 2019 18:06:58 -0700 Subject: [PATCH 0471/2880] ceph: don't return a value from void function This fixes a build warning to that effect. Fixes: 1a829ff2a6c3 ("ceph: no need to check return value of debugfs_create functions") Signed-off-by: John Hubbard Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 2eb88ed22993..facb387c2735 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - return 0; } void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) From 4766815b1179dbe4fe263a5f95795710e29276e2 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Wed, 3 Jul 2019 17:59:20 +0200 Subject: [PATCH 0472/2880] libceph: handle OSD op ceph_pagelist_append() errors osd_req_op_cls_init() and osd_req_op_xattr_init() currently propagate ceph_pagelist_alloc() ENOMEM errors but ignore ceph_pagelist_append() memory allocation failures. Add these checks and cleanup on error. Signed-off-by: David Disseldorp Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6f49c59e5e58..2f44971e1347 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; + int ret; op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); @@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, size = strlen(class); BUG_ON(size > (size_t) U8_MAX); op->cls.class_len = size; - ceph_pagelist_append(pagelist, class, size); + ret = ceph_pagelist_append(pagelist, class, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->cls.method_name = method; size = strlen(method); BUG_ON(size > (size_t) U8_MAX); op->cls.method_len = size; - ceph_pagelist_append(pagelist, method, size); + ret = ceph_pagelist_append(pagelist, method, size); + if (ret) + goto err_pagelist_free; payload_len += size; osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); - op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; + int ret; BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); @@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, payload_len = strlen(name); op->xattr.name_len = payload_len; - ceph_pagelist_append(pagelist, name, payload_len); + ret = ceph_pagelist_append(pagelist, name, payload_len); + if (ret) + goto err_pagelist_free; op->xattr.value_len = size; - ceph_pagelist_append(pagelist, value, size); + ret = ceph_pagelist_append(pagelist, value, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->xattr.cmp_op = cmp_op; @@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_xattr_init); From 321fe13c939876b55fbb7780a243c86e577a2151 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 2 Aug 2019 13:15:39 -0400 Subject: [PATCH 0473/2880] ceph: add buffered/direct exclusionary locking for reads and writes xfstest generic/451 intermittently fails. The test does O_DIRECT writes to a file, and then reads back the result using buffered I/O, while running a separate set of tasks that are also doing buffered reads. The client will invalidate the cache prior to a direct write, but it's easy for one of the other readers' replies to race in and reinstantiate the invalidated range with stale data. To fix this, we must to serialize direct I/O writes and buffered reads. We could just sprinkle in some shared locks on the i_rwsem for reads, and increase the exclusive footprint on the write side, but that would cause O_DIRECT writes to end up serialized vs. other direct requests. Instead, borrow the scheme used by nfs.ko. Buffered writes take the i_rwsem exclusively, but buffered reads take a shared lock, allowing them to run in parallel. O_DIRECT requests also take a shared lock, but we need for them to not run in parallel with buffered reads. A flag on the ceph_inode_info is used to indicate whether it's in direct or buffered I/O mode. When a conflicting request is submitted, it will block until the inode can be flipped to the necessary mode. Link: https://tracker.ceph.com/issues/40985 Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/Makefile | 2 +- fs/ceph/file.c | 35 ++++++---- fs/ceph/io.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/io.h | 12 ++++ fs/ceph/super.h | 4 +- 5 files changed, 200 insertions(+), 16 deletions(-) create mode 100644 fs/ceph/io.c create mode 100644 fs/ceph/io.h diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index a699e320393f..c1da294418d1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ - export.o caps.o snap.o xattr.o quota.o \ + export.o caps.o snap.o xattr.o quota.o io.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5182e1a49d6f..ff17a81bf2e2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -15,6 +15,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "io.h" static __le32 ceph_flags_sys2wire(u32 flags) { @@ -930,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_aio_request *aio_req = NULL; int num_pages = 0; int flags; - int ret; + int ret = 0; struct timespec64 mtime = current_time(inode); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; @@ -944,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (write ? "write" : "read"), file, pos, (unsigned)count, snapc, snapc ? snapc->seq : 0); - ret = filemap_write_and_wait_range(inode->i_mapping, - pos, pos + count - 1); - if (ret < 0) - return ret; - if (write) { int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, @@ -1284,12 +1280,16 @@ again: if (ci->i_inline_version == CEPH_INLINE_NONE) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ceph_start_io_direct(inode); ret = ceph_direct_read_write(iocb, to, NULL, NULL); + ceph_end_io_direct(inode); if (ret >= 0 && ret < len) retry_op = CHECK_EOF; } else { + ceph_start_io_read(inode); ret = ceph_sync_read(iocb, to, &retry_op); + ceph_end_io_read(inode); } } else { retry_op = READ_INLINE; @@ -1300,7 +1300,9 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); + ceph_start_io_read(inode); ret = generic_file_read_iter(iocb, to); + ceph_end_io_read(inode); ceph_del_rw_context(fi, &rw_ctx); } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", @@ -1409,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; retry_snap: - inode_lock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_start_io_direct(inode); + else + ceph_start_io_write(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1480,7 +1485,6 @@ retry_snap: (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { struct ceph_snap_context *snapc; struct iov_iter data; - inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1497,11 +1501,14 @@ retry_snap: /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) { written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); - else + ceph_end_io_direct(inode); + } else { written = ceph_sync_write(iocb, &data, pos, snapc); + ceph_end_io_write(inode); + } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1516,7 +1523,7 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - inode_unlock(inode); + ceph_end_io_write(inode); } if (written >= 0) { @@ -1551,9 +1558,11 @@ retry_snap: } goto out_unlocked; - out: - inode_unlock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_write(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; diff --git a/fs/ceph/io.c b/fs/ceph/io.c new file mode 100644 index 000000000000..97602ea92ff4 --- /dev/null +++ b/fs/ceph/io.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Trond Myklebust + * Copyright (c) 2019 Jeff Layton + * + * I/O and data path helper functionality. + * + * Heavily borrowed from equivalent code in fs/nfs/io.c + */ + +#include + +#include +#include +#include +#include + +#include "super.h" +#include "io.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + inode_dio_wait(inode); + } +} + +/** + * ceph_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +ceph_start_io_read(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * ceph_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered write operation is about to start, and ensure + * that we block all direct I/O. + */ +void +ceph_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ceph_inode(inode), inode); +} + +/** + * ceph_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +ceph_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + /* FIXME: unmap_mapping_range? */ + filemap_write_and_wait(inode->i_mapping); + } +} + +/** + * ceph_end_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +ceph_start_io_direct(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_buffered(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/ceph/io.h b/fs/ceph/io.h new file mode 100644 index 000000000000..fa594cd77348 --- /dev/null +++ b/fs/ceph/io.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_IO_H +#define _FS_CEPH_IO_H + +void ceph_start_io_read(struct inode *inode); +void ceph_end_io_read(struct inode *inode); +void ceph_start_io_write(struct inode *inode); +void ceph_end_io_write(struct inode *inode); +void ceph_start_io_direct(struct inode *inode); +void ceph_end_io_direct(struct inode *inode); + +#endif /* FS_CEPH_IO_H */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 03e4828c7635..98d7190289c8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -513,10 +513,10 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ #define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ #define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ -#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ +#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ #define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ - +#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ /* * Masks of ceph inode work. From 668959a53578076d836905c0bb51f5cfadf1c343 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Aug 2019 09:07:51 -0400 Subject: [PATCH 0474/2880] ceph: turn ceph_security_invalidate_secctx into static inline No need to do an extra jump here. Also add some comments on the endifs. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.h | 6 +++++- fs/ceph/xattr.c | 9 ++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 98d7190289c8..f98d9247f9cb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -971,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in) #ifdef CONFIG_CEPH_FS_SECURITY_LABEL extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx); -extern void ceph_security_invalidate_secctx(struct inode *inode); +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ + security_inode_invalidate_secctx(inode); +} #else static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2fba06b50f25..5c608caf0190 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1265,11 +1265,6 @@ out: return err; } -void ceph_security_invalidate_secctx(struct inode *inode) -{ - security_inode_invalidate_secctx(inode); -} - static int ceph_xattr_set_security_label(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, const void *buf, @@ -1298,8 +1293,8 @@ static const struct xattr_handler ceph_security_label_handler = { .get = ceph_xattr_get_security_label, .set = ceph_xattr_set_security_label, }; -#endif -#endif +#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ +#endif /* CONFIG_SECURITY */ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) { From 026105ebb0364fb2155ef3eba65f250efc7aeb45 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Aug 2019 10:41:40 -0400 Subject: [PATCH 0475/2880] ceph: only set CEPH_I_SEC_INITED if we got a MAC label __ceph_getxattr will set the CEPH_I_SEC_INITED flag whenever it gets any xattr that starts with "security.". We only want to set that flag when fetching the MAC label for the currently-active LSM, however. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5c608caf0190..410eaf1ba211 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -892,7 +892,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, memcpy(value, xattr->val, xattr->val_len); if (current->journal_info && - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN)) ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: spin_unlock(&ci->i_ceph_lock); From b8fe918b090442447d821b32b7dd6e17d5b5dfc1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Aug 2019 10:55:41 -0400 Subject: [PATCH 0476/2880] ceph: allow arbitrary security.* xattrs Most filesystems don't limit what security.* xattrs can be set or fetched. I see no reason that we need to limit that on cephfs either. Drop the special xattr handler for "security." xattrs, and allow the "other" xattr handler to handle security xattrs as well. In addition to fixing xfstest generic/093, this allows us to support per-file capabilities (a'la setcap(8)). Link: https://tracker.ceph.com/issues/41135 Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 410eaf1ba211..cb18ee637cb7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci, static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || + return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -1265,35 +1266,6 @@ out: ceph_pagelist_release(pagelist); return err; } - -static int ceph_xattr_set_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, const void *buf, - size_t buflen, int flags) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_setxattr(inode, name, buf, buflen, flags); - } - return -EOPNOTSUPP; -} - -static int ceph_xattr_get_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, void *buf, size_t buflen) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_getxattr(inode, name, buf, buflen); - } - return -EOPNOTSUPP; -} - -static const struct xattr_handler ceph_security_label_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = ceph_xattr_get_security_label, - .set = ceph_xattr_set_security_label, -}; #endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ #endif /* CONFIG_SECURITY */ @@ -1318,9 +1290,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = { #ifdef CONFIG_CEPH_FS_POSIX_ACL &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, -#endif -#ifdef CONFIG_CEPH_FS_SECURITY_LABEL - &ceph_security_label_handler, #endif &ceph_other_xattr_handler, NULL, From 5435d2069503e2aa89c34a94154f4f2fa4a0c9c4 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Fri, 9 Aug 2019 07:05:27 +0000 Subject: [PATCH 0477/2880] rbd: fix response length parameter for encoded strings rbd_dev_image_id() allocates space for length but passes a smaller value to rbd_obj_method_sync(). rbd_dev_v2_object_prefix() doesn't allocate space for length. Fix both to be consistent. Signed-off-by: Dongsheng Yang Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c8fb886aebd4..69db7385c8df 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5669,17 +5669,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) { + size_t size; void *reply_buf; int ret; void *p; - reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); + /* Response will be an encoded string, which includes a length */ + size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; + reply_buf = kzalloc(size, GFP_KERNEL); if (!reply_buf) return -ENOMEM; ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, &rbd_dev->header_oloc, "get_object_prefix", - NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); + NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -6696,7 +6699,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) dout("rbd id object name is %s\n", oid.name); /* Response will be an encoded string, which includes a length */ - size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; response = kzalloc(size, GFP_NOIO); if (!response) { @@ -6708,7 +6710,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX); + response, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret == -ENOENT) { image_id = kstrdup("", GFP_KERNEL); From 96ac9158a230e467d4be737c82702355c6838fc4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 8 Aug 2019 20:56:47 -0700 Subject: [PATCH 0478/2880] ceph: use release_pages() directly release_pages() has been available to modules since Oct, 2010, when commit 0be8557bcd34 ("fuse: use release_pages()") added EXPORT_SYMBOL(release_pages). However, this ceph code was still using a workaround. Remove the workaround, and call release_pages() directly. Signed-off-by: John Hubbard Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9efd51926792..7ab616601141 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -684,23 +684,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return err; } -/* - * lame release_pages helper. release_pages() isn't exported to - * modules. - */ -static void ceph_release_pages(struct page **pages, int num) -{ - struct pagevec pvec; - int i; - - pagevec_init(&pvec); - for (i = 0; i < num; i++) { - if (pagevec_add(&pvec, pages[i]) == 0) - pagevec_release(&pvec); - } - pagevec_release(&pvec); -} - /* * async writeback completion handler. * @@ -776,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req) dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", inode, osd_data->length, rc >= 0 ? num_pages : 0); - ceph_release_pages(osd_data->pages, num_pages); + release_pages(osd_data->pages, num_pages); } ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); From 8edf84ba4d0ecad7d1c5b393ba004b959dae76ea Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 21 Aug 2019 13:57:18 +0200 Subject: [PATCH 0479/2880] libceph: drop unused con parameter of calc_target() This bit was omitted from a561372405cf ("libceph: fix PG split vs OSD (re)connect race") to avoid backport conflicts. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2f44971e1347..ba45b074a362 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1505,7 +1505,6 @@ enum calc_target_result { static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_osd_request_target *t, - struct ceph_connection *con, bool any_change) { struct ceph_pg_pool_info *pi; @@ -2289,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) goto promote; @@ -3112,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; } - calc_target(osdc, &lreq->t, NULL, false); + calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); @@ -3729,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; enum calc_target_result ct_res; - ct_res = calc_target(osdc, &lreq->t, NULL, true); + ct_res = calc_target(osdc, &lreq->t, true); if (ct_res == CALC_TARGET_NEED_RESEND) { struct ceph_osd *osd; @@ -3801,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd, n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, - false); + ct_res = calc_target(osdc, &req->r_t, false); switch (ct_res) { case CALC_TARGET_NO_ACTION: force_resend_writes = cleared_full || @@ -3911,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc, n = rb_next(n); if (req->r_t.epoch < osdc->osdmap->epoch) { - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE) { erase_request(need_resend, req); check_pool_dne(req); From 71a228bc8d65900179e37ac309e678f8c523f133 Mon Sep 17 00:00:00 2001 From: Erqi Chen Date: Wed, 28 Aug 2019 21:22:45 +0800 Subject: [PATCH 0480/2880] ceph: reconnect connection if session hang in opening state If client mds session is evicted in CEPH_MDS_SESSION_OPENING state, mds won't send session msg to client, and delayed_work skip CEPH_MDS_SESSION_OPENING state session, the session hang forever. Allow ceph_con_keepalive to reconnect a session in OPENING to avoid session hang. Also, ensure that we skip sessions in RESTARTING and REJECTED states since those states can't be resurrected by issuing a keepalive. Link: https://tracker.ceph.com/issues/41551 Signed-off-by: Erqi Chen chenerqi@gmail.com Reviewed-by: "Yan, Zheng" Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 959dcf2ab0b8..a8a8f84f3bbf 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4088,7 +4088,9 @@ static void delayed_work(struct work_struct *work) pr_info("mds%d hung\n", s->s_mds); } } - if (s->s_state < CEPH_MDS_SESSION_OPEN) { + if (s->s_state == CEPH_MDS_SESSION_NEW || + s->s_state == CEPH_MDS_SESSION_RESTARTING || + s->s_state == CEPH_MDS_SESSION_REJECTED) { /* this mds is failed or recovering, just wait */ ceph_put_mds_session(s); continue; From 21ed05a8bae76b5163996e0bbcaa35cd9eff41d7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 30 Aug 2019 17:31:06 +0200 Subject: [PATCH 0481/2880] rbd: pull rbd_img_request_create() dout out into the callers Make it more informative: log op_type, offset and length for block layer requests and initiating obj_req for child requests. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 69db7385c8df..7c4350c0fb77 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create( mutex_init(&img_request->state_mutex); kref_init(&img_request->kref); - dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, - obj_op_name(op_type), img_request); return img_request; } @@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) __set_bit(IMG_REQ_CHILD, &child_img_req->flags); child_img_req->obj_request = obj_req; + dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, + obj_req); + if (!rbd_img_is_write(img_req)) { switch (img_req->data_type) { case OBJ_REQUEST_BIO: @@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work) img_request->rq = rq; snapc = NULL; /* img_request consumes a ref */ + dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, + img_request, obj_op_name(op_type), offset, length); + if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) result = rbd_img_fill_nodata(img_request, offset, length); else From 536cc331a4a714289334f0a0ce84318bb7e680d4 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Sat, 31 Aug 2019 23:57:12 +0200 Subject: [PATCH 0482/2880] ceph: move static keyword to the front of declarations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the static keyword to the front of declarations of snap_handle_length, handle_length and connected_handle_length, and resolve the following compiler warnings that can be seen when building with warnings enabled (W=1): fs/ceph/export.c:38:2: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] fs/ceph/export.c:88:2: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] fs/ceph/export.c:90:2: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] Signed-off-by: Krzysztof Wilczynski Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 020d39a85ecc..b6bfa94332c3 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -35,7 +35,7 @@ struct ceph_nfs_snapfh { static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int snap_handle_length = + static const int snap_handle_length = sizeof(struct ceph_nfs_snapfh) >> 2; struct ceph_nfs_snapfh *sfh = (void *)rawfh; u64 snapid = ceph_snap(inode); @@ -85,9 +85,9 @@ out: static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int handle_length = + static const int handle_length = sizeof(struct ceph_nfs_fh) >> 2; - const static int connected_handle_length = + static const int connected_handle_length = sizeof(struct ceph_nfs_confh) >> 2; int type; From 48f930ea6de6f48fd20be54ec4716c545751a6d9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 5 Sep 2019 17:29:29 +0200 Subject: [PATCH 0483/2880] ceph: include ceph_debug.h in cache.c Any file that uses dout() should include ceph_debug.h at the top. Signed-off-by: Ilya Dryomov --- fs/ceph/cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index bc90cf6ad7ed..b2ec29eeb4c4 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -6,6 +6,8 @@ * Written by Milosz Tanski (milosz@adfin.com) */ +#include + #include "super.h" #include "cache.h" From 6fd4e634835208ddb331234bfa51d75396a5c42c Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 9 Sep 2019 16:48:54 +0100 Subject: [PATCH 0484/2880] ceph: allow object copies across different filesystems in the same cluster OSDs are able to perform object copies across different pools. Thus, there's no need to prevent copy_file_range from doing remote copies if the source and destination superblocks are different. Only return -EXDEV if they have different fsid (the cluster ID). Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ff17a81bf2e2..d277f71abe0b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1922,6 +1922,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct ceph_inode_info *src_ci = ceph_inode(src_inode); struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); struct ceph_cap_flush *prealloc_cf; + struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); struct ceph_object_locator src_oloc, dst_oloc; struct ceph_object_id src_oid, dst_oid; loff_t endoff = 0, size; @@ -1931,8 +1932,16 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, int src_got = 0, dst_got = 0, err, dirty; bool do_final_copy = false; - if (src_inode->i_sb != dst_inode->i_sb) - return -EXDEV; + if (src_inode->i_sb != dst_inode->i_sb) { + struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); + + if (ceph_fsid_compare(&src_fsc->client->fsid, + &dst_fsc->client->fsid)) { + dout("Copying files across clusters: src: %pU dst: %pU\n", + &src_fsc->client->fsid, &dst_fsc->client->fsid); + return -EXDEV; + } + } if (ceph_snap(dst_inode) != CEPH_NOSNAP) return -EROFS; @@ -1944,7 +1953,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * efficient). */ - if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) + if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) return -EOPNOTSUPP; if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || @@ -2059,7 +2068,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, dst_ci->i_vino.ino, dst_objnum); /* Do an object remote copy */ err = ceph_osdc_copy_from( - &ceph_inode_to_client(src_inode)->client->osdc, + &src_fsc->client->osdc, src_ci->i_vino.snap, 0, &src_oid, &src_oloc, CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | From 10c12851a022662bf6085bd4384b4ebed4c447ce Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 30 Aug 2019 20:44:24 +0200 Subject: [PATCH 0485/2880] libceph: avoid a __vmalloc() deadlock in ceph_kvmalloc() The vmalloc allocator doesn't fully respect the specified gfp mask: while the actual pages are allocated as requested, the page table pages are always allocated with GFP_KERNEL. ceph_kvmalloc() may be called with GFP_NOFS and GFP_NOIO (for ceph and rbd respectively), so this may result in a deadlock. There is no real reason for the current PAGE_ALLOC_COSTLY_ORDER logic, it's just something that seemed sensible at the time (ceph_kvmalloc() predates kvmalloc()). kvmalloc() is smarter: in an attempt to reduce long term fragmentation, it first tries to kmalloc non-disruptively. Switch to kvmalloc() and set the respective PF_MEMALLOC_* flag using the scope API to avoid the deadlock. Note that kvmalloc() needs to be passed GFP_KERNEL to enable the fallback. Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- net/ceph/ceph_common.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index b412a3ccc4fc..2d568246803f 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); +/* + * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are + * compatible with (a superset of) GFP_KERNEL. This is because while the + * actual pages are allocated with the specified flags, the page table pages + * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take + * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * + * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. + */ void *ceph_kvmalloc(size_t size, gfp_t flags) { - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *ptr = kmalloc(size, flags | __GFP_NOWARN); - if (ptr) - return ptr; + void *p; + + if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { + p = kvmalloc(size, flags); + } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { + unsigned int nofs_flag = memalloc_nofs_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + } else { + unsigned int noio_flag = memalloc_noio_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_noio_restore(noio_flag); } - return __vmalloc(size, flags, PAGE_KERNEL); + return p; } - static int parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; From cf73d882cc51c1f245a890cccb79952a260302d3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 4 Sep 2019 15:40:03 +0200 Subject: [PATCH 0486/2880] libceph: use ceph_kvmalloc() for osdmap arrays osdmap has a bunch of arrays that grow linearly with the number of OSDs. osd_state, osd_weight and osd_primary_affinity take 4 bytes per OSD. osd_addr takes 136 bytes per OSD because of sockaddr_storage. The CRUSH workspace area also grows linearly with the number of OSDs. Normally these arrays are allocated at client startup. The osdmap is usually updated in small incrementals, but once in a while a full map may need to be processed. For a cluster with 10000 OSDs, this means a bunch of 40K allocations followed by a 1.3M allocation, all of which are currently required to be physically contiguous. This results in sporadic ENOMEM errors, hanging the client. Go back to manually (re)allocating arrays and use ceph_kvmalloc() to fall back to non-contiguous allocation when necessary. Link: https://tracker.ceph.com/issues/40481 Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- net/ceph/osdmap.c | 69 +++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 90437906b7bc..4e0de14f80bb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) struct ceph_pg_pool_info, node); __remove_pg_pool(&map->pg_pools, pi); } - kfree(map->osd_state); - kfree(map->osd_weight); - kfree(map->osd_addr); - kfree(map->osd_primary_affinity); - kfree(map->crush_workspace); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + kvfree(map->osd_primary_affinity); + kvfree(map->crush_workspace); kfree(map); } @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) * * The new elements are properly initialized. */ -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) { u32 *state; u32 *weight; struct ceph_entity_addr *addr; + u32 to_copy; int i; - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); - if (!state) + dout("%s old %u new %u\n", __func__, map->max_osd, max); + if (max == map->max_osd) + return 0; + + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + if (!state || !weight || !addr) { + kvfree(state); + kvfree(weight); + kvfree(addr); return -ENOMEM; + } + + to_copy = min(map->max_osd, max); + if (map->osd_state) { + memcpy(state, map->osd_state, to_copy * sizeof(*state)); + memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); + memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + } + map->osd_state = state; - - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); - if (!weight) - return -ENOMEM; map->osd_weight = weight; - - addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); - if (!addr) - return -ENOMEM; map->osd_addr = addr; - for (i = map->max_osd; i < max; i++) { map->osd_state[i] = 0; map->osd_weight[i] = CEPH_OSD_OUT; @@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = krealloc(map->osd_primary_affinity, - max*sizeof(*affinity), GFP_NOFS); + affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + GFP_NOFS); if (!affinity) return -ENOMEM; - map->osd_primary_affinity = affinity; + memcpy(affinity, map->osd_primary_affinity, + to_copy * sizeof(*affinity)); + kvfree(map->osd_primary_affinity); + + map->osd_primary_affinity = affinity; for (i = map->max_osd; i < max; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; @@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - workspace = kmalloc(work_size, GFP_NOIO); + workspace = ceph_kvmalloc(work_size, GFP_NOIO); if (!workspace) { crush_destroy(crush); return -ENOMEM; @@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) if (map->crush) crush_destroy(map->crush); - kfree(map->crush_workspace); + kvfree(map->crush_workspace); map->crush = crush; map->crush_workspace = workspace; return 0; @@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = kmalloc_array(map->max_osd, - sizeof(u32), - GFP_NOFS); + map->osd_primary_affinity = ceph_kvmalloc( + array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), + GFP_NOFS); if (!map->osd_primary_affinity) return -ENOMEM; @@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end, ceph_decode_32_safe(p, end, len, e_inval); if (len == 0) { - kfree(map->osd_primary_affinity); + kvfree(map->osd_primary_affinity); map->osd_primary_affinity = NULL; return 0; } From 3ee5a7015c8b7cb4de21f7345f8381946f2fce55 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 12 Sep 2019 08:07:56 -0400 Subject: [PATCH 0487/2880] ceph: call ceph_mdsc_destroy from destroy_fs_client They're always called in succession. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 03b63b1cd32c..7173679c8ed7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -731,6 +731,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + ceph_mdsc_destroy(fsc); destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); @@ -1105,7 +1106,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } if (ceph_sb_to_client(sb) != fsc) { - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); fsc = ceph_sb_to_client(sb); dout("get_sb got existing client %p\n", fsc); @@ -1131,7 +1131,6 @@ out_splat: goto out_final; out: - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); out_final: dout("ceph_mount fail %ld\n", PTR_ERR(res)); @@ -1155,8 +1154,6 @@ static void ceph_kill_sb(struct super_block *s) ceph_fscache_unregister_fs(fsc); - ceph_mdsc_destroy(fsc); - destroy_fs_client(fsc); free_anon_bdev(dev); } From 4a36a60c34f42f75e8b4f8cd24fcfade26111334 Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:00:39 +0300 Subject: [PATCH 0488/2880] PCI: Add Amazon's Annapurna Labs vendor ID Add Amazon's Annapurna Labs vendor ID to pci_ids.h. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Acked-by: Bjorn Helgaas --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c842735a4f45..5ce83544f38b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2570,6 +2570,8 @@ #define PCI_VENDOR_ID_ASMEDIA 0x1b21 +#define PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS 0x1c36 + #define PCI_VENDOR_ID_CIRCUITCO 0x1cc8 #define PCI_SUBSYSTEM_ID_CIRCUITCO_MINNOWBOARD 0x0001 From 76e67e9e0f0f2debf9df192502a759bdd0cb4dab Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Thu, 12 Sep 2019 16:00:40 +0300 Subject: [PATCH 0489/2880] PCI: Add ACS quirk for Amazon Annapurna Labs root ports The Amazon's Annapurna Labs root ports don't advertise an ACS capability, but they don't allow peer-to-peer transactions and do validate bus numbers through the SMMU. Additionally, it's not possible for one RP to pass traffic to another RP. Signed-off-by: Ali Saidi Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Gustavo Pimentel Reviewed-by: Andrew Murray Acked-by: Bjorn Helgaas --- drivers/pci/quirks.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 208aacf39329..e96c03b4e494 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4366,6 +4366,24 @@ static int pci_quirk_qcom_rp_acs(struct pci_dev *dev, u16 acs_flags) return ret; } +static int pci_quirk_al_acs(struct pci_dev *dev, u16 acs_flags) +{ + if (pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) + return -ENOTTY; + + /* + * Amazon's Annapurna Labs root ports don't include an ACS capability, + * but do include ACS-like functionality. The hardware doesn't support + * peer-to-peer transactions via the root port and each has a unique + * segment number. + * + * Additionally, the root ports cannot send traffic to each other. + */ + acs_flags &= ~(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); + + return acs_flags ? 0 : 1; +} + /* * Sunrise Point PCH root ports implement ACS, but unfortunately as shown in * the datasheet (Intel 100 Series Chipset Family PCH Datasheet, Vol. 2, @@ -4559,6 +4577,8 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_AMPERE, 0xE00A, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00B, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00C, pci_quirk_xgene_acs }, + /* Amazon Annapurna Labs */ + { PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, pci_quirk_al_acs }, { 0 } }; From a638b5de205af40bdadd867b1cb77320bbb2628e Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:00:41 +0300 Subject: [PATCH 0490/2880] PCI/VPD: Prevent VPD access for Amazon's Annapurna Labs Root Port The Amazon Annapurna Labs PCIe Root Port exposes the VPD capability, but there is no actual support for it. Trying to access the VPD (for example, as part of lspci -vv or when reading the vpd sysfs file), results in the following warning print: pcieport 0001:00:00.0: VPD access failed. This is likely a firmware bug on this device. Contact the card vendor for a firmware update Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Gustavo Pimentel Reviewed-by: Andrew Murray Acked-by: Bjorn Helgaas --- drivers/pci/vpd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 4963c2e2bd4c..7915d10f9aa1 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -571,6 +571,12 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LSI_LOGIC, 0x005f, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, PCI_ANY_ID, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd); +/* + * The Amazon Annapurna Labs 0x0031 device id is reused for other non Root Port + * device types, so the quirk is registered for the PCI_CLASS_BRIDGE_PCI class. + */ +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, + PCI_CLASS_BRIDGE_PCI, 8, quirk_blacklist_vpd); /* * For Broadcom 5706, 5708, 5709 rev. A nics, any read beyond the From 738cb37b013e5ba6abd725a22abbd8baebc95082 Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:00:42 +0300 Subject: [PATCH 0491/2880] PCI: Add quirk to disable MSI-X support for Amazon's Annapurna Labs Root Port The Root Port (identified by [1c36:0031]) doesn't support MSI-X. On some platforms it is configured to not advertise the capability at all, while on others it (mistakenly) does. This causes a panic during initialization by the pcieport driver, since it tries to configure the MSI-X capability. Specifically, when trying to access the MSI-X table a "non-existing addr" exception occurs. Example stacktrace snippet: SError Interrupt on CPU2, code 0xbf000000 -- SError CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.2.0-rc1-Jonny-14847-ge76f1d4a1828-dirty #33 Hardware name: Annapurna Labs Alpine V3 EVP (DT) pstate: 80000005 (Nzcv daif -PAN -UAO) pc : __pci_enable_msix_range+0x4e4/0x608 lr : __pci_enable_msix_range+0x498/0x608 sp : ffffff80117db700 x29: ffffff80117db700 x28: 0000000000000001 x27: 0000000000000001 x26: 0000000000000000 x25: ffffffd3e9d8c0b0 x24: 0000000000000000 x23: 0000000000000000 x22: 0000000000000000 x21: 0000000000000001 x20: 0000000000000000 x19: ffffffd3e9d8c000 x18: ffffffffffffffff x17: 0000000000000000 x16: 0000000000000000 x15: ffffff80116496c8 x14: ffffffd3e9844503 x13: ffffffd3e9844502 x12: 0000000000000038 x11: ffffffffffffff00 x10: 0000000000000040 x9 : ffffff801165e270 x8 : ffffff801165e268 x7 : 0000000000000002 x6 : 00000000000000b2 x5 : ffffffd3e9d8c2c0 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffffd3e9844680 Kernel panic - not syncing: Asynchronous SError Interrupt CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.2.0-rc1-Jonny-14847-ge76f1d4a1828-dirty #33 Hardware name: Annapurna Labs Alpine V3 EVP (DT) Call trace: dump_backtrace+0x0/0x140 show_stack+0x14/0x20 dump_stack+0xa8/0xcc panic+0x140/0x334 nmi_panic+0x6c/0x70 arm64_serror_panic+0x74/0x88 __pte_error+0x0/0x28 el1_error+0x84/0xf8 __pci_enable_msix_range+0x4e4/0x608 pci_alloc_irq_vectors_affinity+0xdc/0x150 pcie_port_device_register+0x2b8/0x4e0 pcie_portdrv_probe+0x34/0xf0 Notice that this quirk also disables MSI (which may work, but hasn't been tested nor has a current use case), since currently there is no standard way to disable only MSI-X. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Gustavo Pimentel Reviewed-by: Andrew Murray --- drivers/pci/quirks.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index e96c03b4e494..44524d351a26 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2925,6 +2925,24 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, 0x10a1, quirk_msi_intx_disable_qca_bug); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, 0xe091, quirk_msi_intx_disable_qca_bug); + +/* + * Amazon's Annapurna Labs 1c36:0031 Root Ports don't support MSI-X, so it + * should be disabled on platforms where the device (mistakenly) advertises it. + * + * Notice that this quirk also disables MSI (which may work, but hasn't been + * tested), since currently there is no standard way to disable only MSI-X. + * + * The 0031 device id is reused for other non Root Port device types, + * therefore the quirk is registered for the PCI_CLASS_BRIDGE_PCI class. + */ +static void quirk_al_msi_disable(struct pci_dev *dev) +{ + dev->no_msi = 1; + pci_warn(dev, "Disabling MSI/MSI-X\n"); +} +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, + PCI_CLASS_BRIDGE_PCI, 8, quirk_al_msi_disable); #endif /* CONFIG_PCI_MSI */ /* From ed4381da34d4765626aefc9b9ef084cd8e6bb749 Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:02:36 +0300 Subject: [PATCH 0492/2880] dt-bindings: PCI: Add Amazon's Annapurna Labs PCIe host bridge binding Document Amazon's Annapurna Labs PCIe host bridge. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/pcie-al.txt | 46 +++++++++++++++++++ MAINTAINERS | 3 +- 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/pci/pcie-al.txt diff --git a/Documentation/devicetree/bindings/pci/pcie-al.txt b/Documentation/devicetree/bindings/pci/pcie-al.txt new file mode 100644 index 000000000000..557a5089229d --- /dev/null +++ b/Documentation/devicetree/bindings/pci/pcie-al.txt @@ -0,0 +1,46 @@ +* Amazon Annapurna Labs PCIe host bridge + +Amazon's Annapurna Labs PCIe Host Controller is based on the Synopsys DesignWare +PCI core. It inherits common properties defined in +Documentation/devicetree/bindings/pci/designware-pcie.txt. + +Properties of the host controller node that differ from it are: + +- compatible: + Usage: required + Value type: + Definition: Value should contain + - "amazon,al-alpine-v2-pcie" for alpine_v2 + - "amazon,al-alpine-v3-pcie" for alpine_v3 + +- reg: + Usage: required + Value type: + Definition: Register ranges as listed in the reg-names property + +- reg-names: + Usage: required + Value type: + Definition: Must include the following entries + - "config" PCIe ECAM space + - "controller" AL proprietary registers + - "dbi" Designware PCIe registers + +Example: + + pcie-external0: pcie@fb600000 { + compatible = "amazon,al-alpine-v3-pcie"; + reg = <0x0 0xfb600000 0x0 0x00100000 + 0x0 0xfd800000 0x0 0x00010000 + 0x0 0xfd810000 0x0 0x00001000>; + reg-names = "config", "controller", "dbi"; + bus-range = <0 255>; + device_type = "pci"; + #address-cells = <3>; + #size-cells = <2>; + #interrupt-cells = <1>; + interrupts = ; + interrupt-map-mask = <0x00 0 0 7>; + interrupt-map = <0x0000 0 0 1 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>; /* INTa */ + ranges = <0x02000000 0x0 0xc0010000 0x0 0xc0010000 0x0 0x07ff0000>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..d200b16fa95c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12448,10 +12448,11 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/ S: Supported F: drivers/pci/controller/ -PCIE DRIVER FOR ANNAPURNA LABS +PCIE DRIVER FOR AMAZON ANNAPURNA LABS M: Jonathan Chocron L: linux-pci@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/pci/pcie-al.txt F: drivers/pci/controller/dwc/pcie-al.c PCIE DRIVER FOR AMLOGIC MESON From a8daea94754989f6c48dafda840482cbc9f882f9 Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:02:37 +0300 Subject: [PATCH 0493/2880] PCI: dwc: al: Add Amazon Annapurna Labs PCIe controller driver This driver is DT based and utilizes the DesignWare APIs. It allows using a smaller ECAM range for a larger bus range - usually an entire bus uses 1MB of address space, but the driver can use it for a larger number of buses. This is achieved by using a HW mechanism which allows changing the BUS part of the "final" outgoing config transaction. There are 2 HW regs, one which is basically a bitmask determining which bits to take from the AXI transaction itself and another which holds the complementary part programmed by the driver. All link initializations are handled by the boot FW. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Gustavo Pimentel Reviewed-by: Andrew Murray --- drivers/pci/controller/dwc/Kconfig | 12 + drivers/pci/controller/dwc/pcie-al.c | 365 +++++++++++++++++++++++++++ 2 files changed, 377 insertions(+) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 6ea778ae4877..3c6094cbcc3b 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -230,4 +230,16 @@ config PCIE_UNIPHIER Say Y here if you want PCIe controller support on UniPhier SoCs. This driver supports LD20 and PXs3 SoCs. +config PCIE_AL + bool "Amazon Annapurna Labs PCIe controller" + depends on OF && (ARM64 || COMPILE_TEST) + depends on PCI_MSI_IRQ_DOMAIN + select PCIE_DW_HOST + help + Say Y here to enable support of the Amazon's Annapurna Labs PCIe + controller IP on Amazon SoCs. The PCIe controller uses the DesignWare + core plus Annapurna Labs proprietary hardware wrappers. This is + required only for DT-based platforms. ACPI platforms with the + Annapurna Labs PCIe controller don't need to enable this. + endmenu diff --git a/drivers/pci/controller/dwc/pcie-al.c b/drivers/pci/controller/dwc/pcie-al.c index 3ab58f0584a8..1eeda2f6371f 100644 --- a/drivers/pci/controller/dwc/pcie-al.c +++ b/drivers/pci/controller/dwc/pcie-al.c @@ -91,3 +91,368 @@ struct pci_ecam_ops al_pcie_ops = { }; #endif /* defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) */ + +#ifdef CONFIG_PCIE_AL + +#include +#include "pcie-designware.h" + +#define AL_PCIE_REV_ID_2 2 +#define AL_PCIE_REV_ID_3 3 +#define AL_PCIE_REV_ID_4 4 + +#define AXI_BASE_OFFSET 0x0 + +#define DEVICE_ID_OFFSET 0x16c + +#define DEVICE_REV_ID 0x0 +#define DEVICE_REV_ID_DEV_ID_MASK GENMASK(31, 16) + +#define DEVICE_REV_ID_DEV_ID_X4 0 +#define DEVICE_REV_ID_DEV_ID_X8 2 +#define DEVICE_REV_ID_DEV_ID_X16 4 + +#define OB_CTRL_REV1_2_OFFSET 0x0040 +#define OB_CTRL_REV3_5_OFFSET 0x0030 + +#define CFG_TARGET_BUS 0x0 +#define CFG_TARGET_BUS_MASK_MASK GENMASK(7, 0) +#define CFG_TARGET_BUS_BUSNUM_MASK GENMASK(15, 8) + +#define CFG_CONTROL 0x4 +#define CFG_CONTROL_SUBBUS_MASK GENMASK(15, 8) +#define CFG_CONTROL_SEC_BUS_MASK GENMASK(23, 16) + +struct al_pcie_reg_offsets { + unsigned int ob_ctrl; +}; + +struct al_pcie_target_bus_cfg { + u8 reg_val; + u8 reg_mask; + u8 ecam_mask; +}; + +struct al_pcie { + struct dw_pcie *pci; + void __iomem *controller_base; /* base of PCIe unit (not DW core) */ + struct device *dev; + resource_size_t ecam_size; + unsigned int controller_rev_id; + struct al_pcie_reg_offsets reg_offsets; + struct al_pcie_target_bus_cfg target_bus_cfg; +}; + +#define PCIE_ECAM_DEVFN(x) (((x) & 0xff) << 12) + +#define to_al_pcie(x) dev_get_drvdata((x)->dev) + +static inline u32 al_pcie_controller_readl(struct al_pcie *pcie, u32 offset) +{ + return readl_relaxed(pcie->controller_base + offset); +} + +static inline void al_pcie_controller_writel(struct al_pcie *pcie, u32 offset, + u32 val) +{ + writel_relaxed(val, pcie->controller_base + offset); +} + +static int al_pcie_rev_id_get(struct al_pcie *pcie, unsigned int *rev_id) +{ + u32 dev_rev_id_val; + u32 dev_id_val; + + dev_rev_id_val = al_pcie_controller_readl(pcie, AXI_BASE_OFFSET + + DEVICE_ID_OFFSET + + DEVICE_REV_ID); + dev_id_val = FIELD_GET(DEVICE_REV_ID_DEV_ID_MASK, dev_rev_id_val); + + switch (dev_id_val) { + case DEVICE_REV_ID_DEV_ID_X4: + *rev_id = AL_PCIE_REV_ID_2; + break; + case DEVICE_REV_ID_DEV_ID_X8: + *rev_id = AL_PCIE_REV_ID_3; + break; + case DEVICE_REV_ID_DEV_ID_X16: + *rev_id = AL_PCIE_REV_ID_4; + break; + default: + dev_err(pcie->dev, "Unsupported dev_id_val (0x%x)\n", + dev_id_val); + return -EINVAL; + } + + dev_dbg(pcie->dev, "dev_id_val: 0x%x\n", dev_id_val); + + return 0; +} + +static int al_pcie_reg_offsets_set(struct al_pcie *pcie) +{ + switch (pcie->controller_rev_id) { + case AL_PCIE_REV_ID_2: + pcie->reg_offsets.ob_ctrl = OB_CTRL_REV1_2_OFFSET; + break; + case AL_PCIE_REV_ID_3: + case AL_PCIE_REV_ID_4: + pcie->reg_offsets.ob_ctrl = OB_CTRL_REV3_5_OFFSET; + break; + default: + dev_err(pcie->dev, "Unsupported controller rev_id: 0x%x\n", + pcie->controller_rev_id); + return -EINVAL; + } + + return 0; +} + +static inline void al_pcie_target_bus_set(struct al_pcie *pcie, + u8 target_bus, + u8 mask_target_bus) +{ + u32 reg; + + reg = FIELD_PREP(CFG_TARGET_BUS_MASK_MASK, mask_target_bus) | + FIELD_PREP(CFG_TARGET_BUS_BUSNUM_MASK, target_bus); + + al_pcie_controller_writel(pcie, AXI_BASE_OFFSET + + pcie->reg_offsets.ob_ctrl + CFG_TARGET_BUS, + reg); +} + +static void __iomem *al_pcie_conf_addr_map(struct al_pcie *pcie, + unsigned int busnr, + unsigned int devfn) +{ + struct al_pcie_target_bus_cfg *target_bus_cfg = &pcie->target_bus_cfg; + unsigned int busnr_ecam = busnr & target_bus_cfg->ecam_mask; + unsigned int busnr_reg = busnr & target_bus_cfg->reg_mask; + struct pcie_port *pp = &pcie->pci->pp; + void __iomem *pci_base_addr; + + pci_base_addr = (void __iomem *)((uintptr_t)pp->va_cfg0_base + + (busnr_ecam << 20) + + PCIE_ECAM_DEVFN(devfn)); + + if (busnr_reg != target_bus_cfg->reg_val) { + dev_dbg(pcie->pci->dev, "Changing target bus busnum val from 0x%x to 0x%x\n", + target_bus_cfg->reg_val, busnr_reg); + target_bus_cfg->reg_val = busnr_reg; + al_pcie_target_bus_set(pcie, + target_bus_cfg->reg_val, + target_bus_cfg->reg_mask); + } + + return pci_base_addr; +} + +static int al_pcie_rd_other_conf(struct pcie_port *pp, struct pci_bus *bus, + unsigned int devfn, int where, int size, + u32 *val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct al_pcie *pcie = to_al_pcie(pci); + unsigned int busnr = bus->number; + void __iomem *pci_addr; + int rc; + + pci_addr = al_pcie_conf_addr_map(pcie, busnr, devfn); + + rc = dw_pcie_read(pci_addr + where, size, val); + + dev_dbg(pci->dev, "%d-byte config read from %04x:%02x:%02x.%d offset 0x%x (pci_addr: 0x%px) - val:0x%x\n", + size, pci_domain_nr(bus), bus->number, + PCI_SLOT(devfn), PCI_FUNC(devfn), where, + (pci_addr + where), *val); + + return rc; +} + +static int al_pcie_wr_other_conf(struct pcie_port *pp, struct pci_bus *bus, + unsigned int devfn, int where, int size, + u32 val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct al_pcie *pcie = to_al_pcie(pci); + unsigned int busnr = bus->number; + void __iomem *pci_addr; + int rc; + + pci_addr = al_pcie_conf_addr_map(pcie, busnr, devfn); + + rc = dw_pcie_write(pci_addr + where, size, val); + + dev_dbg(pci->dev, "%d-byte config write to %04x:%02x:%02x.%d offset 0x%x (pci_addr: 0x%px) - val:0x%x\n", + size, pci_domain_nr(bus), bus->number, + PCI_SLOT(devfn), PCI_FUNC(devfn), where, + (pci_addr + where), val); + + return rc; +} + +static void al_pcie_config_prepare(struct al_pcie *pcie) +{ + struct al_pcie_target_bus_cfg *target_bus_cfg; + struct pcie_port *pp = &pcie->pci->pp; + unsigned int ecam_bus_mask; + u32 cfg_control_offset; + u8 subordinate_bus; + u8 secondary_bus; + u32 cfg_control; + u32 reg; + + target_bus_cfg = &pcie->target_bus_cfg; + + ecam_bus_mask = (pcie->ecam_size >> 20) - 1; + if (ecam_bus_mask > 255) { + dev_warn(pcie->dev, "ECAM window size is larger than 256MB. Cutting off at 256\n"); + ecam_bus_mask = 255; + } + + /* This portion is taken from the transaction address */ + target_bus_cfg->ecam_mask = ecam_bus_mask; + /* This portion is taken from the cfg_target_bus reg */ + target_bus_cfg->reg_mask = ~target_bus_cfg->ecam_mask; + target_bus_cfg->reg_val = pp->busn->start & target_bus_cfg->reg_mask; + + al_pcie_target_bus_set(pcie, target_bus_cfg->reg_val, + target_bus_cfg->reg_mask); + + secondary_bus = pp->busn->start + 1; + subordinate_bus = pp->busn->end; + + /* Set the valid values of secondary and subordinate buses */ + cfg_control_offset = AXI_BASE_OFFSET + pcie->reg_offsets.ob_ctrl + + CFG_CONTROL; + + cfg_control = al_pcie_controller_readl(pcie, cfg_control_offset); + + reg = cfg_control & + ~(CFG_CONTROL_SEC_BUS_MASK | CFG_CONTROL_SUBBUS_MASK); + + reg |= FIELD_PREP(CFG_CONTROL_SUBBUS_MASK, subordinate_bus) | + FIELD_PREP(CFG_CONTROL_SEC_BUS_MASK, secondary_bus); + + al_pcie_controller_writel(pcie, cfg_control_offset, reg); +} + +static int al_pcie_host_init(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct al_pcie *pcie = to_al_pcie(pci); + int rc; + + rc = al_pcie_rev_id_get(pcie, &pcie->controller_rev_id); + if (rc) + return rc; + + rc = al_pcie_reg_offsets_set(pcie); + if (rc) + return rc; + + al_pcie_config_prepare(pcie); + + return 0; +} + +static const struct dw_pcie_host_ops al_pcie_host_ops = { + .rd_other_conf = al_pcie_rd_other_conf, + .wr_other_conf = al_pcie_wr_other_conf, + .host_init = al_pcie_host_init, +}; + +static int al_add_pcie_port(struct pcie_port *pp, + struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + int ret; + + pp->ops = &al_pcie_host_ops; + + ret = dw_pcie_host_init(pp); + if (ret) { + dev_err(dev, "failed to initialize host\n"); + return ret; + } + + return 0; +} + +static const struct dw_pcie_ops dw_pcie_ops = { +}; + +static int al_pcie_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *controller_res; + struct resource *ecam_res; + struct resource *dbi_res; + struct al_pcie *al_pcie; + struct dw_pcie *pci; + + al_pcie = devm_kzalloc(dev, sizeof(*al_pcie), GFP_KERNEL); + if (!al_pcie) + return -ENOMEM; + + pci = devm_kzalloc(dev, sizeof(*pci), GFP_KERNEL); + if (!pci) + return -ENOMEM; + + pci->dev = dev; + pci->ops = &dw_pcie_ops; + + al_pcie->pci = pci; + al_pcie->dev = dev; + + dbi_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + pci->dbi_base = devm_pci_remap_cfg_resource(dev, dbi_res); + if (IS_ERR(pci->dbi_base)) { + dev_err(dev, "couldn't remap dbi base %pR\n", dbi_res); + return PTR_ERR(pci->dbi_base); + } + + ecam_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "config"); + if (!ecam_res) { + dev_err(dev, "couldn't find 'config' reg in DT\n"); + return -ENOENT; + } + al_pcie->ecam_size = resource_size(ecam_res); + + controller_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "controller"); + al_pcie->controller_base = devm_ioremap_resource(dev, controller_res); + if (IS_ERR(al_pcie->controller_base)) { + dev_err(dev, "couldn't remap controller base %pR\n", + controller_res); + return PTR_ERR(al_pcie->controller_base); + } + + dev_dbg(dev, "From DT: dbi_base: %pR, controller_base: %pR\n", + dbi_res, controller_res); + + platform_set_drvdata(pdev, al_pcie); + + return al_add_pcie_port(&pci->pp, pdev); +} + +static const struct of_device_id al_pcie_of_match[] = { + { .compatible = "amazon,al-alpine-v2-pcie", + }, + { .compatible = "amazon,al-alpine-v3-pcie", + }, + {}, +}; + +static struct platform_driver al_pcie_driver = { + .driver = { + .name = "al-pcie", + .of_match_table = al_pcie_of_match, + .suppress_bind_attrs = true, + }, + .probe = al_pcie_probe, +}; +builtin_platform_driver(al_pcie_driver); + +#endif /* CONFIG_PCIE_AL*/ From 0b24134f7888175c9638e6fd1900e23e44fc172f Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:02:38 +0300 Subject: [PATCH 0494/2880] PCI: dwc: Add validation that PCIe core is set to correct mode Some PCIe controllers can be set to either Host or EP according to some early boot FW. To make sure there is no discrepancy (e.g. FW configured the port to EP mode while the DT specifies it as a host bridge or vice versa), a check has been added for each mode. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Acked-by: Gustavo Pimentel --- drivers/pci/controller/dwc/pcie-designware-ep.c | 8 ++++++++ .../pci/controller/dwc/pcie-designware-host.c | 16 ++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 2bf5a35c0570..0b9a9b27175c 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -531,6 +531,7 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) int ret; u32 reg; void *addr; + u8 hdr_type; unsigned int nbars; unsigned int offset; struct pci_epc *epc; @@ -595,6 +596,13 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) if (ep->ops->ep_init) ep->ops->ep_init(ep); + hdr_type = dw_pcie_readb_dbi(pci, PCI_HEADER_TYPE); + if (hdr_type != PCI_HEADER_TYPE_NORMAL) { + dev_err(pci->dev, "PCIe controller is not set to EP mode (hdr_type:0x%x)!\n", + hdr_type); + return -EIO; + } + ret = of_property_read_u8(np, "max-functions", &epc->max_functions); if (ret < 0) epc->max_functions = 1; diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index f93252d0da5b..bb275b5e787a 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -323,6 +323,7 @@ int dw_pcie_host_init(struct pcie_port *pp) struct pci_bus *child; struct pci_host_bridge *bridge; struct resource *cfg_res; + u32 hdr_type; int ret; raw_spin_lock_init(&pci->pp.lock); @@ -464,6 +465,21 @@ int dw_pcie_host_init(struct pcie_port *pp) goto err_free_msi; } + ret = dw_pcie_rd_own_conf(pp, PCI_HEADER_TYPE, 1, &hdr_type); + if (ret != PCIBIOS_SUCCESSFUL) { + dev_err(pci->dev, "Failed reading PCI_HEADER_TYPE cfg space reg (ret: 0x%x)\n", + ret); + ret = pcibios_err_to_errno(ret); + goto err_free_msi; + } + if (hdr_type != PCI_HEADER_TYPE_BRIDGE) { + dev_err(pci->dev, + "PCIe controller is not set to bridge type (hdr_type: 0x%x)!\n", + hdr_type); + ret = -EIO; + goto err_free_msi; + } + pp->root_bus_nr = pp->busn->start; bridge->dev.parent = dev; From c9b8af43a7cddf4f0bd8f36a4242cf70e73097fa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Aug 2019 16:40:29 +0200 Subject: [PATCH 0495/2880] watchdog: pnx4008_wdt: allow compile-testing The only thing that prevents building this driver on other platforms is the mach/hardware.h include, which is not actually used here at all, so remove the line and allow CONFIG_COMPILE_TEST. Acked-by: Sylvain Lemieux Reviewed-by: Guenter Roeck Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20190809144043.476786-4-arnd@arndb.de Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 2 +- drivers/watchdog/pnx4008_wdt.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 8188963a405b..a45f9e3e442b 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -551,7 +551,7 @@ config OMAP_WATCHDOG config PNX4008_WATCHDOG tristate "LPC32XX Watchdog" - depends on ARCH_LPC32XX + depends on ARCH_LPC32XX || COMPILE_TEST select WATCHDOG_CORE help Say Y here if to include support for the watchdog timer diff --git a/drivers/watchdog/pnx4008_wdt.c b/drivers/watchdog/pnx4008_wdt.c index 7b446b696f2b..e0ea133c1690 100644 --- a/drivers/watchdog/pnx4008_wdt.c +++ b/drivers/watchdog/pnx4008_wdt.c @@ -30,7 +30,6 @@ #include #include #include -#include /* WatchDog Timer - Chapter 23 Page 207 */ From a65f506f4a824fc256ff3cae41a14549550f49f2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Aug 2019 22:27:32 +0200 Subject: [PATCH 0496/2880] watchdog: remove ks8695 driver The platform is getting removed, so there are no remaining users of this driver. Signed-off-by: Arnd Bergmann Acked-by: Guenter Roeck Link: https://lore.kernel.org/r/20190809202749.742267-5-arnd@arndb.de Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../watchdog/watchdog-parameters.rst | 9 - drivers/watchdog/Kconfig | 7 - drivers/watchdog/Makefile | 1 - drivers/watchdog/ks8695_wdt.c | 319 ------------------ 4 files changed, 336 deletions(-) delete mode 100644 drivers/watchdog/ks8695_wdt.c diff --git a/Documentation/watchdog/watchdog-parameters.rst b/Documentation/watchdog/watchdog-parameters.rst index a3985cc5aeda..226aba56f704 100644 --- a/Documentation/watchdog/watchdog-parameters.rst +++ b/Documentation/watchdog/watchdog-parameters.rst @@ -301,15 +301,6 @@ ixp4xx_wdt: ------------------------------------------------- -ks8695_wdt: - wdt_time: - Watchdog time in seconds. (default=5) - nowayout: - Watchdog cannot be stopped once started - (default=kernel config parameter) - -------------------------------------------------- - machzwd: nowayout: Watchdog cannot be stopped once started diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index a45f9e3e442b..7e45a7792ed5 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -477,13 +477,6 @@ config IXP4XX_WATCHDOG Say N if you are unsure. -config KS8695_WATCHDOG - tristate "KS8695 watchdog" - depends on ARCH_KS8695 - help - Watchdog timer embedded into KS8695 processor. This will reboot your - system when the timeout is reached. - config HAVE_S3C2410_WATCHDOG bool help diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 7caa920e7e60..85f55ec76f8d 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -49,7 +49,6 @@ obj-$(CONFIG_21285_WATCHDOG) += wdt285.o obj-$(CONFIG_977_WATCHDOG) += wdt977.o obj-$(CONFIG_FTWDT010_WATCHDOG) += ftwdt010_wdt.o obj-$(CONFIG_IXP4XX_WATCHDOG) += ixp4xx_wdt.o -obj-$(CONFIG_KS8695_WATCHDOG) += ks8695_wdt.o obj-$(CONFIG_S3C2410_WATCHDOG) += s3c2410_wdt.o obj-$(CONFIG_SA1100_WATCHDOG) += sa1100_wdt.o obj-$(CONFIG_SAMA5D4_WATCHDOG) += sama5d4_wdt.o diff --git a/drivers/watchdog/ks8695_wdt.c b/drivers/watchdog/ks8695_wdt.c deleted file mode 100644 index 1550ce3c5702..000000000000 --- a/drivers/watchdog/ks8695_wdt.c +++ /dev/null @@ -1,319 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Watchdog driver for Kendin/Micrel KS8695. - * - * (C) 2007 Andrew Victor - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define KS8695_TMR_OFFSET (0xF0000 + 0xE400) -#define KS8695_TMR_VA (KS8695_IO_VA + KS8695_TMR_OFFSET) - -/* - * Timer registers - */ -#define KS8695_TMCON (0x00) /* Timer Control Register */ -#define KS8695_T0TC (0x08) /* Timer 0 Timeout Count Register */ -#define TMCON_T0EN (1 << 0) /* Timer 0 Enable */ - -/* Timer0 Timeout Counter Register */ -#define T0TC_WATCHDOG (0xff) /* Enable watchdog mode */ - -#define WDT_DEFAULT_TIME 5 /* seconds */ -#define WDT_MAX_TIME 171 /* seconds */ - -static int wdt_time = WDT_DEFAULT_TIME; -static bool nowayout = WATCHDOG_NOWAYOUT; - -module_param(wdt_time, int, 0); -MODULE_PARM_DESC(wdt_time, "Watchdog time in seconds. (default=" - __MODULE_STRING(WDT_DEFAULT_TIME) ")"); - -#ifdef CONFIG_WATCHDOG_NOWAYOUT -module_param(nowayout, bool, 0); -MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" - __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); -#endif - - -static unsigned long ks8695wdt_busy; -static DEFINE_SPINLOCK(ks8695_lock); - -/* ......................................................................... */ - -/* - * Disable the watchdog. - */ -static inline void ks8695_wdt_stop(void) -{ - unsigned long tmcon; - - spin_lock(&ks8695_lock); - /* disable timer0 */ - tmcon = __raw_readl(KS8695_TMR_VA + KS8695_TMCON); - __raw_writel(tmcon & ~TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON); - spin_unlock(&ks8695_lock); -} - -/* - * Enable and reset the watchdog. - */ -static inline void ks8695_wdt_start(void) -{ - unsigned long tmcon; - unsigned long tval = wdt_time * KS8695_CLOCK_RATE; - - spin_lock(&ks8695_lock); - /* disable timer0 */ - tmcon = __raw_readl(KS8695_TMR_VA + KS8695_TMCON); - __raw_writel(tmcon & ~TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON); - - /* program timer0 */ - __raw_writel(tval | T0TC_WATCHDOG, KS8695_TMR_VA + KS8695_T0TC); - - /* re-enable timer0 */ - tmcon = __raw_readl(KS8695_TMR_VA + KS8695_TMCON); - __raw_writel(tmcon | TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON); - spin_unlock(&ks8695_lock); -} - -/* - * Reload the watchdog timer. (ie, pat the watchdog) - */ -static inline void ks8695_wdt_reload(void) -{ - unsigned long tmcon; - - spin_lock(&ks8695_lock); - /* disable, then re-enable timer0 */ - tmcon = __raw_readl(KS8695_TMR_VA + KS8695_TMCON); - __raw_writel(tmcon & ~TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON); - __raw_writel(tmcon | TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON); - spin_unlock(&ks8695_lock); -} - -/* - * Change the watchdog time interval. - */ -static int ks8695_wdt_settimeout(int new_time) -{ - /* - * All counting occurs at KS8695_CLOCK_RATE / 128 = 0.256 Hz - * - * Since WDV is a 16-bit counter, the maximum period is - * 65536 / 0.256 = 256 seconds. - */ - if ((new_time <= 0) || (new_time > WDT_MAX_TIME)) - return -EINVAL; - - /* Set new watchdog time. It will be used when - ks8695_wdt_start() is called. */ - wdt_time = new_time; - return 0; -} - -/* ......................................................................... */ - -/* - * Watchdog device is opened, and watchdog starts running. - */ -static int ks8695_wdt_open(struct inode *inode, struct file *file) -{ - if (test_and_set_bit(0, &ks8695wdt_busy)) - return -EBUSY; - - ks8695_wdt_start(); - return stream_open(inode, file); -} - -/* - * Close the watchdog device. - * If CONFIG_WATCHDOG_NOWAYOUT is NOT defined then the watchdog is also - * disabled. - */ -static int ks8695_wdt_close(struct inode *inode, struct file *file) -{ - /* Disable the watchdog when file is closed */ - if (!nowayout) - ks8695_wdt_stop(); - clear_bit(0, &ks8695wdt_busy); - return 0; -} - -static const struct watchdog_info ks8695_wdt_info = { - .identity = "ks8695 watchdog", - .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING, -}; - -/* - * Handle commands from user-space. - */ -static long ks8695_wdt_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int __user *p = argp; - int new_value; - - switch (cmd) { - case WDIOC_GETSUPPORT: - return copy_to_user(argp, &ks8695_wdt_info, - sizeof(ks8695_wdt_info)) ? -EFAULT : 0; - case WDIOC_GETSTATUS: - case WDIOC_GETBOOTSTATUS: - return put_user(0, p); - case WDIOC_SETOPTIONS: - if (get_user(new_value, p)) - return -EFAULT; - if (new_value & WDIOS_DISABLECARD) - ks8695_wdt_stop(); - if (new_value & WDIOS_ENABLECARD) - ks8695_wdt_start(); - return 0; - case WDIOC_KEEPALIVE: - ks8695_wdt_reload(); /* pat the watchdog */ - return 0; - case WDIOC_SETTIMEOUT: - if (get_user(new_value, p)) - return -EFAULT; - if (ks8695_wdt_settimeout(new_value)) - return -EINVAL; - /* Enable new time value */ - ks8695_wdt_start(); - /* Return current value */ - return put_user(wdt_time, p); - case WDIOC_GETTIMEOUT: - return put_user(wdt_time, p); - default: - return -ENOTTY; - } -} - -/* - * Pat the watchdog whenever device is written to. - */ -static ssize_t ks8695_wdt_write(struct file *file, const char *data, - size_t len, loff_t *ppos) -{ - ks8695_wdt_reload(); /* pat the watchdog */ - return len; -} - -/* ......................................................................... */ - -static const struct file_operations ks8695wdt_fops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .unlocked_ioctl = ks8695_wdt_ioctl, - .open = ks8695_wdt_open, - .release = ks8695_wdt_close, - .write = ks8695_wdt_write, -}; - -static struct miscdevice ks8695wdt_miscdev = { - .minor = WATCHDOG_MINOR, - .name = "watchdog", - .fops = &ks8695wdt_fops, -}; - -static int ks8695wdt_probe(struct platform_device *pdev) -{ - int res; - - if (ks8695wdt_miscdev.parent) - return -EBUSY; - ks8695wdt_miscdev.parent = &pdev->dev; - - res = misc_register(&ks8695wdt_miscdev); - if (res) - return res; - - pr_info("KS8695 Watchdog Timer enabled (%d seconds%s)\n", - wdt_time, nowayout ? ", nowayout" : ""); - return 0; -} - -static int ks8695wdt_remove(struct platform_device *pdev) -{ - misc_deregister(&ks8695wdt_miscdev); - ks8695wdt_miscdev.parent = NULL; - - return 0; -} - -static void ks8695wdt_shutdown(struct platform_device *pdev) -{ - ks8695_wdt_stop(); -} - -#ifdef CONFIG_PM - -static int ks8695wdt_suspend(struct platform_device *pdev, pm_message_t message) -{ - ks8695_wdt_stop(); - return 0; -} - -static int ks8695wdt_resume(struct platform_device *pdev) -{ - if (ks8695wdt_busy) - ks8695_wdt_start(); - return 0; -} - -#else -#define ks8695wdt_suspend NULL -#define ks8695wdt_resume NULL -#endif - -static struct platform_driver ks8695wdt_driver = { - .probe = ks8695wdt_probe, - .remove = ks8695wdt_remove, - .shutdown = ks8695wdt_shutdown, - .suspend = ks8695wdt_suspend, - .resume = ks8695wdt_resume, - .driver = { - .name = "ks8695_wdt", - }, -}; - -static int __init ks8695_wdt_init(void) -{ - /* Check that the heartbeat value is within range; - if not reset to the default */ - if (ks8695_wdt_settimeout(wdt_time)) { - ks8695_wdt_settimeout(WDT_DEFAULT_TIME); - pr_info("ks8695_wdt: wdt_time value must be 1 <= wdt_time <= %i" - ", using %d\n", wdt_time, WDT_MAX_TIME); - } - return platform_driver_register(&ks8695wdt_driver); -} - -static void __exit ks8695_wdt_exit(void) -{ - platform_driver_unregister(&ks8695wdt_driver); -} - -module_init(ks8695_wdt_init); -module_exit(ks8695_wdt_exit); - -MODULE_AUTHOR("Andrew Victor"); -MODULE_DESCRIPTION("Watchdog driver for KS8695"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:ks8695_wdt"); From 58e4db99123343be174afbdd20751a18bfffc74b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Aug 2019 22:27:34 +0200 Subject: [PATCH 0497/2880] watchdog: remove w90x900 driver The ARM w90x900 platform is getting removed, so this driver is obsolete Signed-off-by: Arnd Bergmann Acked-by: Guenter Roeck Link: https://lore.kernel.org/r/20190809202749.742267-7-arnd@arndb.de Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../watchdog/watchdog-parameters.rst | 10 - drivers/watchdog/Kconfig | 9 - drivers/watchdog/Makefile | 1 - drivers/watchdog/nuc900_wdt.c | 302 ------------------ 4 files changed, 322 deletions(-) delete mode 100644 drivers/watchdog/nuc900_wdt.c diff --git a/Documentation/watchdog/watchdog-parameters.rst b/Documentation/watchdog/watchdog-parameters.rst index 226aba56f704..223c99361a30 100644 --- a/Documentation/watchdog/watchdog-parameters.rst +++ b/Documentation/watchdog/watchdog-parameters.rst @@ -366,16 +366,6 @@ nic7018_wdt: ------------------------------------------------- -nuc900_wdt: - heartbeat: - Watchdog heartbeats in seconds. - (default = 15) - nowayout: - Watchdog cannot be stopped once started - (default=kernel config parameter) - -------------------------------------------------- - omap_wdt: timer_margin: initial watchdog timeout (in seconds) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 7e45a7792ed5..a8f5c8147d4b 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -655,15 +655,6 @@ config STMP3XXX_RTC_WATCHDOG To compile this driver as a module, choose M here: the module will be called stmp3xxx_rtc_wdt. -config NUC900_WATCHDOG - tristate "Nuvoton NUC900 watchdog" - depends on ARCH_W90X900 || COMPILE_TEST - help - Say Y here if to include support for the watchdog timer - for the Nuvoton NUC900 series SoCs. - To compile this driver as a module, choose M here: the - module will be called nuc900_wdt. - config TS4800_WATCHDOG tristate "TS-4800 Watchdog" depends on HAS_IOMEM && OF diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 85f55ec76f8d..b5a0aed537af 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -63,7 +63,6 @@ obj-$(CONFIG_RN5T618_WATCHDOG) += rn5t618_wdt.o obj-$(CONFIG_COH901327_WATCHDOG) += coh901327_wdt.o obj-$(CONFIG_NPCM7XX_WATCHDOG) += npcm_wdt.o obj-$(CONFIG_STMP3XXX_RTC_WATCHDOG) += stmp3xxx_rtc_wdt.o -obj-$(CONFIG_NUC900_WATCHDOG) += nuc900_wdt.o obj-$(CONFIG_TS4800_WATCHDOG) += ts4800_wdt.o obj-$(CONFIG_TS72XX_WATCHDOG) += ts72xx_wdt.o obj-$(CONFIG_IMX2_WDT) += imx2_wdt.o diff --git a/drivers/watchdog/nuc900_wdt.c b/drivers/watchdog/nuc900_wdt.c deleted file mode 100644 index db124cebe838..000000000000 --- a/drivers/watchdog/nuc900_wdt.c +++ /dev/null @@ -1,302 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (c) 2009 Nuvoton technology corporation. - * - * Wan ZongShun - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define REG_WTCR 0x1c -#define WTCLK (0x01 << 10) -#define WTE (0x01 << 7) /*wdt enable*/ -#define WTIS (0x03 << 4) -#define WTIF (0x01 << 3) -#define WTRF (0x01 << 2) -#define WTRE (0x01 << 1) -#define WTR (0x01 << 0) -/* - * The watchdog time interval can be calculated via following formula: - * WTIS real time interval (formula) - * 0x00 ((2^ 14 ) * ((external crystal freq) / 256))seconds - * 0x01 ((2^ 16 ) * ((external crystal freq) / 256))seconds - * 0x02 ((2^ 18 ) * ((external crystal freq) / 256))seconds - * 0x03 ((2^ 20 ) * ((external crystal freq) / 256))seconds - * - * The external crystal freq is 15Mhz in the nuc900 evaluation board. - * So 0x00 = +-0.28 seconds, 0x01 = +-1.12 seconds, 0x02 = +-4.48 seconds, - * 0x03 = +- 16.92 seconds.. - */ -#define WDT_HW_TIMEOUT 0x02 -#define WDT_TIMEOUT (HZ/2) -#define WDT_HEARTBEAT 15 - -static int heartbeat = WDT_HEARTBEAT; -module_param(heartbeat, int, 0); -MODULE_PARM_DESC(heartbeat, "Watchdog heartbeats in seconds. " - "(default = " __MODULE_STRING(WDT_HEARTBEAT) ")"); - -static bool nowayout = WATCHDOG_NOWAYOUT; -module_param(nowayout, bool, 0); -MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started " - "(default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); - -struct nuc900_wdt { - struct clk *wdt_clock; - struct platform_device *pdev; - void __iomem *wdt_base; - char expect_close; - struct timer_list timer; - spinlock_t wdt_lock; - unsigned long next_heartbeat; -}; - -static unsigned long nuc900wdt_busy; -static struct nuc900_wdt *nuc900_wdt; - -static inline void nuc900_wdt_keepalive(void) -{ - unsigned int val; - - spin_lock(&nuc900_wdt->wdt_lock); - - val = __raw_readl(nuc900_wdt->wdt_base + REG_WTCR); - val |= (WTR | WTIF); - __raw_writel(val, nuc900_wdt->wdt_base + REG_WTCR); - - spin_unlock(&nuc900_wdt->wdt_lock); -} - -static inline void nuc900_wdt_start(void) -{ - unsigned int val; - - spin_lock(&nuc900_wdt->wdt_lock); - - val = __raw_readl(nuc900_wdt->wdt_base + REG_WTCR); - val |= (WTRE | WTE | WTR | WTCLK | WTIF); - val &= ~WTIS; - val |= (WDT_HW_TIMEOUT << 0x04); - __raw_writel(val, nuc900_wdt->wdt_base + REG_WTCR); - - spin_unlock(&nuc900_wdt->wdt_lock); - - nuc900_wdt->next_heartbeat = jiffies + heartbeat * HZ; - mod_timer(&nuc900_wdt->timer, jiffies + WDT_TIMEOUT); -} - -static inline void nuc900_wdt_stop(void) -{ - unsigned int val; - - del_timer(&nuc900_wdt->timer); - - spin_lock(&nuc900_wdt->wdt_lock); - - val = __raw_readl(nuc900_wdt->wdt_base + REG_WTCR); - val &= ~WTE; - __raw_writel(val, nuc900_wdt->wdt_base + REG_WTCR); - - spin_unlock(&nuc900_wdt->wdt_lock); -} - -static inline void nuc900_wdt_ping(void) -{ - nuc900_wdt->next_heartbeat = jiffies + heartbeat * HZ; -} - -static int nuc900_wdt_open(struct inode *inode, struct file *file) -{ - - if (test_and_set_bit(0, &nuc900wdt_busy)) - return -EBUSY; - - nuc900_wdt_start(); - - return stream_open(inode, file); -} - -static int nuc900_wdt_close(struct inode *inode, struct file *file) -{ - if (nuc900_wdt->expect_close == 42) - nuc900_wdt_stop(); - else { - dev_crit(&nuc900_wdt->pdev->dev, - "Unexpected close, not stopping watchdog!\n"); - nuc900_wdt_ping(); - } - - nuc900_wdt->expect_close = 0; - clear_bit(0, &nuc900wdt_busy); - return 0; -} - -static const struct watchdog_info nuc900_wdt_info = { - .identity = "nuc900 watchdog", - .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | - WDIOF_MAGICCLOSE, -}; - -static long nuc900_wdt_ioctl(struct file *file, - unsigned int cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int __user *p = argp; - int new_value; - - switch (cmd) { - case WDIOC_GETSUPPORT: - return copy_to_user(argp, &nuc900_wdt_info, - sizeof(nuc900_wdt_info)) ? -EFAULT : 0; - case WDIOC_GETSTATUS: - case WDIOC_GETBOOTSTATUS: - return put_user(0, p); - - case WDIOC_KEEPALIVE: - nuc900_wdt_ping(); - return 0; - - case WDIOC_SETTIMEOUT: - if (get_user(new_value, p)) - return -EFAULT; - - heartbeat = new_value; - nuc900_wdt_ping(); - - return put_user(new_value, p); - case WDIOC_GETTIMEOUT: - return put_user(heartbeat, p); - default: - return -ENOTTY; - } -} - -static ssize_t nuc900_wdt_write(struct file *file, const char __user *data, - size_t len, loff_t *ppos) -{ - if (!len) - return 0; - - /* Scan for magic character */ - if (!nowayout) { - size_t i; - - nuc900_wdt->expect_close = 0; - - for (i = 0; i < len; i++) { - char c; - if (get_user(c, data + i)) - return -EFAULT; - if (c == 'V') { - nuc900_wdt->expect_close = 42; - break; - } - } - } - - nuc900_wdt_ping(); - return len; -} - -static void nuc900_wdt_timer_ping(struct timer_list *unused) -{ - if (time_before(jiffies, nuc900_wdt->next_heartbeat)) { - nuc900_wdt_keepalive(); - mod_timer(&nuc900_wdt->timer, jiffies + WDT_TIMEOUT); - } else - dev_warn(&nuc900_wdt->pdev->dev, "Will reset the machine !\n"); -} - -static const struct file_operations nuc900wdt_fops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .unlocked_ioctl = nuc900_wdt_ioctl, - .open = nuc900_wdt_open, - .release = nuc900_wdt_close, - .write = nuc900_wdt_write, -}; - -static struct miscdevice nuc900wdt_miscdev = { - .minor = WATCHDOG_MINOR, - .name = "watchdog", - .fops = &nuc900wdt_fops, -}; - -static int nuc900wdt_probe(struct platform_device *pdev) -{ - int ret = 0; - - nuc900_wdt = devm_kzalloc(&pdev->dev, sizeof(*nuc900_wdt), - GFP_KERNEL); - if (!nuc900_wdt) - return -ENOMEM; - - nuc900_wdt->pdev = pdev; - - spin_lock_init(&nuc900_wdt->wdt_lock); - - nuc900_wdt->wdt_base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(nuc900_wdt->wdt_base)) - return PTR_ERR(nuc900_wdt->wdt_base); - - nuc900_wdt->wdt_clock = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(nuc900_wdt->wdt_clock)) { - dev_err(&pdev->dev, "failed to find watchdog clock source\n"); - return PTR_ERR(nuc900_wdt->wdt_clock); - } - - clk_enable(nuc900_wdt->wdt_clock); - - timer_setup(&nuc900_wdt->timer, nuc900_wdt_timer_ping, 0); - - ret = misc_register(&nuc900wdt_miscdev); - if (ret) { - dev_err(&pdev->dev, "err register miscdev on minor=%d (%d)\n", - WATCHDOG_MINOR, ret); - goto err_clk; - } - - return 0; - -err_clk: - clk_disable(nuc900_wdt->wdt_clock); - return ret; -} - -static int nuc900wdt_remove(struct platform_device *pdev) -{ - misc_deregister(&nuc900wdt_miscdev); - - clk_disable(nuc900_wdt->wdt_clock); - - return 0; -} - -static struct platform_driver nuc900wdt_driver = { - .probe = nuc900wdt_probe, - .remove = nuc900wdt_remove, - .driver = { - .name = "nuc900-wdt", - }, -}; - -module_platform_driver(nuc900wdt_driver); - -MODULE_AUTHOR("Wan ZongShun "); -MODULE_DESCRIPTION("Watchdog driver for NUC900"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:nuc900-wdt"); From 3a9236e97207f2469254b4098995159b80174d95 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 16 Sep 2019 19:18:51 +0900 Subject: [PATCH 0498/2880] ALSA: dice: fix wrong packet parameter for Alesis iO26 At higher sampling rate (e.g. 192.0 kHz), Alesis iO26 transfers 4 data channels per data block in CIP. Both iO14 and iO26 have the same contents in their configuration ROM. For this reason, ALSA Dice driver attempts to distinguish them according to the value of TX0_AUDIO register at probe callback. Although the way is valid at lower and middle sampling rate, it's lastly invalid at higher sampling rate because because the two models returns the same value for read transaction to the register. In the most cases, users just plug-in the device and ALSA dice driver detects it. In the case, the device runs at lower sampling rate and the driver detects expectedly. For this reason, this commit leaves the way to detect as is. Fixes: 28b208f600a3 ("ALSA: dice: add parameters of stream formats for models produced by Alesis") Cc: # v4.18+ Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20190916101851.30409-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- sound/firewire/dice/dice-alesis.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/firewire/dice/dice-alesis.c b/sound/firewire/dice/dice-alesis.c index 218292bdace6..f5b325263b67 100644 --- a/sound/firewire/dice/dice-alesis.c +++ b/sound/firewire/dice/dice-alesis.c @@ -15,7 +15,7 @@ alesis_io14_tx_pcm_chs[MAX_STREAMS][SND_DICE_RATE_MODE_COUNT] = { static const unsigned int alesis_io26_tx_pcm_chs[MAX_STREAMS][SND_DICE_RATE_MODE_COUNT] = { - {10, 10, 8}, /* Tx0 = Analog + S/PDIF. */ + {10, 10, 4}, /* Tx0 = Analog + S/PDIF. */ {16, 8, 0}, /* Tx1 = ADAT1 + ADAT2. */ }; From 543242211879ebe021b00f7e33eb4e733bcd6c47 Mon Sep 17 00:00:00 2001 From: James McDonnell Date: Mon, 16 Sep 2019 14:53:38 +0000 Subject: [PATCH 0499/2880] ALSA: hda/realtek - Fix alienware headset mic Headset microphone quirk for alienware 15r3. Without this using a headset with mic attached will not work. Signed-off-by: James McDonnell Link: https://lore.kernel.org/r/QB1PR01MB2337D0367C2E3ADB0010134F808C0@QB1PR01MB2337.CANPRD01.PROD.OUTLOOK.COM Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index da1695418731..e27ef434b60d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5817,6 +5817,7 @@ enum { ALC292_FIXUP_DELL_E7X, ALC292_FIXUP_DISABLE_AAMIX, ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK, + ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE, ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, ALC298_FIXUP_DELL_AIO_MIC_NO_PRESENCE, ALC275_FIXUP_DELL_XPS, @@ -6506,6 +6507,15 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC292_FIXUP_DISABLE_AAMIX }, + [ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x18, 0x01a1913c }, /* headset mic w/o jack detect */ + { } + }, + .chained_before = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE, + }, [ALC298_FIXUP_DELL1_MIC_NO_PRESENCE] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -7770,6 +7780,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x17, 0x90170110}, {0x1a, 0x03011020}, {0x21, 0x03211030}), + SND_HDA_PIN_QUIRK(0x10ec0298, 0x1028, "Dell", ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE, + {0x12, 0xb7a60140}, + {0x17, 0x90170110}, + {0x1a, 0x03a11030}, + {0x21, 0x03211020}), SND_HDA_PIN_QUIRK(0x10ec0299, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, ALC225_STANDARD_PINS, {0x12, 0xb7a60130}, From 31bfa64e9428c2ab6608377fb3d4c40e29c7f4ce Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 11 Aug 2019 14:23:46 -0700 Subject: [PATCH 0500/2880] watchdog: diag288_wdt: Remove leftover includes from conversion to watchdog API Commit f7a94db4e959 ("s390/watchdog: use watchdog API") converted the driver to use the watchdog API, but some includes as well as MODULE_ALIAS_MISCDEV() were missed. Cc: Philipp Hachtmann Cc: Martin Schwidefsky Signed-off-by: Guenter Roeck Reviewed-by: Wim Van Sebroeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/diag288_wdt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/watchdog/diag288_wdt.c b/drivers/watchdog/diag288_wdt.c index 181440b7b4d0..aafc8d98bf9f 100644 --- a/drivers/watchdog/diag288_wdt.c +++ b/drivers/watchdog/diag288_wdt.c @@ -26,13 +26,11 @@ #include #include #include -#include #include #include #include #include #include -#include #define MAX_CMDLEN 240 #define DEFAULT_CMD "SYSTEM RESTART" @@ -70,7 +68,6 @@ MODULE_PARM_DESC(conceal, "Enable the CONCEAL CP option while the watchdog is ac module_param_named(nowayout, nowayout_info, bool, 0444); MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default = CONFIG_WATCHDOG_NOWAYOUT)"); -MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR); MODULE_ALIAS("vmwatchdog"); static int __diag288(unsigned int func, unsigned int timeout, From 68f28b01fb9e5fc3ec273104714bd71bac783845 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 14 Aug 2019 22:42:32 +0200 Subject: [PATCH 0501/2880] watchdog: cpwd: use generic compat_ptr_ioctl The cpwd_compat_ioctl() contains a bogus mutex that dates back to a leftover BKL instance. Simplify the implementation by using the new compat_ptr_ioctl() helper function that will do the right thing for all calls here. Note that WIOCSTART/WIOCSTOP don't take any arguments, so the compat_ptr() conversion is not needed here, but it also doesn't hurt. Signed-off-by: Arnd Bergmann Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190814204259.120942-6-arnd@arndb.de Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/cpwd.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c index b973b31179df..9393be584e72 100644 --- a/drivers/watchdog/cpwd.c +++ b/drivers/watchdog/cpwd.c @@ -473,29 +473,6 @@ static long cpwd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } -static long cpwd_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int rval = -ENOIOCTLCMD; - - switch (cmd) { - /* solaris ioctls are specific to this driver */ - case WIOCSTART: - case WIOCSTOP: - case WIOCGSTAT: - mutex_lock(&cpwd_mutex); - rval = cpwd_ioctl(file, cmd, arg); - mutex_unlock(&cpwd_mutex); - break; - - /* everything else is handled by the generic compat layer */ - default: - break; - } - - return rval; -} - static ssize_t cpwd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -520,7 +497,7 @@ static ssize_t cpwd_read(struct file *file, char __user *buffer, static const struct file_operations cpwd_fops = { .owner = THIS_MODULE, .unlocked_ioctl = cpwd_ioctl, - .compat_ioctl = cpwd_compat_ioctl, + .compat_ioctl = compat_ptr_ioctl, .open = cpwd_open, .write = cpwd_write, .read = cpwd_read, From 144783a80cd2cbc45c6ce17db649140b65f203dd Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 12 Aug 2019 15:13:56 +0200 Subject: [PATCH 0502/2880] watchdog: imx2_wdt: fix min() calculation in imx2_wdt_set_timeout Converting from ms to s requires dividing by 1000, not multiplying. So this is currently taking the smaller of new_timeout and 1.28e8, i.e. effectively new_timeout. The driver knows what it set max_hw_heartbeat_ms to, so use that value instead of doing a division at run-time. FWIW, this can easily be tested by booting into a busybox shell and doing "watchdog -t 5 -T 130 /dev/watchdog" - without this patch, the watchdog fires after 130&127 == 2 seconds. Fixes: b07e228eee69 "watchdog: imx2_wdt: Fix set_timeout for big timeout values" Cc: stable@vger.kernel.org # 5.2 plus anything the above got backported to Signed-off-by: Rasmus Villemoes Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812131356.23039-1-linux@rasmusvillemoes.dk Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx2_wdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c index 32af3974e6bb..8d019a961ccc 100644 --- a/drivers/watchdog/imx2_wdt.c +++ b/drivers/watchdog/imx2_wdt.c @@ -55,7 +55,7 @@ #define IMX2_WDT_WMCR 0x08 /* Misc Register */ -#define IMX2_WDT_MAX_TIME 128 +#define IMX2_WDT_MAX_TIME 128U #define IMX2_WDT_DEFAULT_TIME 60 /* in seconds */ #define WDOG_SEC_TO_COUNT(s) ((s * 2 - 1) << 8) @@ -180,7 +180,7 @@ static int imx2_wdt_set_timeout(struct watchdog_device *wdog, { unsigned int actual; - actual = min(new_timeout, wdog->max_hw_heartbeat_ms * 1000); + actual = min(new_timeout, IMX2_WDT_MAX_TIME); __imx2_wdt_set_timeout(wdog, actual); wdog->timeout = new_timeout; return 0; From 30520ee8e3bac25dbb1bb43da0e49177be3e19c0 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Mon, 12 Aug 2019 16:44:34 +0800 Subject: [PATCH 0503/2880] watchdog: imx_sc: Remove unnecessary error log An error message is already displayed by watchdog_register_device() when failed, so no need to have error log again for failure of calling devm_watchdog_register_device(). Signed-off-by: Anson Huang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812084434.13316-1-Anson.Huang@nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx_sc_wdt.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/watchdog/imx_sc_wdt.c b/drivers/watchdog/imx_sc_wdt.c index 78eaaf75a263..9260475439eb 100644 --- a/drivers/watchdog/imx_sc_wdt.c +++ b/drivers/watchdog/imx_sc_wdt.c @@ -175,11 +175,8 @@ static int imx_sc_wdt_probe(struct platform_device *pdev) watchdog_stop_on_unregister(wdog); ret = devm_watchdog_register_device(dev, wdog); - - if (ret) { - dev_err(dev, "Failed to register watchdog device\n"); + if (ret) return ret; - } ret = imx_scu_irq_group_enable(SC_IRQ_GROUP_WDOG, SC_IRQ_WDOG, From 670e51b0301e844d53da7e0a37c5063cdab81876 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:45 -0700 Subject: [PATCH 0504/2880] watchdog: ziirave_wdt: Add missing newline Add missing newline. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-2-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index dec660c509b3..6ec028fb2635 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -671,7 +671,7 @@ static int ziirave_wdt_probe(struct i2c_client *client, if (ret) return ret; - dev_info(&client->dev, "Timeout set to %ds.", + dev_info(&client->dev, "Timeout set to %ds\n", w_priv->wdd.timeout); } From 4a9600c7e735c343cf723bf4a97bfb0435748e20 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:46 -0700 Subject: [PATCH 0505/2880] watchdog: ziirave_wdt: Be verbose about errors in probe() The driver is quite silent in case of probe failure, which makes it more difficult to diagnose problem from the kernel log. Add logging to all of the silent error paths ziirave_wdt_probe() to improve that. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-3-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 6ec028fb2635..8c71341a9c1c 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -658,8 +658,10 @@ static int ziirave_wdt_probe(struct i2c_client *client, */ if (w_priv->wdd.timeout == 0) { val = i2c_smbus_read_byte_data(client, ZIIRAVE_WDT_TIMEOUT); - if (val < 0) + if (val < 0) { + dev_err(&client->dev, "Failed to read timeout\n"); return val; + } if (val < ZIIRAVE_TIMEOUT_MIN) return -ENODEV; @@ -668,8 +670,10 @@ static int ziirave_wdt_probe(struct i2c_client *client, } else { ret = ziirave_wdt_set_timeout(&w_priv->wdd, w_priv->wdd.timeout); - if (ret) + if (ret) { + dev_err(&client->dev, "Failed to set timeout\n"); return ret; + } dev_info(&client->dev, "Timeout set to %ds\n", w_priv->wdd.timeout); @@ -681,34 +685,46 @@ static int ziirave_wdt_probe(struct i2c_client *client, /* If in unconfigured state, set to stopped */ val = i2c_smbus_read_byte_data(client, ZIIRAVE_WDT_STATE); - if (val < 0) + if (val < 0) { + dev_err(&client->dev, "Failed to read state\n"); return val; + } if (val == ZIIRAVE_STATE_INITIAL) ziirave_wdt_stop(&w_priv->wdd); ret = ziirave_wdt_init_duration(client); - if (ret) + if (ret) { + dev_err(&client->dev, "Failed to init duration\n"); return ret; + } ret = ziirave_wdt_revision(client, &w_priv->firmware_rev, ZIIRAVE_WDT_FIRM_VER_MAJOR); - if (ret) + if (ret) { + dev_err(&client->dev, "Failed to read firmware version\n"); return ret; + } ret = ziirave_wdt_revision(client, &w_priv->bootloader_rev, ZIIRAVE_WDT_BOOT_VER_MAJOR); - if (ret) + if (ret) { + dev_err(&client->dev, "Failed to read bootloader version\n"); return ret; + } w_priv->reset_reason = i2c_smbus_read_byte_data(client, ZIIRAVE_WDT_RESET_REASON); - if (w_priv->reset_reason < 0) + if (w_priv->reset_reason < 0) { + dev_err(&client->dev, "Failed to read reset reason\n"); return w_priv->reset_reason; + } if (w_priv->reset_reason >= ARRAY_SIZE(ziirave_reasons) || - !ziirave_reasons[w_priv->reset_reason]) + !ziirave_reasons[w_priv->reset_reason]) { + dev_err(&client->dev, "Invalid reset reason\n"); return -ENODEV; + } ret = watchdog_register_device(&w_priv->wdd); From b774fcef7dde27da8e8ed21571a254b1a78d200f Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:47 -0700 Subject: [PATCH 0506/2880] watchdog: ziirave_wdt: Be more verbose during firmware update Add more error logging to ziirave_firm_upload() for diagnostics. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-4-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 8c71341a9c1c..b3e255b40209 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -335,14 +335,18 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_JUMP_TO_BOOTLOADER, 1, false); - if (ret) + if (ret) { + dev_err(&client->dev, "Failed to jump to bootloader\n"); return ret; + } msleep(500); ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_START, 1, true); - if (ret) + if (ret) { + dev_err(&client->dev, "Failed to start download\n"); return ret; + } msleep(500); @@ -438,14 +442,20 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, /* End download operation */ ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_END, 1, false); - if (ret) + if (ret) { + dev_err(&client->dev, + "Failed to end firmware download: %d\n", ret); return ret; + } /* Reset the processor */ ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_RESET_PROCESSOR, 1, false); - if (ret) + if (ret) { + dev_err(&client->dev, + "Failed to reset the watchdog: %d\n", ret); return ret; + } msleep(500); From 39d0387d5e5e9bbd1fd9cfaab19581939b05f1c8 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:48 -0700 Subject: [PATCH 0507/2880] watchdog: ziirave_wdt: Don't bail out on unexpected timeout value Reprogramming bootloader on watchdog MCU will result in reported default timeout value of "0". That in turn will be unnecessarily rejected by the driver as invalid device (-ENODEV). Simplify probe to read stored timeout value, set it to a sane default if it is bogus, and then program that value unconditionally. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-5-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index b3e255b40209..a11b92383c5f 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -23,6 +23,7 @@ #define ZIIRAVE_TIMEOUT_MIN 3 #define ZIIRAVE_TIMEOUT_MAX 255 +#define ZIIRAVE_TIMEOUT_DEFAULT 30 #define ZIIRAVE_PING_VALUE 0x0 @@ -673,22 +674,21 @@ static int ziirave_wdt_probe(struct i2c_client *client, return val; } - if (val < ZIIRAVE_TIMEOUT_MIN) - return -ENODEV; + if (val > ZIIRAVE_TIMEOUT_MAX || + val < ZIIRAVE_TIMEOUT_MIN) + val = ZIIRAVE_TIMEOUT_DEFAULT; w_priv->wdd.timeout = val; - } else { - ret = ziirave_wdt_set_timeout(&w_priv->wdd, - w_priv->wdd.timeout); - if (ret) { - dev_err(&client->dev, "Failed to set timeout\n"); - return ret; - } - - dev_info(&client->dev, "Timeout set to %ds\n", - w_priv->wdd.timeout); } + ret = ziirave_wdt_set_timeout(&w_priv->wdd, w_priv->wdd.timeout); + if (ret) { + dev_err(&client->dev, "Failed to set timeout\n"); + return ret; + } + + dev_info(&client->dev, "Timeout set to %ds\n", w_priv->wdd.timeout); + watchdog_set_nowayout(&w_priv->wdd, nowayout); i2c_set_clientdata(client, w_priv); From 42abc12464f74feb2e868fba439c713d56ef9573 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:49 -0700 Subject: [PATCH 0508/2880] watchdog: ziirave_wdt: Log bootloader/firmware info during probe Log bootloader/firmware info during probe. This information is available via sysfs already, but it's really helpful to have this in kernel log during startup as well. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-6-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index a11b92383c5f..75c066602c00 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -69,6 +69,9 @@ static char *ziirave_reasons[] = {"power cycle", "hw watchdog", NULL, NULL, #define ZIIRAVE_CMD_JUMP_TO_BOOTLOADER 0x0c #define ZIIRAVE_CMD_DOWNLOAD_PACKET 0x0e +#define ZIIRAVE_FW_VERSION_FMT "02.%02u.%02u" +#define ZIIRAVE_BL_VERSION_FMT "01.%02u.%02u" + struct ziirave_wdt_rev { unsigned char major; unsigned char minor; @@ -489,7 +492,7 @@ static ssize_t ziirave_wdt_sysfs_show_firm(struct device *dev, if (ret) return ret; - ret = sprintf(buf, "02.%02u.%02u", w_priv->firmware_rev.major, + ret = sprintf(buf, ZIIRAVE_FW_VERSION_FMT, w_priv->firmware_rev.major, w_priv->firmware_rev.minor); mutex_unlock(&w_priv->sysfs_mutex); @@ -512,7 +515,7 @@ static ssize_t ziirave_wdt_sysfs_show_boot(struct device *dev, if (ret) return ret; - ret = sprintf(buf, "01.%02u.%02u", w_priv->bootloader_rev.major, + ret = sprintf(buf, ZIIRAVE_BL_VERSION_FMT, w_priv->bootloader_rev.major, w_priv->bootloader_rev.minor); mutex_unlock(&w_priv->sysfs_mutex); @@ -579,7 +582,8 @@ static ssize_t ziirave_wdt_sysfs_store_firm(struct device *dev, goto unlock_mutex; } - dev_info(&client->dev, "Firmware updated to version 02.%02u.%02u\n", + dev_info(&client->dev, + "Firmware updated to version " ZIIRAVE_FW_VERSION_FMT "\n", w_priv->firmware_rev.major, w_priv->firmware_rev.minor); /* Restore the watchdog timeout */ @@ -716,6 +720,10 @@ static int ziirave_wdt_probe(struct i2c_client *client, return ret; } + dev_info(&client->dev, + "Firmware version: " ZIIRAVE_FW_VERSION_FMT "\n", + w_priv->firmware_rev.major, w_priv->firmware_rev.minor); + ret = ziirave_wdt_revision(client, &w_priv->bootloader_rev, ZIIRAVE_WDT_BOOT_VER_MAJOR); if (ret) { @@ -723,6 +731,10 @@ static int ziirave_wdt_probe(struct i2c_client *client, return ret; } + dev_info(&client->dev, + "Bootloader version: " ZIIRAVE_BL_VERSION_FMT "\n", + w_priv->bootloader_rev.major, w_priv->bootloader_rev.minor); + w_priv->reset_reason = i2c_smbus_read_byte_data(client, ZIIRAVE_WDT_RESET_REASON); if (w_priv->reset_reason < 0) { From 5870f4958ccf90f4cdf3911bade77cb72a3bf28c Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:50 -0700 Subject: [PATCH 0509/2880] watchdog: ziirave_wdt: Simplify ziirave_firm_write_pkt() There no reason why ziirave_firm_write_pkt() has to take firmware data via 'struct ihex_binrec' and it can just take address, data pointer and data length as individual arguments. Make this change to allow us to drastically simplify handling page crossing case by removing all of the extra code required to split 'struct ihex_binrec' into two. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-7-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 116 +++++++++++++-------------------- 1 file changed, 44 insertions(+), 72 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 75c066602c00..b2d5ff0c22fe 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -51,6 +51,7 @@ static char *ziirave_reasons[] = {"power cycle", "hw watchdog", NULL, NULL, #define ZIIRAVE_FIRM_PKT_DATA_SIZE 16 #define ZIIRAVE_FIRM_FLASH_MEMORY_START 0x1600 #define ZIIRAVE_FIRM_FLASH_MEMORY_END 0x2bbf +#define ZIIRAVE_FIRM_PAGE_SIZE 128 /* Received and ready for next Download packet. */ #define ZIIRAVE_FIRM_DOWNLOAD_ACK 1 @@ -244,27 +245,26 @@ static int ziirave_firm_write_byte(struct watchdog_device *wdd, u8 command, * Data0 .. Data15: Array of 16 bytes of data. * Checksum: Checksum byte to verify data integrity. */ -static int ziirave_firm_write_pkt(struct watchdog_device *wdd, - const struct ihex_binrec *rec) +static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, + u32 addr, const u8 *data, u8 len) { + const u16 addr16 = (u16)addr / 2; struct i2c_client *client = to_i2c_client(wdd->parent); u8 i, checksum = 0, packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE]; int ret; - u16 addr; memset(packet, 0, ARRAY_SIZE(packet)); /* Packet length */ - packet[0] = (u8)be16_to_cpu(rec->len); + packet[0] = len; /* Packet address */ - addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1; - packet[1] = addr & 0xff; - packet[2] = (addr & 0xff00) >> 8; + packet[1] = addr16 & 0xff; + packet[2] = (addr16 & 0xff00) >> 8; /* Packet data */ - if (be16_to_cpu(rec->len) > ZIIRAVE_FIRM_PKT_DATA_SIZE) + if (len > ZIIRAVE_FIRM_PKT_DATA_SIZE) return -EMSGSIZE; - memcpy(packet + 3, rec->data, be16_to_cpu(rec->len)); + memcpy(packet + 3, data, len); /* Packet checksum */ for (i = 0; i < ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1; i++) @@ -276,11 +276,35 @@ static int ziirave_firm_write_pkt(struct watchdog_device *wdd, if (ret) dev_err(&client->dev, "Failed to write firmware packet at address 0x%04x: %d\n", - addr, ret); + addr16, ret); return ret; } +static int ziirave_firm_write_pkt(struct watchdog_device *wdd, + u32 addr, const u8 *data, u8 len) +{ + const u8 max_write_len = ZIIRAVE_FIRM_PAGE_SIZE - + (addr - ALIGN_DOWN(addr, ZIIRAVE_FIRM_PAGE_SIZE)); + int ret; + + if (len > max_write_len) { + /* + * If data crossed page boundary we need to split this + * write in two + */ + ret = __ziirave_firm_write_pkt(wdd, addr, data, max_write_len); + if (ret) + return ret; + + addr += max_write_len; + data += max_write_len; + len -= max_write_len; + } + + return __ziirave_firm_write_pkt(wdd, addr, data, len); +} + static int ziirave_firm_verify(struct watchdog_device *wdd, const struct firmware *fw) { @@ -333,9 +357,8 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, const struct firmware *fw) { struct i2c_client *client = to_i2c_client(wdd->parent); - int ret, words_till_page_break; const struct ihex_binrec *rec; - struct ihex_binrec *rec_new; + int ret; ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_JUMP_TO_BOOTLOADER, 1, false); @@ -366,68 +389,17 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, return -EMSGSIZE; } - /* Calculate words till page break */ - words_till_page_break = (64 - ((be32_to_cpu(rec->addr) >> 1) & - 0x3f)); - if ((be16_to_cpu(rec->len) >> 1) > words_till_page_break) { - /* - * Data in passes page boundary, so we need to split in - * two blocks of data. Create a packet with the first - * block of data. - */ - rec_new = kzalloc(sizeof(struct ihex_binrec) + - (words_till_page_break << 1), - GFP_KERNEL); - if (!rec_new) - return -ENOMEM; - - rec_new->len = cpu_to_be16(words_till_page_break << 1); - rec_new->addr = rec->addr; - memcpy(rec_new->data, rec->data, - be16_to_cpu(rec_new->len)); - - ret = ziirave_firm_write_pkt(wdd, rec_new); - kfree(rec_new); - if (ret) - return ret; - - /* Create a packet with the second block of data */ - rec_new = kzalloc(sizeof(struct ihex_binrec) + - be16_to_cpu(rec->len) - - (words_till_page_break << 1), - GFP_KERNEL); - if (!rec_new) - return -ENOMEM; - - /* Remaining bytes */ - rec_new->len = rec->len - - cpu_to_be16(words_till_page_break << 1); - - rec_new->addr = cpu_to_be32(be32_to_cpu(rec->addr) + - (words_till_page_break << 1)); - - memcpy(rec_new->data, - rec->data + (words_till_page_break << 1), - be16_to_cpu(rec_new->len)); - - ret = ziirave_firm_write_pkt(wdd, rec_new); - kfree(rec_new); - if (ret) - return ret; - } else { - ret = ziirave_firm_write_pkt(wdd, rec); - if (ret) - return ret; - } + ret = ziirave_firm_write_pkt(wdd, be32_to_cpu(rec->addr), + rec->data, be16_to_cpu(rec->len)); + if (ret) + return ret; } - /* For end of download, the length field will be set to 0 */ - rec_new = kzalloc(sizeof(struct ihex_binrec) + 1, GFP_KERNEL); - if (!rec_new) - return -ENOMEM; - - ret = ziirave_firm_write_pkt(wdd, rec_new); - kfree(rec_new); + /* + * Finish firmware download process by sending a zero length + * payload + */ + ret = ziirave_firm_write_pkt(wdd, 0, NULL, 0); if (ret) { dev_err(&client->dev, "Failed to send EMPTY packet: %d\n", ret); return ret; From 08188e8dbc7587d58948efca493219515d6a5639 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:51 -0700 Subject: [PATCH 0510/2880] watchdog: ziirave_wdt: Check packet length only once We don't need to check for packet length more than once, so drop the extra check in ziirave_firm_upload(). While at it move the check at the very start of __ziirave_firm_write_pkt(), as to not waste any time preparing a packet we'll never use. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-8-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index b2d5ff0c22fe..9d9c669b8b47 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -253,6 +253,13 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, u8 i, checksum = 0, packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE]; int ret; + /* Check max data size */ + if (len > ZIIRAVE_FIRM_PKT_DATA_SIZE) { + dev_err(&client->dev, "Firmware packet too long (%d)\n", + len); + return -EMSGSIZE; + } + memset(packet, 0, ARRAY_SIZE(packet)); /* Packet length */ @@ -261,9 +268,6 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, packet[1] = addr16 & 0xff; packet[2] = (addr16 & 0xff00) >> 8; - /* Packet data */ - if (len > ZIIRAVE_FIRM_PKT_DATA_SIZE) - return -EMSGSIZE; memcpy(packet + 3, data, len); /* Packet checksum */ @@ -382,13 +386,6 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, if (!be16_to_cpu(rec->len)) break; - /* Check max data size */ - if (be16_to_cpu(rec->len) > ZIIRAVE_FIRM_PKT_DATA_SIZE) { - dev_err(&client->dev, "Firmware packet too long (%d)\n", - be16_to_cpu(rec->len)); - return -EMSGSIZE; - } - ret = ziirave_firm_write_pkt(wdd, be32_to_cpu(rec->addr), rec->data, be16_to_cpu(rec->len)); if (ret) From dc0dd28951f16be9714e90468e14566d186a7388 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:52 -0700 Subject: [PATCH 0511/2880] watchdog: ziirave_wdt: Skip zeros when calculating checksum Zeros don't contribute anything to checksum value, so we can skip unused portion of the packet when calculating its checksum. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-9-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 9d9c669b8b47..19da0910c2d1 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -271,7 +271,7 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, memcpy(packet + 3, data, len); /* Packet checksum */ - for (i = 0; i < ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1; i++) + for (i = 0; i < len + 3; i++) checksum += packet[i]; packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1] = checksum; From e6bd448653d61d17ddf9a663ca84d1f422846907 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:53 -0700 Subject: [PATCH 0512/2880] watchdog: ziirave_wdt: Fix incorrect use of ARRAY_SIZE Both memset() and ziirave_firm_write_block_data() expect length in bytes as an argument, not a number of elements in array. It just happens that in this particular case both values are equal. Modify the code to use sizeof() instead. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-10-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 19da0910c2d1..e0f55cbdc603 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -203,7 +203,7 @@ static int ziirave_firm_set_read_addr(struct watchdog_device *wdd, u16 addr) return i2c_smbus_write_block_data(client, ZIIRAVE_CMD_DOWNLOAD_SET_READ_ADDR, - ARRAY_SIZE(address), address); + sizeof(address), address); } static int ziirave_firm_write_block_data(struct watchdog_device *wdd, @@ -260,7 +260,7 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, return -EMSGSIZE; } - memset(packet, 0, ARRAY_SIZE(packet)); + memset(packet, 0, sizeof(packet)); /* Packet length */ packet[0] = len; @@ -276,7 +276,7 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1] = checksum; ret = ziirave_firm_write_block_data(wdd, ZIIRAVE_CMD_DOWNLOAD_PACKET, - ARRAY_SIZE(packet), packet, true); + sizeof(packet), packet, true); if (ret) dev_err(&client->dev, "Failed to write firmware packet at address 0x%04x: %d\n", From 10f98fef7ba65fcb74129296631adc99750f1a96 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:54 -0700 Subject: [PATCH 0513/2880] watchdog: ziirave_wdt: Zero out only what's necessary Instead of zeroing out all of the packet and then overwriting a significant portion of those zeros via memcpy(), zero out only a portion of the packet that is known to not contain any data. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-11-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index e0f55cbdc603..69694f2836d7 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -260,8 +260,6 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, return -EMSGSIZE; } - memset(packet, 0, sizeof(packet)); - /* Packet length */ packet[0] = len; /* Packet address */ @@ -269,6 +267,7 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, packet[2] = (addr16 & 0xff00) >> 8; memcpy(packet + 3, data, len); + memset(packet + 3 + len, 0, ZIIRAVE_FIRM_PKT_DATA_SIZE - len); /* Packet checksum */ for (i = 0; i < len + 3; i++) From 08f980a8ffc4c0891b3998f588c1b02fe57caec9 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:55 -0700 Subject: [PATCH 0514/2880] watchdog: ziirave_wdt: Make use of put_unaligned_le16 Instead of doing this explicitly use put_unaligned_le16() to place 16-bit address value into command payload. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-12-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 69694f2836d7..38cf3ca329d7 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -21,6 +21,8 @@ #include #include +#include + #define ZIIRAVE_TIMEOUT_MIN 3 #define ZIIRAVE_TIMEOUT_MAX 255 #define ZIIRAVE_TIMEOUT_DEFAULT 30 @@ -198,8 +200,7 @@ static int ziirave_firm_set_read_addr(struct watchdog_device *wdd, u16 addr) struct i2c_client *client = to_i2c_client(wdd->parent); u8 address[2]; - address[0] = addr & 0xff; - address[1] = (addr >> 8) & 0xff; + put_unaligned_le16(addr, address); return i2c_smbus_write_block_data(client, ZIIRAVE_CMD_DOWNLOAD_SET_READ_ADDR, @@ -263,8 +264,7 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, /* Packet length */ packet[0] = len; /* Packet address */ - packet[1] = addr16 & 0xff; - packet[2] = (addr16 & 0xff00) >> 8; + put_unaligned_le16(addr16, packet + 1); memcpy(packet + 3, data, len); memset(packet + 3 + len, 0, ZIIRAVE_FIRM_PKT_DATA_SIZE - len); From d91bb8d9162586bc17afb7ffd28b03cce064e6f4 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:56 -0700 Subject: [PATCH 0515/2880] watchdog: ziirave_wdt: Don't check if ihex record length is zero Ihex_next_binrec() will return NULL if next record's 'len' is zero, so explicit checks for that in the driver are unnecessary. Drop them. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-13-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 38cf3ca329d7..4b95467bf239 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -318,10 +318,6 @@ static int ziirave_firm_verify(struct watchdog_device *wdd, u16 addr; for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) { - /* Zero length marks end of records */ - if (!be16_to_cpu(rec->len)) - break; - addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1; if (addr < ZIIRAVE_FIRM_FLASH_MEMORY_START || addr > ZIIRAVE_FIRM_FLASH_MEMORY_END) @@ -381,10 +377,6 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, msleep(500); for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) { - /* Zero length marks end of records */ - if (!be16_to_cpu(rec->len)) - break; - ret = ziirave_firm_write_pkt(wdd, be32_to_cpu(rec->addr), rec->data, be16_to_cpu(rec->len)); if (ret) From de88053807d848a504a65fbe7ea03c9aedc16b90 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:57 -0700 Subject: [PATCH 0516/2880] watchdog: ziirave_wdt: Don't read out more than 'len' firmware bytes We only compare first 'len' bytes of read firmware, so we don't need to read more that that. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-14-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 4b95467bf239..f05095b08016 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -318,6 +318,8 @@ static int ziirave_firm_verify(struct watchdog_device *wdd, u16 addr; for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) { + const u16 len = be16_to_cpu(rec->len); + addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1; if (addr < ZIIRAVE_FIRM_FLASH_MEMORY_START || addr > ZIIRAVE_FIRM_FLASH_MEMORY_END) @@ -331,7 +333,7 @@ static int ziirave_firm_verify(struct watchdog_device *wdd, return ret; } - for (i = 0; i < ARRAY_SIZE(data); i++) { + for (i = 0; i < len; i++) { ret = i2c_smbus_read_byte_data(client, ZIIRAVE_CMD_DOWNLOAD_READ_BYTE); if (ret < 0) { @@ -342,7 +344,7 @@ static int ziirave_firm_verify(struct watchdog_device *wdd, data[i] = ret; } - if (memcmp(data, rec->data, be16_to_cpu(rec->len))) { + if (memcmp(data, rec->data, len)) { dev_err(&client->dev, "Firmware mismatch at address 0x%04x\n", addr); return -EINVAL; From d2ddc4505ed268a57bfc07817bc0494aca6d8818 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:58 -0700 Subject: [PATCH 0517/2880] watchdog: ziirave_wdt: Don't try to program readonly flash Bootloader code will ignore any attempts to write data to any flash area outside of [ZIIRAVE_FIRM_FLASH_MEMORY_START; ZIIRAVE_FIRM_FLASH_MEMORY_END]. Firmware update code already have an appropriate check to skip those areas when validating updated firmware. Firmware programming code, OTOH, does not and will needlessly send no-op I2C traffic. Add an appropriate check to __ziirave_firm_write_pkt() so as to save all of that wasted effort. While at it, normalize all of the address handling code to use full 32-bit address in units of bytes and convert it to an appropriate value only in places where that is necessary. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-15-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index f05095b08016..a3cc936858ad 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -51,8 +51,8 @@ static char *ziirave_reasons[] = {"power cycle", "hw watchdog", NULL, NULL, #define ZIIRAVE_FIRM_PKT_TOTAL_SIZE 20 #define ZIIRAVE_FIRM_PKT_DATA_SIZE 16 -#define ZIIRAVE_FIRM_FLASH_MEMORY_START 0x1600 -#define ZIIRAVE_FIRM_FLASH_MEMORY_END 0x2bbf +#define ZIIRAVE_FIRM_FLASH_MEMORY_START (2 * 0x1600) +#define ZIIRAVE_FIRM_FLASH_MEMORY_END (2 * 0x2bbf) #define ZIIRAVE_FIRM_PAGE_SIZE 128 /* Received and ready for next Download packet. */ @@ -195,12 +195,13 @@ static int ziirave_firm_wait_for_ack(struct watchdog_device *wdd) return ret == ZIIRAVE_FIRM_DOWNLOAD_ACK ? 0 : -EIO; } -static int ziirave_firm_set_read_addr(struct watchdog_device *wdd, u16 addr) +static int ziirave_firm_set_read_addr(struct watchdog_device *wdd, u32 addr) { struct i2c_client *client = to_i2c_client(wdd->parent); + const u16 addr16 = (u16)addr / 2; u8 address[2]; - put_unaligned_le16(addr, address); + put_unaligned_le16(addr16, address); return i2c_smbus_write_block_data(client, ZIIRAVE_CMD_DOWNLOAD_SET_READ_ADDR, @@ -234,6 +235,12 @@ static int ziirave_firm_write_byte(struct watchdog_device *wdd, u8 command, wait_for_ack); } +static bool ziirave_firm_addr_readonly(u32 addr) +{ + return addr < ZIIRAVE_FIRM_FLASH_MEMORY_START || + addr > ZIIRAVE_FIRM_FLASH_MEMORY_END; +} + /* * ziirave_firm_write_pkt() - Build and write a firmware packet * @@ -261,6 +268,16 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, return -EMSGSIZE; } + /* + * Ignore packets that are targeting program memory outisde of + * app partition, since they will be ignored by the + * bootloader. At the same time, we need to make sure we'll + * allow zero length packet that will be sent as the last step + * of firmware update + */ + if (len && ziirave_firm_addr_readonly(addr)) + return 0; + /* Packet length */ packet[0] = len; /* Packet address */ @@ -279,7 +296,7 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, if (ret) dev_err(&client->dev, "Failed to write firmware packet at address 0x%04x: %d\n", - addr16, ret); + addr, ret); return ret; } @@ -315,14 +332,12 @@ static int ziirave_firm_verify(struct watchdog_device *wdd, const struct ihex_binrec *rec; int i, ret; u8 data[ZIIRAVE_FIRM_PKT_DATA_SIZE]; - u16 addr; for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) { const u16 len = be16_to_cpu(rec->len); + const u32 addr = be32_to_cpu(rec->addr); - addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1; - if (addr < ZIIRAVE_FIRM_FLASH_MEMORY_START || - addr > ZIIRAVE_FIRM_FLASH_MEMORY_END) + if (ziirave_firm_addr_readonly(addr)) continue; ret = ziirave_firm_set_read_addr(wdd, addr); From d2c1d4258f7fbcfe386795c83abc9e6261ed9397 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:08:59 -0700 Subject: [PATCH 0518/2880] watchdog: ziirave_wdt: Fix misleading error message Fix misleading error message in ziirave_wdt_init_duration(). Saying "unable to set ..." implies that an attempt at communication with watchdog device has taken palce and was not successful. In this case, however, all it indicates is that no reset pulse duration was specified either via kernel parameter or Device Tree. Re-phase the log message to be more clear about benign nature of this event. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-16-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index a3cc936858ad..0c150f3cf408 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -603,7 +603,7 @@ static int ziirave_wdt_init_duration(struct i2c_client *client) &reset_duration); if (ret) { dev_info(&client->dev, - "Unable to set reset pulse duration, using default\n"); + "No reset pulse duration specified, using default\n"); return 0; } } From 910d0f968727b9e456e440596540f4059f8e74cf Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:09:00 -0700 Subject: [PATCH 0519/2880] watchdog: ziirave_wdt: Fix JUMP_TO_BOOTLOADER payload Bootloader firmware expects the following traffic for JUMP_TO_BOOTLOADER: S Addr Wr [A] 0x0c [A] 0x01 [A] P using ziirave_firm_write_byte() will result in S Addr Wr [A] 0x0c [A] 0x01 [A] 0x01 [A] P which happens to work because firmware will ignore any extra bytes and expected magic value matches byte count sent by i2c_smbus_write_block_data(). Fix this by converting the code to use i2c_smbus_write_byte_data() instead. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-17-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 0c150f3cf408..0185b9175cc0 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -72,6 +72,8 @@ static char *ziirave_reasons[] = {"power cycle", "hw watchdog", NULL, NULL, #define ZIIRAVE_CMD_JUMP_TO_BOOTLOADER 0x0c #define ZIIRAVE_CMD_DOWNLOAD_PACKET 0x0e +#define ZIIRAVE_CMD_JUMP_TO_BOOTLOADER_MAGIC 1 + #define ZIIRAVE_FW_VERSION_FMT "02.%02u.%02u" #define ZIIRAVE_BL_VERSION_FMT "01.%02u.%02u" @@ -376,8 +378,9 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, const struct ihex_binrec *rec; int ret; - ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_JUMP_TO_BOOTLOADER, 1, - false); + ret = i2c_smbus_write_byte_data(client, + ZIIRAVE_CMD_JUMP_TO_BOOTLOADER, + ZIIRAVE_CMD_JUMP_TO_BOOTLOADER_MAGIC); if (ret) { dev_err(&client->dev, "Failed to jump to bootloader\n"); return ret; From c47825fb72ea660b7d15f40dc7f8a73dcdd1c670 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:09:01 -0700 Subject: [PATCH 0520/2880] watchdog: ziirave_wdt: Fix DOWNLOAD_END payload Bootloader firmware expects the following traffic for DOWNLOAD_END: S Addr Wr [A] 0x11 [A] P using ziirave_firm_write_byte() will result in S Addr Wr [A] 0x11 [A] 0x01 [A] 0x01 [A] P which happens to work because firmware will ignore any extra bytes sent. Fix this by converting the code to use i2c_smbus_write_byte() instead. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-18-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 0185b9175cc0..a598780d73d3 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -425,7 +425,7 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, } /* End download operation */ - ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_END, 1, false); + ret = i2c_smbus_write_byte(client, ZIIRAVE_CMD_DOWNLOAD_END); if (ret) { dev_err(&client->dev, "Failed to end firmware download: %d\n", ret); From 0007cbd517a23fe5e46328baec89ffee0269f780 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:09:02 -0700 Subject: [PATCH 0521/2880] watchdog: ziirave_wdt: Fix RESET_PROCESSOR payload Bootloader firmware expects the following traffic for RESET_PROCESSOR: S Addr Wr [A] 0x0b [A] 0x01 [A] P using ziirave_firm_write_byte() will result in S Addr Wr [A] 0x0b [A] 0x01 [A] 0x01 [A] P which happens to work because firmware will ignore any extra bytes and expected magic value matches byte count sent by i2c_smbus_write_block_data(). Fix this by converting the code to use i2c_smbus_write_byte_data() instead. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-19-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index a598780d73d3..92df27350e41 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -73,6 +73,7 @@ static char *ziirave_reasons[] = {"power cycle", "hw watchdog", NULL, NULL, #define ZIIRAVE_CMD_DOWNLOAD_PACKET 0x0e #define ZIIRAVE_CMD_JUMP_TO_BOOTLOADER_MAGIC 1 +#define ZIIRAVE_CMD_RESET_PROCESSOR_MAGIC 1 #define ZIIRAVE_FW_VERSION_FMT "02.%02u.%02u" #define ZIIRAVE_BL_VERSION_FMT "01.%02u.%02u" @@ -433,8 +434,9 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, } /* Reset the processor */ - ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_RESET_PROCESSOR, 1, - false); + ret = i2c_smbus_write_byte_data(client, + ZIIRAVE_CMD_RESET_PROCESSOR, + ZIIRAVE_CMD_RESET_PROCESSOR_MAGIC); if (ret) { dev_err(&client->dev, "Failed to reset the watchdog: %d\n", ret); From fe05178c7891c546e07e24836c0af967996766c6 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:09:03 -0700 Subject: [PATCH 0522/2880] watchdog: ziirave_wdt: Drop status polling code Bootloader firmware doesn't implement DOWNLOAD_START or DOWNLOAD_PACKET in a non-blocking way. It will stretch the clock of the first status byte read until the operation is complete. Polling for the status is not really necessary since it will always succed on the first try. Replace polling code with a simple single byte read to simplify things. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-20-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 92df27350e41..681f65349779 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -57,11 +57,6 @@ static char *ziirave_reasons[] = {"power cycle", "hw watchdog", NULL, NULL, /* Received and ready for next Download packet. */ #define ZIIRAVE_FIRM_DOWNLOAD_ACK 1 -/* Currently writing to flash. Retry Download status in a moment! */ -#define ZIIRAVE_FIRM_DOWNLOAD_BUSY 2 - -/* Wait for ACK timeout in ms */ -#define ZIIRAVE_FIRM_WAIT_FOR_ACK_TIMEOUT 50 /* Firmware commands */ #define ZIIRAVE_CMD_DOWNLOAD_START 0x10 @@ -175,25 +170,16 @@ static unsigned int ziirave_wdt_get_timeleft(struct watchdog_device *wdd) return ret; } -static int ziirave_firm_wait_for_ack(struct watchdog_device *wdd) +static int ziirave_firm_read_ack(struct watchdog_device *wdd) { struct i2c_client *client = to_i2c_client(wdd->parent); int ret; - unsigned long timeout; - timeout = jiffies + msecs_to_jiffies(ZIIRAVE_FIRM_WAIT_FOR_ACK_TIMEOUT); - do { - if (time_after(jiffies, timeout)) - return -ETIMEDOUT; - - usleep_range(5000, 10000); - - ret = i2c_smbus_read_byte(client); - if (ret < 0) { - dev_err(&client->dev, "Failed to read byte\n"); - return ret; - } - } while (ret == ZIIRAVE_FIRM_DOWNLOAD_BUSY); + ret = i2c_smbus_read_byte(client); + if (ret < 0) { + dev_err(&client->dev, "Failed to read status byte\n"); + return ret; + } return ret == ZIIRAVE_FIRM_DOWNLOAD_ACK ? 0 : -EIO; } @@ -226,7 +212,7 @@ static int ziirave_firm_write_block_data(struct watchdog_device *wdd, } if (wait_for_ack) - ret = ziirave_firm_wait_for_ack(wdd); + ret = ziirave_firm_read_ack(wdd); return ret; } From fa0d2f44aa6879e21843d517138510ccb7e1e39f Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:09:04 -0700 Subject: [PATCH 0523/2880] watchdog: ziirave_wdt: Fix DOWNLOAD_START payload Bootloader firmware expects the following traffic for DOWNLOAD_END: S Addr Wr [A] 0x10 [A] P using ziirave_firm_write_byte() will result in S Addr Wr [A] 0x10 [A] 0x01 [A] 0x01 [A] P which happens to work because firmware will ignore any extra bytes sent. Fix this by converting the code to use i2c_smbus_write_byte() instead. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-21-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 681f65349779..ed69fa82e09c 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -217,13 +217,6 @@ static int ziirave_firm_write_block_data(struct watchdog_device *wdd, return ret; } -static int ziirave_firm_write_byte(struct watchdog_device *wdd, u8 command, - u8 byte, bool wait_for_ack) -{ - return ziirave_firm_write_block_data(wdd, command, 1, &byte, - wait_for_ack); -} - static bool ziirave_firm_addr_readonly(u32 addr) { return addr < ZIIRAVE_FIRM_FLASH_MEMORY_START || @@ -375,12 +368,18 @@ static int ziirave_firm_upload(struct watchdog_device *wdd, msleep(500); - ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_START, 1, true); + ret = i2c_smbus_write_byte(client, ZIIRAVE_CMD_DOWNLOAD_START); if (ret) { dev_err(&client->dev, "Failed to start download\n"); return ret; } + ret = ziirave_firm_read_ack(wdd); + if (ret) { + dev_err(&client->dev, "No ACK for start download\n"); + return ret; + } + msleep(500); for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) { From 08c913fe3ea67c54318b91ed3d29cd98d8012ab9 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:09:05 -0700 Subject: [PATCH 0524/2880] watchdog: ziirave_wdt: Drop ziirave_firm_write_block_data() There's only one user of ziirave_firm_write_block_data(), so we may as well inline it. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-22-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index ed69fa82e09c..48278034cda6 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -197,26 +197,6 @@ static int ziirave_firm_set_read_addr(struct watchdog_device *wdd, u32 addr) sizeof(address), address); } -static int ziirave_firm_write_block_data(struct watchdog_device *wdd, - u8 command, u8 length, const u8 *data, - bool wait_for_ack) -{ - struct i2c_client *client = to_i2c_client(wdd->parent); - int ret; - - ret = i2c_smbus_write_block_data(client, command, length, data); - if (ret) { - dev_err(&client->dev, - "Failed to send command 0x%02x: %d\n", command, ret); - return ret; - } - - if (wait_for_ack) - ret = ziirave_firm_read_ack(wdd); - - return ret; -} - static bool ziirave_firm_addr_readonly(u32 addr) { return addr < ZIIRAVE_FIRM_FLASH_MEMORY_START || @@ -273,8 +253,15 @@ static int __ziirave_firm_write_pkt(struct watchdog_device *wdd, checksum += packet[i]; packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1] = checksum; - ret = ziirave_firm_write_block_data(wdd, ZIIRAVE_CMD_DOWNLOAD_PACKET, - sizeof(packet), packet, true); + ret = i2c_smbus_write_block_data(client, ZIIRAVE_CMD_DOWNLOAD_PACKET, + sizeof(packet), packet); + if (ret) { + dev_err(&client->dev, + "Failed to send DOWNLOAD_PACKET: %d\n", ret); + return ret; + } + + ret = ziirave_firm_read_ack(wdd); if (ret) dev_err(&client->dev, "Failed to write firmware packet at address 0x%04x: %d\n", From f676ac8305f776c31f347080d64a2474cffbcab1 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 12 Aug 2019 13:09:06 -0700 Subject: [PATCH 0525/2880] watchdog: ziirave_wdt: Update checked I2C functionality mask Update checked I2C functionality mask to reflect all of the SMBus primitives used by this driver. Signed-off-by: Andrey Smirnov Cc: Chris Healy Cc: Guenter Roeck Cc: Rick Ramstetter Cc: linux-watchdog@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190812200906.31344-23-andrew.smirnov@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ziirave_wdt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c index 48278034cda6..4a363a8b2d20 100644 --- a/drivers/watchdog/ziirave_wdt.c +++ b/drivers/watchdog/ziirave_wdt.c @@ -602,7 +602,10 @@ static int ziirave_wdt_probe(struct i2c_client *client, struct ziirave_wdt_data *w_priv; int val; - if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA)) + if (!i2c_check_functionality(client->adapter, + I2C_FUNC_SMBUS_BYTE | + I2C_FUNC_SMBUS_BYTE_DATA | + I2C_FUNC_SMBUS_WRITE_BLOCK_DATA)) return -ENODEV; w_priv = devm_kzalloc(&client->dev, sizeof(*w_priv), GFP_KERNEL); From ff45d87dd8a85ab66027540f7b9c61d3eee2c992 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Mon, 19 Aug 2019 14:47:37 +0930 Subject: [PATCH 0526/2880] dt-bindings: watchdog: Add ast2600 compatible This adds a compatible for the ast2600, a new ASPEED SoC. Signed-off-by: Joel Stanley Reviewed-by: Andrew Jeffery Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190819051738.17370-2-joel@jms.id.au Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/aspeed-wdt.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/watchdog/aspeed-wdt.txt b/Documentation/devicetree/bindings/watchdog/aspeed-wdt.txt index c5077a1f5cb3..d78d4a8fb868 100644 --- a/Documentation/devicetree/bindings/watchdog/aspeed-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/aspeed-wdt.txt @@ -4,6 +4,7 @@ Required properties: - compatible: must be one of: - "aspeed,ast2400-wdt" - "aspeed,ast2500-wdt" + - "aspeed,ast2600-wdt" - reg: physical base address of the controller and length of memory mapped region From b3528b4874480818e38e4da019d655413c233e6a Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Mon, 19 Aug 2019 14:47:38 +0930 Subject: [PATCH 0527/2880] watchdog: aspeed: Add support for AST2600 The ast2600 can be supported by the same code as the ast2500. Signed-off-by: Ryan Chen Signed-off-by: Joel Stanley Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190819051738.17370-3-joel@jms.id.au Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/aspeed_wdt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/aspeed_wdt.c b/drivers/watchdog/aspeed_wdt.c index cc71861e033a..5b64bc2e8788 100644 --- a/drivers/watchdog/aspeed_wdt.c +++ b/drivers/watchdog/aspeed_wdt.c @@ -34,6 +34,7 @@ static const struct aspeed_wdt_config ast2500_config = { static const struct of_device_id aspeed_wdt_of_table[] = { { .compatible = "aspeed,ast2400-wdt", .data = &ast2400_config }, { .compatible = "aspeed,ast2500-wdt", .data = &ast2500_config }, + { .compatible = "aspeed,ast2600-wdt", .data = &ast2500_config }, { }, }; MODULE_DEVICE_TABLE(of, aspeed_wdt_of_table); @@ -259,7 +260,8 @@ static int aspeed_wdt_probe(struct platform_device *pdev) set_bit(WDOG_HW_RUNNING, &wdt->wdd.status); } - if (of_device_is_compatible(np, "aspeed,ast2500-wdt")) { + if ((of_device_is_compatible(np, "aspeed,ast2500-wdt")) || + (of_device_is_compatible(np, "aspeed,ast2600-wdt"))) { u32 reg = readl(wdt->base + WDT_RESET_WIDTH); reg &= config->ext_pulse_width_mask; From f9eaba57c183b216afe8fc2665fd885f1caa378d Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 19 Aug 2019 20:20:34 +0200 Subject: [PATCH 0528/2880] dt-bindings: watchdog: Add YAML schemas for the generic watchdog bindings The watchdogs have a bunch of generic properties that are needed in a device tree. Add a YAML schemas for those. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190821143835.7294-1-mripard@kernel.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/watchdog.yaml | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Documentation/devicetree/bindings/watchdog/watchdog.yaml diff --git a/Documentation/devicetree/bindings/watchdog/watchdog.yaml b/Documentation/devicetree/bindings/watchdog/watchdog.yaml new file mode 100644 index 000000000000..187bf6cb62bf --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/watchdog.yaml @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/watchdog.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Watchdog Generic Bindings + +maintainers: + - Guenter Roeck + - Wim Van Sebroeck + +description: | + This document describes generic bindings which can be used to + describe watchdog devices in a device tree. + +properties: + $nodename: + pattern: "^watchdog(@.*|-[0-9a-f])?$" + + timeout-sec: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Contains the watchdog timeout in seconds. + +... From 2de4de20dd36daef97d550e5895ed6daebbc3254 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 21 Aug 2019 16:38:31 +0200 Subject: [PATCH 0529/2880] dt-bindings: watchdog: Convert Allwinner watchdog to a schema The Allwinner SoCs have a watchdog supported in Linux, with a matching Device Tree binding. Now that we have the DT validation in place, let's convert the device tree bindings for that controller over to a YAML schemas. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190821143835.7294-2-mripard@kernel.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../watchdog/allwinner,sun4i-a10-wdt.yaml | 48 +++++++++++++++++++ .../bindings/watchdog/sunxi-wdt.txt | 22 --------- 2 files changed, 48 insertions(+), 22 deletions(-) create mode 100644 Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml delete mode 100644 Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt diff --git a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml new file mode 100644 index 000000000000..dc7553f57708 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/allwinner,sun4i-a10-wdt.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 Watchdog Device Tree Bindings + +allOf: + - $ref: "watchdog.yaml#" + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + compatible: + oneOf: + - const: allwinner,sun4i-a10-wdt + - const: allwinner,sun6i-a31-wdt + - items: + - const: allwinner,sun50i-a64-wdt + - const: allwinner,sun6i-a31-wdt + - items: + - const: allwinner,sun50i-h6-wdt + - const: allwinner,sun6i-a31-wdt + - items: + - const: allwinner,suniv-f1c100s-wdt + - const: allwinner,sun4i-a10-wdt + + reg: + maxItems: 1 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + wdt: watchdog@1c20c90 { + compatible = "allwinner,sun4i-a10-wdt"; + reg = <0x01c20c90 0x10>; + timeout-sec = <10>; + }; + +... diff --git a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt deleted file mode 100644 index e65198d82a2b..000000000000 --- a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt +++ /dev/null @@ -1,22 +0,0 @@ -Allwinner SoCs Watchdog timer - -Required properties: - -- compatible : should be one of - "allwinner,sun4i-a10-wdt" - "allwinner,sun6i-a31-wdt" - "allwinner,sun50i-a64-wdt","allwinner,sun6i-a31-wdt" - "allwinner,sun50i-h6-wdt","allwinner,sun6i-a31-wdt" - "allwinner,suniv-f1c100s-wdt", "allwinner,sun4i-a10-wdt" -- reg : Specifies base physical address and size of the registers. - -Optional properties: -- timeout-sec : Contains the watchdog timeout in seconds - -Example: - -wdt: watchdog@1c20c90 { - compatible = "allwinner,sun4i-a10-wdt"; - reg = <0x01c20c90 0x10>; - timeout-sec = <10>; -}; From f285e78fb77806900d0f938b9dd791974fb5e694 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 21 Aug 2019 16:38:32 +0200 Subject: [PATCH 0530/2880] dt-bindings: watchdog: sun4i: Add the watchdog interrupts The Allwinner watchdog has an interrupt, either shared or dedicated depending on the SoC, that has been described in some DT, but not all of them. The binding is also completely missing that description. Let's add that property to be consistent. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190821143835.7294-3-mripard@kernel.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/allwinner,sun4i-a10-wdt.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml index dc7553f57708..31c95c404619 100644 --- a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml @@ -31,9 +31,13 @@ properties: reg: maxItems: 1 + interrupts: + maxItems: 1 + required: - compatible - reg + - interrupts unevaluatedProperties: false @@ -42,6 +46,7 @@ examples: wdt: watchdog@1c20c90 { compatible = "allwinner,sun4i-a10-wdt"; reg = <0x01c20c90 0x10>; + interrupts = <24>; timeout-sec = <10>; }; From 284ec100d0910b6789b6079a0718120b48a51f37 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 21 Aug 2019 16:38:33 +0200 Subject: [PATCH 0531/2880] dt-bindings: watchdog: sun4i: Add the watchdog clock The Allwinner watchdog has a clock that has been described in some DT, but not all of them. The binding is also completely missing that description. Let's add that property to be consistent. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190821143835.7294-4-mripard@kernel.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/allwinner,sun4i-a10-wdt.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml index 31c95c404619..3a54f58683a0 100644 --- a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml @@ -31,12 +31,16 @@ properties: reg: maxItems: 1 + clocks: + maxItems: 1 + interrupts: maxItems: 1 required: - compatible - reg + - clocks - interrupts unevaluatedProperties: false @@ -47,6 +51,7 @@ examples: compatible = "allwinner,sun4i-a10-wdt"; reg = <0x01c20c90 0x10>; interrupts = <24>; + clocks = <&osc24M>; timeout-sec = <10>; }; From 69eb8b118631e5a6741edc90aac6d9139a43e9ec Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Wed, 28 Aug 2019 09:35:00 -0400 Subject: [PATCH 0532/2880] dt-bindings: watchdog: Add i.MX7ULP bindings Add the watchdog bindings for Freescale i.MX7ULP. Signed-off-by: Anson Huang Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1566999303-18795-1-git-send-email-Anson.Huang@nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/fsl-imx7ulp-wdt.txt | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.txt diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.txt b/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.txt new file mode 100644 index 000000000000..f902508d6cac --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.txt @@ -0,0 +1,22 @@ +* Freescale i.MX7ULP Watchdog Timer (WDT) Controller + +Required properties: +- compatible : Should be "fsl,imx7ulp-wdt" +- reg : Should contain WDT registers location and length +- interrupts : Should contain WDT interrupt +- clocks: Should contain a phandle pointing to the gated peripheral clock. + +Optional properties: +- timeout-sec : Contains the watchdog timeout in seconds + +Examples: + +wdog1: watchdog@403d0000 { + compatible = "fsl,imx7ulp-wdt"; + reg = <0x403d0000 0x10000>; + interrupts = ; + clocks = <&pcc2 IMX7ULP_CLK_WDG1>; + assigned-clocks = <&pcc2 IMX7ULP_CLK_WDG1>; + assigned-clocks-parents = <&scg1 IMX7ULP_CLK_FIRC_BUS_CLK>; + timeout-sec = <40>; +}; From 41b630f41bf744b0eed92a53ff8c716cfc71920a Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Wed, 28 Aug 2019 09:35:01 -0400 Subject: [PATCH 0533/2880] watchdog: Add i.MX7ULP watchdog support The i.MX7ULP Watchdog Timer (WDOG) module is an independent timer that is available for system use. It provides a safety feature to ensure that software is executing as planned and that the CPU is not stuck in an infinite loop or executing unintended code. If the WDOG module is not serviced (refreshed) within a certain period, it resets the MCU. Add driver support for i.MX7ULP watchdog. Signed-off-by: Anson Huang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1566999303-18795-2-git-send-email-Anson.Huang@nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 13 ++ drivers/watchdog/Makefile | 1 + drivers/watchdog/imx7ulp_wdt.c | 243 +++++++++++++++++++++++++++++++++ 3 files changed, 257 insertions(+) create mode 100644 drivers/watchdog/imx7ulp_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index a8f5c8147d4b..d68e5b500aef 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -724,6 +724,19 @@ config IMX_SC_WDT To compile this driver as a module, choose M here: the module will be called imx_sc_wdt. +config IMX7ULP_WDT + tristate "IMX7ULP Watchdog" + depends on ARCH_MXC || COMPILE_TEST + select WATCHDOG_CORE + help + This is the driver for the hardware watchdog on the Freescale + IMX7ULP and later processors. If you have one of these + processors and wish to have watchdog support enabled, + say Y, otherwise say N. + + To compile this driver as a module, choose M here: the + module will be called imx7ulp_wdt. + config UX500_WATCHDOG tristate "ST-Ericsson Ux500 watchdog" depends on MFD_DB8500_PRCMU diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index b5a0aed537af..2ee352bf3372 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_TS4800_WATCHDOG) += ts4800_wdt.o obj-$(CONFIG_TS72XX_WATCHDOG) += ts72xx_wdt.o obj-$(CONFIG_IMX2_WDT) += imx2_wdt.o obj-$(CONFIG_IMX_SC_WDT) += imx_sc_wdt.o +obj-$(CONFIG_IMX7ULP_WDT) += imx7ulp_wdt.o obj-$(CONFIG_UX500_WATCHDOG) += ux500_wdt.o obj-$(CONFIG_RETU_WATCHDOG) += retu_wdt.o obj-$(CONFIG_BCM2835_WDT) += bcm2835_wdt.o diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c new file mode 100644 index 000000000000..5ce51026989a --- /dev/null +++ b/drivers/watchdog/imx7ulp_wdt.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 NXP. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define WDOG_CS 0x0 +#define WDOG_CS_CMD32EN BIT(13) +#define WDOG_CS_ULK BIT(11) +#define WDOG_CS_RCS BIT(10) +#define WDOG_CS_EN BIT(7) +#define WDOG_CS_UPDATE BIT(5) + +#define WDOG_CNT 0x4 +#define WDOG_TOVAL 0x8 + +#define REFRESH_SEQ0 0xA602 +#define REFRESH_SEQ1 0xB480 +#define REFRESH ((REFRESH_SEQ1 << 16) | REFRESH_SEQ0) + +#define UNLOCK_SEQ0 0xC520 +#define UNLOCK_SEQ1 0xD928 +#define UNLOCK ((UNLOCK_SEQ1 << 16) | UNLOCK_SEQ0) + +#define DEFAULT_TIMEOUT 60 +#define MAX_TIMEOUT 128 +#define WDOG_CLOCK_RATE 1000 + +static bool nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, bool, 0000); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +struct imx7ulp_wdt_device { + struct notifier_block restart_handler; + struct watchdog_device wdd; + void __iomem *base; + struct clk *clk; +}; + +static inline void imx7ulp_wdt_enable(void __iomem *base, bool enable) +{ + u32 val = readl(base + WDOG_CS); + + writel(UNLOCK, base + WDOG_CNT); + if (enable) + writel(val | WDOG_CS_EN, base + WDOG_CS); + else + writel(val & ~WDOG_CS_EN, base + WDOG_CS); +} + +static inline bool imx7ulp_wdt_is_enabled(void __iomem *base) +{ + u32 val = readl(base + WDOG_CS); + + return val & WDOG_CS_EN; +} + +static int imx7ulp_wdt_ping(struct watchdog_device *wdog) +{ + struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); + + writel(REFRESH, wdt->base + WDOG_CNT); + + return 0; +} + +static int imx7ulp_wdt_start(struct watchdog_device *wdog) +{ + struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); + + imx7ulp_wdt_enable(wdt->base, true); + + return 0; +} + +static int imx7ulp_wdt_stop(struct watchdog_device *wdog) +{ + struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); + + imx7ulp_wdt_enable(wdt->base, false); + + return 0; +} + +static int imx7ulp_wdt_set_timeout(struct watchdog_device *wdog, + unsigned int timeout) +{ + struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); + u32 val = WDOG_CLOCK_RATE * timeout; + + writel(UNLOCK, wdt->base + WDOG_CNT); + writel(val, wdt->base + WDOG_TOVAL); + + wdog->timeout = timeout; + + return 0; +} + +static const struct watchdog_ops imx7ulp_wdt_ops = { + .owner = THIS_MODULE, + .start = imx7ulp_wdt_start, + .stop = imx7ulp_wdt_stop, + .ping = imx7ulp_wdt_ping, + .set_timeout = imx7ulp_wdt_set_timeout, +}; + +static const struct watchdog_info imx7ulp_wdt_info = { + .identity = "i.MX7ULP watchdog timer", + .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | + WDIOF_MAGICCLOSE, +}; + +static inline void imx7ulp_wdt_init(void __iomem *base, unsigned int timeout) +{ + u32 val; + + /* unlock the wdog for reconfiguration */ + writel_relaxed(UNLOCK_SEQ0, base + WDOG_CNT); + writel_relaxed(UNLOCK_SEQ1, base + WDOG_CNT); + + /* set an initial timeout value in TOVAL */ + writel(timeout, base + WDOG_TOVAL); + /* enable 32bit command sequence and reconfigure */ + val = BIT(13) | BIT(8) | BIT(5); + writel(val, base + WDOG_CS); +} + +static void imx7ulp_wdt_action(void *data) +{ + clk_disable_unprepare(data); +} + +static int imx7ulp_wdt_probe(struct platform_device *pdev) +{ + struct imx7ulp_wdt_device *imx7ulp_wdt; + struct device *dev = &pdev->dev; + struct watchdog_device *wdog; + int ret; + + imx7ulp_wdt = devm_kzalloc(dev, sizeof(*imx7ulp_wdt), GFP_KERNEL); + if (!imx7ulp_wdt) + return -ENOMEM; + + platform_set_drvdata(pdev, imx7ulp_wdt); + + imx7ulp_wdt->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(imx7ulp_wdt->base)) + return PTR_ERR(imx7ulp_wdt->base); + + imx7ulp_wdt->clk = devm_clk_get(dev, NULL); + if (IS_ERR(imx7ulp_wdt->clk)) { + dev_err(dev, "Failed to get watchdog clock\n"); + return PTR_ERR(imx7ulp_wdt->clk); + } + + ret = clk_prepare_enable(imx7ulp_wdt->clk); + if (ret) + return ret; + + ret = devm_add_action_or_reset(dev, imx7ulp_wdt_action, imx7ulp_wdt->clk); + if (ret) + return ret; + + wdog = &imx7ulp_wdt->wdd; + wdog->info = &imx7ulp_wdt_info; + wdog->ops = &imx7ulp_wdt_ops; + wdog->min_timeout = 1; + wdog->max_timeout = MAX_TIMEOUT; + wdog->parent = dev; + wdog->timeout = DEFAULT_TIMEOUT; + + watchdog_init_timeout(wdog, 0, dev); + watchdog_stop_on_reboot(wdog); + watchdog_stop_on_unregister(wdog); + watchdog_set_drvdata(wdog, imx7ulp_wdt); + imx7ulp_wdt_init(imx7ulp_wdt->base, wdog->timeout * WDOG_CLOCK_RATE); + + return devm_watchdog_register_device(dev, wdog); +} + +static int __maybe_unused imx7ulp_wdt_suspend(struct device *dev) +{ + struct imx7ulp_wdt_device *imx7ulp_wdt = dev_get_drvdata(dev); + + if (watchdog_active(&imx7ulp_wdt->wdd)) + imx7ulp_wdt_stop(&imx7ulp_wdt->wdd); + + clk_disable_unprepare(imx7ulp_wdt->clk); + + return 0; +} + +static int __maybe_unused imx7ulp_wdt_resume(struct device *dev) +{ + struct imx7ulp_wdt_device *imx7ulp_wdt = dev_get_drvdata(dev); + u32 timeout = imx7ulp_wdt->wdd.timeout * WDOG_CLOCK_RATE; + int ret; + + ret = clk_prepare_enable(imx7ulp_wdt->clk); + if (ret) + return ret; + + if (imx7ulp_wdt_is_enabled(imx7ulp_wdt->base)) + imx7ulp_wdt_init(imx7ulp_wdt->base, timeout); + + if (watchdog_active(&imx7ulp_wdt->wdd)) + imx7ulp_wdt_start(&imx7ulp_wdt->wdd); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(imx7ulp_wdt_pm_ops, imx7ulp_wdt_suspend, + imx7ulp_wdt_resume); + +static const struct of_device_id imx7ulp_wdt_dt_ids[] = { + { .compatible = "fsl,imx7ulp-wdt", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, imx7ulp_wdt_dt_ids); + +static struct platform_driver imx7ulp_wdt_driver = { + .probe = imx7ulp_wdt_probe, + .driver = { + .name = "imx7ulp-wdt", + .pm = &imx7ulp_wdt_pm_ops, + .of_match_table = imx7ulp_wdt_dt_ids, + }, +}; +module_platform_driver(imx7ulp_wdt_driver); + +MODULE_AUTHOR("Anson Huang "); +MODULE_DESCRIPTION("Freescale i.MX7ULP watchdog driver"); +MODULE_LICENSE("GPL v2"); From e07a4c79ca75bf41d73ba74c06f7220cd2741bc9 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Fri, 30 Aug 2019 09:52:24 +1200 Subject: [PATCH 0534/2880] watchdog: orion_wdt: use timer1 as a pretimeout The orion watchdog can either reset the CPU or generate an interrupt. The interrupt would be useful for debugging as it provides panic() output about the watchdog expiry, however if the interrupt is used the watchdog can't reset the CPU in the event of being stuck in a loop with interrupts disabled or if the CPU is prevented from accessing memory (e.g. an unterminated DMA). The Armada SoCs have spare timers that aren't currently used by the Linux kernel. We can use timer1 to provide a pre-timeout ahead of the watchdog timer and provide the possibility of gathering debug before the reset triggers. Signed-off-by: Chris Packham Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190829215224.27956-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/orion_wdt.c | 66 +++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/drivers/watchdog/orion_wdt.c b/drivers/watchdog/orion_wdt.c index cdb0d174c5e2..1cccf8eb1c5d 100644 --- a/drivers/watchdog/orion_wdt.c +++ b/drivers/watchdog/orion_wdt.c @@ -35,7 +35,15 @@ * Watchdog timer block registers. */ #define TIMER_CTRL 0x0000 -#define TIMER_A370_STATUS 0x04 +#define TIMER1_FIXED_ENABLE_BIT BIT(12) +#define WDT_AXP_FIXED_ENABLE_BIT BIT(10) +#define TIMER1_ENABLE_BIT BIT(2) + +#define TIMER_A370_STATUS 0x0004 +#define WDT_A370_EXPIRED BIT(31) +#define TIMER1_STATUS_BIT BIT(8) + +#define TIMER1_VAL_OFF 0x001c #define WDT_MAX_CYCLE_COUNT 0xffffffff @@ -43,9 +51,6 @@ #define WDT_A370_RATIO_SHIFT 5 #define WDT_A370_RATIO (1 << WDT_A370_RATIO_SHIFT) -#define WDT_AXP_FIXED_ENABLE_BIT BIT(10) -#define WDT_A370_EXPIRED BIT(31) - static bool nowayout = WATCHDOG_NOWAYOUT; static int heartbeat = -1; /* module parameter (seconds) */ @@ -158,6 +163,7 @@ static int armadaxp_wdt_clock_init(struct platform_device *pdev, struct orion_watchdog *dev) { int ret; + u32 val; dev->clk = of_clk_get_by_name(pdev->dev.of_node, "fixed"); if (IS_ERR(dev->clk)) @@ -168,10 +174,9 @@ static int armadaxp_wdt_clock_init(struct platform_device *pdev, return ret; } - /* Enable the fixed watchdog clock input */ - atomic_io_modify(dev->reg + TIMER_CTRL, - WDT_AXP_FIXED_ENABLE_BIT, - WDT_AXP_FIXED_ENABLE_BIT); + /* Fix the wdt and timer1 clock freqency to 25MHz */ + val = WDT_AXP_FIXED_ENABLE_BIT | TIMER1_FIXED_ENABLE_BIT; + atomic_io_modify(dev->reg + TIMER_CTRL, val, val); dev->clk_rate = clk_get_rate(dev->clk); return 0; @@ -183,6 +188,10 @@ static int orion_wdt_ping(struct watchdog_device *wdt_dev) /* Reload watchdog duration */ writel(dev->clk_rate * wdt_dev->timeout, dev->reg + dev->data->wdt_counter_offset); + if (dev->wdt.info->options & WDIOF_PRETIMEOUT) + writel(dev->clk_rate * (wdt_dev->timeout - wdt_dev->pretimeout), + dev->reg + TIMER1_VAL_OFF); + return 0; } @@ -194,13 +203,18 @@ static int armada375_start(struct watchdog_device *wdt_dev) /* Set watchdog duration */ writel(dev->clk_rate * wdt_dev->timeout, dev->reg + dev->data->wdt_counter_offset); + if (dev->wdt.info->options & WDIOF_PRETIMEOUT) + writel(dev->clk_rate * (wdt_dev->timeout - wdt_dev->pretimeout), + dev->reg + TIMER1_VAL_OFF); /* Clear the watchdog expiration bit */ atomic_io_modify(dev->reg + TIMER_A370_STATUS, WDT_A370_EXPIRED, 0); /* Enable watchdog timer */ - atomic_io_modify(dev->reg + TIMER_CTRL, dev->data->wdt_enable_bit, - dev->data->wdt_enable_bit); + reg = dev->data->wdt_enable_bit; + if (dev->wdt.info->options & WDIOF_PRETIMEOUT) + reg |= TIMER1_ENABLE_BIT; + atomic_io_modify(dev->reg + TIMER_CTRL, reg, reg); /* Enable reset on watchdog */ reg = readl(dev->rstout); @@ -277,7 +291,7 @@ static int orion_stop(struct watchdog_device *wdt_dev) static int armada375_stop(struct watchdog_device *wdt_dev) { struct orion_watchdog *dev = watchdog_get_drvdata(wdt_dev); - u32 reg; + u32 reg, mask; /* Disable reset on watchdog */ atomic_io_modify(dev->rstout_mask, dev->data->rstout_mask_bit, @@ -287,7 +301,10 @@ static int armada375_stop(struct watchdog_device *wdt_dev) writel(reg, dev->rstout); /* Disable watchdog timer */ - atomic_io_modify(dev->reg + TIMER_CTRL, dev->data->wdt_enable_bit, 0); + mask = dev->data->wdt_enable_bit; + if (wdt_dev->info->options & WDIOF_PRETIMEOUT) + mask |= TIMER1_ENABLE_BIT; + atomic_io_modify(dev->reg + TIMER_CTRL, mask, 0); return 0; } @@ -349,7 +366,7 @@ static unsigned int orion_wdt_get_timeleft(struct watchdog_device *wdt_dev) return readl(dev->reg + dev->data->wdt_counter_offset) / dev->clk_rate; } -static const struct watchdog_info orion_wdt_info = { +static struct watchdog_info orion_wdt_info = { .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE, .identity = "Orion Watchdog", }; @@ -368,6 +385,16 @@ static irqreturn_t orion_wdt_irq(int irq, void *devid) return IRQ_HANDLED; } +static irqreturn_t orion_wdt_pre_irq(int irq, void *devid) +{ + struct orion_watchdog *dev = devid; + + atomic_io_modify(dev->reg + TIMER_A370_STATUS, + TIMER1_STATUS_BIT, 0); + watchdog_notify_pretimeout(&dev->wdt); + return IRQ_HANDLED; +} + /* * The original devicetree binding for this driver specified only * one memory resource, so in order to keep DT backwards compatibility @@ -589,6 +616,19 @@ static int orion_wdt_probe(struct platform_device *pdev) } } + /* Optional 2nd interrupt for pretimeout */ + irq = platform_get_irq(pdev, 1); + if (irq > 0) { + orion_wdt_info.options |= WDIOF_PRETIMEOUT; + ret = devm_request_irq(&pdev->dev, irq, orion_wdt_pre_irq, + 0, pdev->name, dev); + if (ret < 0) { + dev_err(&pdev->dev, "failed to request IRQ\n"); + goto disable_clk; + } + } + + watchdog_set_nowayout(&dev->wdt, nowayout); ret = watchdog_register_device(&dev->wdt); if (ret) From 3d9e89bda9e9f01d55ff72f58d619e77d0c5b248 Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Wed, 28 Aug 2019 13:24:01 +0300 Subject: [PATCH 0535/2880] watchdog: aspeed: add support for dual boot Set WDT_CLEAR_TIMEOUT_AND_BOOT_CODE_SELECTION into WDT_CLEAR_TIMEOUT_STATUS to clear out boot code source and re-enable access to the primary SPI flash chip while booted via wdt2 from the alternate chip. AST2400 datasheet says: "In the 2nd flash booting mode, all the address mapping to CS0# would be re-directed to CS1#. And CS0# is not accessible under this mode. To access CS0#, firmware should clear the 2nd boot mode register in the WDT2 status register WDT30.bit[1]." Signed-off-by: Ivan Mikhaylov Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190828102402.13155-4-i.mikhaylov@yadro.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/aspeed_wdt.c | 65 ++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/aspeed_wdt.c b/drivers/watchdog/aspeed_wdt.c index 5b64bc2e8788..4ec0906bf12c 100644 --- a/drivers/watchdog/aspeed_wdt.c +++ b/drivers/watchdog/aspeed_wdt.c @@ -54,6 +54,8 @@ MODULE_DEVICE_TABLE(of, aspeed_wdt_of_table); #define WDT_CTRL_ENABLE BIT(0) #define WDT_TIMEOUT_STATUS 0x10 #define WDT_TIMEOUT_STATUS_BOOT_SECONDARY BIT(1) +#define WDT_CLEAR_TIMEOUT_STATUS 0x14 +#define WDT_CLEAR_TIMEOUT_AND_BOOT_CODE_SELECTION BIT(0) /* * WDT_RESET_WIDTH controls the characteristics of the external pulse (if @@ -166,6 +168,60 @@ static int aspeed_wdt_restart(struct watchdog_device *wdd, return 0; } +/* access_cs0 shows if cs0 is accessible, hence the reverted bit */ +static ssize_t access_cs0_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct aspeed_wdt *wdt = dev_get_drvdata(dev); + u32 status = readl(wdt->base + WDT_TIMEOUT_STATUS); + + return sprintf(buf, "%u\n", + !(status & WDT_TIMEOUT_STATUS_BOOT_SECONDARY)); +} + +static ssize_t access_cs0_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t size) +{ + struct aspeed_wdt *wdt = dev_get_drvdata(dev); + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val) + writel(WDT_CLEAR_TIMEOUT_AND_BOOT_CODE_SELECTION, + wdt->base + WDT_CLEAR_TIMEOUT_STATUS); + + return size; +} + +/* + * This attribute exists only if the system has booted from the alternate + * flash with 'alt-boot' option. + * + * At alternate flash the 'access_cs0' sysfs node provides: + * ast2400: a way to get access to the primary SPI flash chip at CS0 + * after booting from the alternate chip at CS1. + * ast2500: a way to restore the normal address mapping from + * (CS0->CS1, CS1->CS0) to (CS0->CS0, CS1->CS1). + * + * Clearing the boot code selection and timeout counter also resets to the + * initial state the chip select line mapping. When the SoC is in normal + * mapping state (i.e. booted from CS0), clearing those bits does nothing for + * both versions of the SoC. For alternate boot mode (booted from CS1 due to + * wdt2 expiration) the behavior differs as described above. + * + * This option can be used with wdt2 (watchdog1) only. + */ +static DEVICE_ATTR_RW(access_cs0); + +static struct attribute *bswitch_attrs[] = { + &dev_attr_access_cs0.attr, + NULL +}; +ATTRIBUTE_GROUPS(bswitch); + static const struct watchdog_ops aspeed_wdt_ops = { .start = aspeed_wdt_start, .stop = aspeed_wdt_stop, @@ -308,9 +364,16 @@ static int aspeed_wdt_probe(struct platform_device *pdev) } status = readl(wdt->base + WDT_TIMEOUT_STATUS); - if (status & WDT_TIMEOUT_STATUS_BOOT_SECONDARY) + if (status & WDT_TIMEOUT_STATUS_BOOT_SECONDARY) { wdt->wdd.bootstatus = WDIOF_CARDRESET; + if (of_device_is_compatible(np, "aspeed,ast2400-wdt") || + of_device_is_compatible(np, "aspeed,ast2500-wdt")) + wdt->wdd.groups = bswitch_groups; + } + + dev_set_drvdata(dev, wdt); + return devm_watchdog_register_device(dev, &wdt->wdd); } From ebdc0f5817321cd5a45e76c6653adf7c7b1d0aee Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Wed, 28 Aug 2019 13:24:02 +0300 Subject: [PATCH 0536/2880] watchdog: apseed: Add access_cs0 option for alt-boot The option for the ast2400/2500 to get access to CS0 at runtime. Signed-off-by: Ivan Mikhaylov Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190828102402.13155-5-i.mikhaylov@yadro.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../ABI/testing/sysfs-class-watchdog | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-watchdog b/Documentation/ABI/testing/sysfs-class-watchdog index 6317ade5ad19..675f9b537661 100644 --- a/Documentation/ABI/testing/sysfs-class-watchdog +++ b/Documentation/ABI/testing/sysfs-class-watchdog @@ -72,3 +72,37 @@ Description: It is a read/write file. When read, the currently assigned pretimeout governor is returned. When written, it sets the pretimeout governor. + +What: /sys/class/watchdog/watchdog1/access_cs0 +Date: August 2019 +Contact: Ivan Mikhaylov , + Alexander Amelkin +Description: + It is a read/write file. This attribute exists only if the + system has booted from the alternate flash chip due to + expiration of a watchdog timer of AST2400/AST2500 when + alternate boot function was enabled with 'aspeed,alt-boot' + devicetree option for that watchdog or with an appropriate + h/w strapping (for WDT2 only). + + At alternate flash the 'access_cs0' sysfs node provides: + ast2400: a way to get access to the primary SPI flash + chip at CS0 after booting from the alternate + chip at CS1. + ast2500: a way to restore the normal address mapping + from (CS0->CS1, CS1->CS0) to (CS0->CS0, + CS1->CS1). + + Clearing the boot code selection and timeout counter also + resets to the initial state the chip select line mapping. When + the SoC is in normal mapping state (i.e. booted from CS0), + clearing those bits does nothing for both versions of the SoC. + For alternate boot mode (booted from CS1 due to wdt2 + expiration) the behavior differs as described above. + + This option can be used with wdt2 (watchdog1) only. + + When read, the current status of the boot code selection is + shown. When written with any non-zero value, it clears + the boot code selection and the timeout counter, which results + in chipselect reset for AST2400/AST2500. From 3b7c09fd645ba62b3d6346625db1fcd3803bdd33 Mon Sep 17 00:00:00 2001 From: Oliver Graute Date: Thu, 5 Sep 2019 14:36:49 +0000 Subject: [PATCH 0537/2880] watchdog: imx_sc: this patch just fixes whitespaces Fix only whitespace errors in imx_sc_wdt_probe() Signed-off-by: Oliver Graute Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190905143644.20952-1-oliver.graute@kococonnector.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx_sc_wdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/imx_sc_wdt.c b/drivers/watchdog/imx_sc_wdt.c index 9260475439eb..7ea5cf54e94a 100644 --- a/drivers/watchdog/imx_sc_wdt.c +++ b/drivers/watchdog/imx_sc_wdt.c @@ -176,8 +176,8 @@ static int imx_sc_wdt_probe(struct platform_device *pdev) ret = devm_watchdog_register_device(dev, wdog); if (ret) - return ret; - + return ret; + ret = imx_scu_irq_group_enable(SC_IRQ_GROUP_WDOG, SC_IRQ_WDOG, true); From 36375491a439565402d1cb2cf12955c11f2ed5a6 Mon Sep 17 00:00:00 2001 From: Jorge Ramirez-Ortiz Date: Fri, 6 Sep 2019 22:54:10 +0200 Subject: [PATCH 0538/2880] watchdog: qcom: support pre-timeout when the bark irq is available Use the bark interrupt as the pre-timeout notifier whenever this interrupt is available. By default, the pretimeout notification shall occur one second earlier than the timeout. Signed-off-by: Jorge Ramirez-Ortiz Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190906205411.31666-2-jorge.ramirez-ortiz@linaro.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/qcom-wdt.c | 69 ++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c index 7be7f87be28f..aa22e625144a 100644 --- a/drivers/watchdog/qcom-wdt.c +++ b/drivers/watchdog/qcom-wdt.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2014, The Linux Foundation. All rights reserved. */ +#include #include #include +#include #include #include #include @@ -19,6 +21,9 @@ enum wdt_reg { WDT_BITE_TIME, }; +#define QCOM_WDT_ENABLE BIT(0) +#define QCOM_WDT_ENABLE_IRQ BIT(1) + static const u32 reg_offset_data_apcs_tmr[] = { [WDT_RST] = 0x38, [WDT_EN] = 0x40, @@ -54,15 +59,35 @@ struct qcom_wdt *to_qcom_wdt(struct watchdog_device *wdd) return container_of(wdd, struct qcom_wdt, wdd); } +static inline int qcom_get_enable(struct watchdog_device *wdd) +{ + int enable = QCOM_WDT_ENABLE; + + if (wdd->pretimeout) + enable |= QCOM_WDT_ENABLE_IRQ; + + return enable; +} + +static irqreturn_t qcom_wdt_isr(int irq, void *arg) +{ + struct watchdog_device *wdd = arg; + + watchdog_notify_pretimeout(wdd); + + return IRQ_HANDLED; +} + static int qcom_wdt_start(struct watchdog_device *wdd) { struct qcom_wdt *wdt = to_qcom_wdt(wdd); + unsigned int bark = wdd->timeout - wdd->pretimeout; writel(0, wdt_addr(wdt, WDT_EN)); writel(1, wdt_addr(wdt, WDT_RST)); - writel(wdd->timeout * wdt->rate, wdt_addr(wdt, WDT_BARK_TIME)); + writel(bark * wdt->rate, wdt_addr(wdt, WDT_BARK_TIME)); writel(wdd->timeout * wdt->rate, wdt_addr(wdt, WDT_BITE_TIME)); - writel(1, wdt_addr(wdt, WDT_EN)); + writel(qcom_get_enable(wdd), wdt_addr(wdt, WDT_EN)); return 0; } @@ -89,6 +114,13 @@ static int qcom_wdt_set_timeout(struct watchdog_device *wdd, return qcom_wdt_start(wdd); } +static int qcom_wdt_set_pretimeout(struct watchdog_device *wdd, + unsigned int timeout) +{ + wdd->pretimeout = timeout; + return qcom_wdt_start(wdd); +} + static int qcom_wdt_restart(struct watchdog_device *wdd, unsigned long action, void *data) { @@ -105,7 +137,7 @@ static int qcom_wdt_restart(struct watchdog_device *wdd, unsigned long action, writel(1, wdt_addr(wdt, WDT_RST)); writel(timeout, wdt_addr(wdt, WDT_BARK_TIME)); writel(timeout, wdt_addr(wdt, WDT_BITE_TIME)); - writel(1, wdt_addr(wdt, WDT_EN)); + writel(QCOM_WDT_ENABLE, wdt_addr(wdt, WDT_EN)); /* * Actually make sure the above sequence hits hardware before sleeping. @@ -121,6 +153,7 @@ static const struct watchdog_ops qcom_wdt_ops = { .stop = qcom_wdt_stop, .ping = qcom_wdt_ping, .set_timeout = qcom_wdt_set_timeout, + .set_pretimeout = qcom_wdt_set_pretimeout, .restart = qcom_wdt_restart, .owner = THIS_MODULE, }; @@ -133,6 +166,15 @@ static const struct watchdog_info qcom_wdt_info = { .identity = KBUILD_MODNAME, }; +static const struct watchdog_info qcom_wdt_pt_info = { + .options = WDIOF_KEEPALIVEPING + | WDIOF_MAGICCLOSE + | WDIOF_SETTIMEOUT + | WDIOF_PRETIMEOUT + | WDIOF_CARDRESET, + .identity = KBUILD_MODNAME, +}; + static void qcom_clk_disable_unprepare(void *data) { clk_disable_unprepare(data); @@ -146,7 +188,7 @@ static int qcom_wdt_probe(struct platform_device *pdev) struct device_node *np = dev->of_node; const u32 *regs; u32 percpu_offset; - int ret; + int irq, ret; regs = of_device_get_match_data(dev); if (!regs) { @@ -204,7 +246,24 @@ static int qcom_wdt_probe(struct platform_device *pdev) return -EINVAL; } - wdt->wdd.info = &qcom_wdt_info; + /* check if there is pretimeout support */ + irq = platform_get_irq(pdev, 0); + if (irq > 0) { + ret = devm_request_irq(dev, irq, qcom_wdt_isr, + IRQF_TRIGGER_RISING, + "wdt_bark", &wdt->wdd); + if (ret) + return ret; + + wdt->wdd.info = &qcom_wdt_pt_info; + wdt->wdd.pretimeout = 1; + } else { + if (irq == -EPROBE_DEFER) + return -EPROBE_DEFER; + + wdt->wdd.info = &qcom_wdt_info; + } + wdt->wdd.ops = &qcom_wdt_ops; wdt->wdd.min_timeout = 1; wdt->wdd.max_timeout = 0x10000000U / wdt->rate; From 52a142140e14d30f99b6661bef8812a70a029124 Mon Sep 17 00:00:00 2001 From: Jorge Ramirez-Ortiz Date: Fri, 6 Sep 2019 22:54:11 +0200 Subject: [PATCH 0539/2880] watchdog: qcom: remove unnecessary variable from private storage there is no need to continue keeping the clock in private storage. Signed-off-by: Jorge Ramirez-Ortiz Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190906205411.31666-3-jorge.ramirez-ortiz@linaro.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/qcom-wdt.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c index aa22e625144a..a494543d3ae1 100644 --- a/drivers/watchdog/qcom-wdt.c +++ b/drivers/watchdog/qcom-wdt.c @@ -42,7 +42,6 @@ static const u32 reg_offset_data_kpss[] = { struct qcom_wdt { struct watchdog_device wdd; - struct clk *clk; unsigned long rate; void __iomem *base; const u32 *layout; @@ -189,6 +188,7 @@ static int qcom_wdt_probe(struct platform_device *pdev) const u32 *regs; u32 percpu_offset; int irq, ret; + struct clk *clk; regs = of_device_get_match_data(dev); if (!regs) { @@ -215,19 +215,18 @@ static int qcom_wdt_probe(struct platform_device *pdev) if (IS_ERR(wdt->base)) return PTR_ERR(wdt->base); - wdt->clk = devm_clk_get(dev, NULL); - if (IS_ERR(wdt->clk)) { + clk = devm_clk_get(dev, NULL); + if (IS_ERR(clk)) { dev_err(dev, "failed to get input clock\n"); - return PTR_ERR(wdt->clk); + return PTR_ERR(clk); } - ret = clk_prepare_enable(wdt->clk); + ret = clk_prepare_enable(clk); if (ret) { dev_err(dev, "failed to setup clock\n"); return ret; } - ret = devm_add_action_or_reset(dev, qcom_clk_disable_unprepare, - wdt->clk); + ret = devm_add_action_or_reset(dev, qcom_clk_disable_unprepare, clk); if (ret) return ret; @@ -239,7 +238,7 @@ static int qcom_wdt_probe(struct platform_device *pdev) * that it would bite before a second elapses it's usefulness is * limited. Bail if this is the case. */ - wdt->rate = clk_get_rate(wdt->clk); + wdt->rate = clk_get_rate(clk); if (wdt->rate == 0 || wdt->rate > 0x10000000U) { dev_err(dev, "invalid clock rate\n"); From ca2fc5efffde5a3827adfb0ab6a51b6f1c64d5ff Mon Sep 17 00:00:00 2001 From: Jaret Cantu Date: Thu, 12 Sep 2019 13:55:50 -0400 Subject: [PATCH 0540/2880] watchdog: f71808e_wdt: Add F81803 support This adds watchdog support for the Fintek F81803 Super I/O chip. Testing was done on the Seneca XK-QUAD. Signed-off-by: Jaret Cantu Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190912175550.9340-1-jaret.cantu@timesys.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 4 ++-- drivers/watchdog/f71808e_wdt.c | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index d68e5b500aef..58e7c100b6ad 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1043,8 +1043,8 @@ config F71808E_WDT depends on X86 help This is the driver for the hardware watchdog on the Fintek F71808E, - F71862FG, F71868, F71869, F71882FG, F71889FG, F81865 and F81866 - Super I/O controllers. + F71862FG, F71868, F71869, F71882FG, F71889FG, F81803, F81865, and + F81866 Super I/O controllers. You can compile this driver directly into the kernel, or use it as a module. The module will be called f71808e_wdt. diff --git a/drivers/watchdog/f71808e_wdt.c b/drivers/watchdog/f71808e_wdt.c index ff5cf1b48a4d..e46104c2fd94 100644 --- a/drivers/watchdog/f71808e_wdt.c +++ b/drivers/watchdog/f71808e_wdt.c @@ -31,8 +31,10 @@ #define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */ #define SIO_REG_DEVREV 0x22 /* Device revision */ #define SIO_REG_MANID 0x23 /* Fintek ID (2 bytes) */ +#define SIO_REG_CLOCK_SEL 0x26 /* Clock select */ #define SIO_REG_ROM_ADDR_SEL 0x27 /* ROM address select */ #define SIO_F81866_REG_PORT_SEL 0x27 /* F81866 Multi-Function Register */ +#define SIO_REG_TSI_LEVEL_SEL 0x28 /* TSI Level select */ #define SIO_REG_MFUNCT1 0x29 /* Multi function select 1 */ #define SIO_REG_MFUNCT2 0x2a /* Multi function select 2 */ #define SIO_REG_MFUNCT3 0x2b /* Multi function select 3 */ @@ -49,6 +51,7 @@ #define SIO_F71869A_ID 0x1007 /* Chipset ID */ #define SIO_F71882_ID 0x0541 /* Chipset ID */ #define SIO_F71889_ID 0x0723 /* Chipset ID */ +#define SIO_F81803_ID 0x1210 /* Chipset ID */ #define SIO_F81865_ID 0x0704 /* Chipset ID */ #define SIO_F81866_ID 0x1010 /* Chipset ID */ @@ -108,7 +111,7 @@ MODULE_PARM_DESC(start_withtimeout, "Start watchdog timer on module load with" " given initial timeout. Zero (default) disables this feature."); enum chips { f71808fg, f71858fg, f71862fg, f71868, f71869, f71882fg, f71889fg, - f81865, f81866}; + f81803, f81865, f81866}; static const char *f71808e_names[] = { "f71808fg", @@ -118,6 +121,7 @@ static const char *f71808e_names[] = { "f71869", "f71882fg", "f71889fg", + "f81803", "f81865", "f81866", }; @@ -370,6 +374,14 @@ static int watchdog_start(void) superio_inb(watchdog.sioaddr, SIO_REG_MFUNCT3) & 0xcf); break; + case f81803: + /* Enable TSI Level register bank */ + superio_clear_bit(watchdog.sioaddr, SIO_REG_CLOCK_SEL, 3); + /* Set pin 27 to WDTRST# */ + superio_outb(watchdog.sioaddr, SIO_REG_TSI_LEVEL_SEL, 0x5f & + superio_inb(watchdog.sioaddr, SIO_REG_TSI_LEVEL_SEL)); + break; + case f81865: /* Set pin 70 to WDTRST# */ superio_clear_bit(watchdog.sioaddr, SIO_REG_MFUNCT3, 5); @@ -809,6 +821,9 @@ static int __init f71808e_find(int sioaddr) /* Confirmed (by datasheet) not to have a watchdog. */ err = -ENODEV; goto exit; + case SIO_F81803_ID: + watchdog.type = f81803; + break; case SIO_F81865_ID: watchdog.type = f81865; break; From 029d2c0fd61eac74700fb4ffff36fc63bfff7e5e Mon Sep 17 00:00:00 2001 From: Ilya Pshonkin Date: Tue, 17 Sep 2019 10:49:34 +0300 Subject: [PATCH 0541/2880] ALSA: usb-audio: Add Hiby device family to quirks for native DSD support This patch adds quirk VID ID for Hiby portable players family with native DSD playback support. Signed-off-by: Ilya Pshonkin Cc: Link: https://lore.kernel.org/r/20190917074937.157802-1-ilya.pshonkin@netforce.ua Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 25faf2d3c639..5fd4ccce452d 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1658,6 +1658,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case 0x25ce: /* Mytek devices */ case 0x278b: /* Rotel? */ case 0x2ab6: /* T+A devices */ + case 0xc502: /* HiBy devices */ if (fp->dsd_raw) return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; From dac9f027b1096c5f03ca583e787aac0f852e8f78 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 16 Sep 2019 17:19:35 -0400 Subject: [PATCH 0542/2880] sched/fair: Remove unused cfs_rq_clock_task() function cfs_rq_clock_task() was first introduced and used in: f1b17280efbd ("sched: Maintain runnable averages across throttled periods") Over time its use has been graduately removed by the following commits: d31b1a66cbe0 ("sched/fair: Factorize PELT update") 23127296889f ("sched/fair: Update scale invariance of PELT") Today, there is no single user left, so it can be safely removed. Found via the -Wunused-function build warning. Signed-off-by: Qian Cai Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vincent Guittot Link: https://lkml.kernel.org/r/1568668775-2127-1-git-send-email-cai@lca.pw [ Rewrote the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bbf68c3161..3101c662426d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -749,7 +749,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static void attach_entity_cfs_rq(struct sched_entity *se); /* @@ -4376,15 +4375,6 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } -/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; - - return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; -} - /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4476,7 +4466,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; @@ -5080,11 +5069,6 @@ static inline bool cfs_bandwidth_used(void) return false; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - return rq_clock_task(rq_of(cfs_rq)); -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} From e75f4940e8ad0dd76527302a10c06b58bf7eb590 Mon Sep 17 00:00:00 2001 From: Mihai Serban Date: Fri, 13 Sep 2019 22:28:05 +0300 Subject: [PATCH 0543/2880] ASoC: fsl_sai: Fix noise when using EDMA EDMA requires the period size to be multiple of maxburst. Otherwise the remaining bytes are not transferred and thus noise is produced. We can handle this issue by adding a constraint on SNDRV_PCM_HW_PARAM_PERIOD_SIZE to be multiple of tx/rx maxburst value. Signed-off-by: Mihai Serban Signed-off-by: Daniel Baluta Acked-by: Nicolin Chen Link: https://lore.kernel.org/r/20190913192807.8423-2-daniel.baluta@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_sai.c | 15 +++++++++++++++ sound/soc/fsl/fsl_sai.h | 1 + 2 files changed, 16 insertions(+) diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index ef0b74693093..b517e4bc1b87 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -628,6 +628,16 @@ static int fsl_sai_startup(struct snd_pcm_substream *substream, FSL_SAI_CR3_TRCE_MASK, FSL_SAI_CR3_TRCE); + /* + * EDMA controller needs period size to be a multiple of + * tx/rx maxburst + */ + if (sai->soc_data->use_edma) + snd_pcm_hw_constraint_step(substream->runtime, 0, + SNDRV_PCM_HW_PARAM_PERIOD_SIZE, + tx ? sai->dma_params_tx.maxburst : + sai->dma_params_rx.maxburst); + ret = snd_pcm_hw_constraint_list(substream->runtime, 0, SNDRV_PCM_HW_PARAM_RATE, &fsl_sai_rate_constraints); @@ -1026,30 +1036,35 @@ static int fsl_sai_remove(struct platform_device *pdev) static const struct fsl_sai_soc_data fsl_sai_vf610_data = { .use_imx_pcm = false, + .use_edma = false, .fifo_depth = 32, .reg_offset = 0, }; static const struct fsl_sai_soc_data fsl_sai_imx6sx_data = { .use_imx_pcm = true, + .use_edma = false, .fifo_depth = 32, .reg_offset = 0, }; static const struct fsl_sai_soc_data fsl_sai_imx7ulp_data = { .use_imx_pcm = true, + .use_edma = false, .fifo_depth = 16, .reg_offset = 8, }; static const struct fsl_sai_soc_data fsl_sai_imx8mq_data = { .use_imx_pcm = true, + .use_edma = false, .fifo_depth = 128, .reg_offset = 8, }; static const struct fsl_sai_soc_data fsl_sai_imx8qm_data = { .use_imx_pcm = true, + .use_edma = true, .fifo_depth = 64, .reg_offset = 0, }; diff --git a/sound/soc/fsl/fsl_sai.h b/sound/soc/fsl/fsl_sai.h index b12cb578f6d0..76b15deea80c 100644 --- a/sound/soc/fsl/fsl_sai.h +++ b/sound/soc/fsl/fsl_sai.h @@ -157,6 +157,7 @@ struct fsl_sai_soc_data { bool use_imx_pcm; + bool use_edma; unsigned int fifo_depth; unsigned int reg_offset; }; From a0a4bf57a977ed37bcbdfc8027a44485fe086a3d Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 17 Sep 2019 05:03:53 +0800 Subject: [PATCH 0544/2880] ASoC: core: delete component->card_list in soc_remove_component only We add component->card_list in the end of soc_probe_component(). In other words, component->card_list will not be added if there is an error in the soc_probe_component() function. So we can't delete component->card_list in the error handling of soc_probe_component(). Fixes: 22d1423187e5 ("ASoC: soc-core: add soc_cleanup_component()") Signed-off-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20190916210353.6318-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 35f48e9c5ead..aff4b4bf4d07 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -978,7 +978,6 @@ static void soc_cleanup_component(struct snd_soc_component *component) /* For framework level robustness */ snd_soc_component_set_jack(component, NULL, NULL); - list_del(&component->card_list); snd_soc_dapm_free(snd_soc_component_get_dapm(component)); soc_cleanup_component_debugfs(component); component->card = NULL; @@ -991,7 +990,7 @@ static void soc_remove_component(struct snd_soc_component *component) return; snd_soc_component_remove(component); - + list_del(&component->card_list); soc_cleanup_component(component); } From 131cb1210d4b58acb0695707dad2eb90dcb50a2a Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Tue, 17 Sep 2019 17:40:20 +0200 Subject: [PATCH 0545/2880] regulator: of: fix suspend-min/max-voltage parsing Currently the regulator-suspend-min/max-microvolt must be within the root regulator node but the dt-bindings specifies it as subnode properties for the regulator-state-[mem/disk/standby] node. The only DT using this bindings currently is the at91-sama5d2_xplained.dts and this DT uses it correctly. I don't know if it isn't tested but it can't work without this fix. Fixes: f7efad10b5c4 ("regulator: add PM suspend and resume hooks") Signed-off-by: Marco Felsch Link: https://lore.kernel.org/r/20190917154021.14693-3-m.felsch@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/of_regulator.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 9112faa6a9a0..38dd06fbab38 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -231,12 +231,12 @@ static int of_get_regulation_constraints(struct device *dev, "regulator-off-in-suspend")) suspend_state->enabled = DISABLE_IN_SUSPEND; - if (!of_property_read_u32(np, "regulator-suspend-min-microvolt", - &pval)) + if (!of_property_read_u32(suspend_np, + "regulator-suspend-min-microvolt", &pval)) suspend_state->min_uV = pval; - if (!of_property_read_u32(np, "regulator-suspend-max-microvolt", - &pval)) + if (!of_property_read_u32(suspend_np, + "regulator-suspend-max-microvolt", &pval)) suspend_state->max_uV = pval; if (!of_property_read_u32(suspend_np, From f8970d341eec73c976a3462b9ecdb02b60b84dd6 Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Tue, 17 Sep 2019 17:40:21 +0200 Subject: [PATCH 0546/2880] regulator: core: make regulator_register() EPROBE_DEFER aware Sometimes it can happen that the regulator_of_get_init_data() can't retrieve the config due to a not probed device the regulator depends on. Fix that by checking the return value of of_parse_cb() and return EPROBE_DEFER in such cases. Signed-off-by: Marco Felsch Link: https://lore.kernel.org/r/20190917154021.14693-4-m.felsch@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/core.c | 13 +++++++++++++ drivers/regulator/of_regulator.c | 19 ++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index afe94470b67f..a46be221dbdc 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -5053,6 +5053,19 @@ regulator_register(const struct regulator_desc *regulator_desc, init_data = regulator_of_get_init_data(dev, regulator_desc, config, &rdev->dev.of_node); + + /* + * Sometimes not all resources are probed already so we need to take + * that into account. This happens most the time if the ena_gpiod comes + * from a gpio extender or something else. + */ + if (PTR_ERR(init_data) == -EPROBE_DEFER) { + kfree(config); + kfree(rdev); + ret = -EPROBE_DEFER; + goto rinse; + } + /* * We need to keep track of any GPIO descriptor coming from the * device tree until we have handled it over to the core. If the diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 38dd06fbab38..ef7198b76e50 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -445,11 +445,20 @@ struct regulator_init_data *regulator_of_get_init_data(struct device *dev, goto error; } - if (desc->of_parse_cb && desc->of_parse_cb(child, desc, config)) { - dev_err(dev, - "driver callback failed to parse DT for regulator %pOFn\n", - child); - goto error; + if (desc->of_parse_cb) { + int ret; + + ret = desc->of_parse_cb(child, desc, config); + if (ret) { + if (ret == -EPROBE_DEFER) { + of_node_put(child); + return ERR_PTR(-EPROBE_DEFER); + } + dev_err(dev, + "driver callback failed to parse DT for regulator %pOFn\n", + child); + goto error; + } } *node = child; From e3dffa4f6c3612dea337c9c59191bd418afc941b Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 16 Sep 2019 07:54:34 -0600 Subject: [PATCH 0547/2880] PCI: vmd: Fix config addressing when using bus offsets VMD maps child device config spaces to the VMD Config BAR linearly regardless of the starting bus offset. Because of this, the config address decode must ignore starting bus offsets when mapping the BDF to the config space address. Fixes: 2a5a9c9a20f9 ("PCI: vmd: Add offset to bus numbers if necessary") Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # v5.2+ --- drivers/pci/controller/vmd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 4575e0c6dc4b..2e4da3f51d6b 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -94,6 +94,7 @@ struct vmd_dev { struct resource resources[3]; struct irq_domain *irq_domain; struct pci_bus *bus; + u8 busn_start; struct dma_map_ops dma_ops; struct dma_domain dma_domain; @@ -440,7 +441,8 @@ static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { char __iomem *addr = vmd->cfgbar + - (bus->number << 20) + (devfn << 12) + reg; + ((bus->number - vmd->busn_start) << 20) + + (devfn << 12) + reg; if ((addr - vmd->cfgbar) + len >= resource_size(&vmd->dev->resource[VMD_CFGBAR])) @@ -563,7 +565,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) unsigned long flags; LIST_HEAD(resources); resource_size_t offset[2] = {0}; - resource_size_t membar2_offset = 0x2000, busn_start = 0; + resource_size_t membar2_offset = 0x2000; struct pci_bus *child; /* @@ -606,14 +608,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_read_config_dword(vmd->dev, PCI_REG_VMCONFIG, &vmconfig); if (BUS_RESTRICT_CAP(vmcap) && (BUS_RESTRICT_CFG(vmconfig) == 0x1)) - busn_start = 128; + vmd->busn_start = 128; } res = &vmd->dev->resource[VMD_CFGBAR]; vmd->resources[0] = (struct resource) { .name = "VMD CFGBAR", - .start = busn_start, - .end = busn_start + (resource_size(res) >> 20) - 1, + .start = vmd->busn_start, + .end = vmd->busn_start + (resource_size(res) >> 20) - 1, .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, }; @@ -681,8 +683,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]); pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]); - vmd->bus = pci_create_root_bus(&vmd->dev->dev, busn_start, &vmd_ops, - sd, &resources); + vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start, + &vmd_ops, sd, &resources); if (!vmd->bus) { pci_free_resource_list(&resources); irq_domain_remove(vmd->irq_domain); From a1a30170138c9c5157bd514ccd4d76b47060f29b Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 16 Sep 2019 07:54:35 -0600 Subject: [PATCH 0548/2880] PCI: vmd: Fix shadow offsets to reflect spec changes The shadow offset scratchpad was moved to 0x2000-0x2010. Update the location to get the correct shadow offset. Fixes: 6788958e4f3c ("PCI: vmd: Assign membar addresses from shadow registers") Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # v5.2+ --- drivers/pci/controller/vmd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 2e4da3f51d6b..a35d3f3996d7 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -31,6 +31,9 @@ #define PCI_REG_VMLOCK 0x70 #define MB2_SHADOW_EN(vmlock) (vmlock & 0x2) +#define MB2_SHADOW_OFFSET 0x2000 +#define MB2_SHADOW_SIZE 16 + enum vmd_features { /* * Device may contain registers which hint the physical location of the @@ -578,7 +581,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) u32 vmlock; int ret; - membar2_offset = 0x2018; + membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, &vmlock); if (ret || vmlock == ~0) return -ENODEV; @@ -590,9 +593,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) if (!membar2) return -ENOMEM; offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - readq(membar2 + 0x2008); + readq(membar2 + MB2_SHADOW_OFFSET); offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - readq(membar2 + 0x2010); + readq(membar2 + MB2_SHADOW_OFFSET + 8); pci_iounmap(vmd->dev, membar2); } } From 59b263620c2102171a85f4518d65a571c352ab9a Mon Sep 17 00:00:00 2001 From: Roman Li Date: Wed, 4 Sep 2019 17:23:11 -0400 Subject: [PATCH 0549/2880] drm/amd/display: Add stereo mux and dig programming calls for dcn21 [Why] The earlier patch "Hook up calls to do stereo mux and dig programming..." doesn't include update for dcn21. [How] Align dcn21 gpio settings with updated stereo control interface. Signed-off-by: Roman Li Acked-by: Aaron Liu Signed-off-by: Alex Deucher --- .../display/dc/gpio/dcn21/hw_factory_dcn21.c | 38 +++++++++++++++++-- .../dc/gpio/dcn21/hw_translate_dcn21.c | 3 +- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_factory_dcn21.c b/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_factory_dcn21.c index 34485d9de78a..8572678f8d4f 100644 --- a/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_factory_dcn21.c +++ b/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_factory_dcn21.c @@ -35,12 +35,10 @@ #include "hw_factory_dcn21.h" - #include "dcn/dcn_2_1_0_offset.h" #include "dcn/dcn_2_1_0_sh_mask.h" #include "renoir_ip_offset.h" - #include "reg_helper.h" #include "../hpd_regs.h" /* begin ********************* @@ -136,6 +134,39 @@ static const struct ddc_sh_mask ddc_mask[] = { DDC_MASK_SH_LIST_DCN2(_MASK, 6) }; +#include "../generic_regs.h" + +/* set field name */ +#define SF_GENERIC(reg_name, field_name, post_fix)\ + .field_name = reg_name ## __ ## field_name ## post_fix + +#define generic_regs(id) \ +{\ + GENERIC_REG_LIST(id)\ +} + +static const struct generic_registers generic_regs[] = { + generic_regs(A), +}; + +static const struct generic_sh_mask generic_shift[] = { + GENERIC_MASK_SH_LIST(__SHIFT, A), +}; + +static const struct generic_sh_mask generic_mask[] = { + GENERIC_MASK_SH_LIST(_MASK, A), +}; + +static void define_generic_registers(struct hw_gpio_pin *pin, uint32_t en) +{ + struct hw_generic *generic = HW_GENERIC_FROM_BASE(pin); + + generic->regs = &generic_regs[en]; + generic->shifts = &generic_shift[en]; + generic->masks = &generic_mask[en]; + generic->base.regs = &generic_regs[en].gpio; +} + static void define_ddc_registers( struct hw_gpio_pin *pin, uint32_t en) @@ -181,7 +212,8 @@ static const struct hw_factory_funcs funcs = { .get_hpd_pin = dal_hw_hpd_get_pin, .get_generic_pin = dal_hw_generic_get_pin, .define_hpd_registers = define_hpd_registers, - .define_ddc_registers = define_ddc_registers + .define_ddc_registers = define_ddc_registers, + .define_generic_registers = define_generic_registers }; /* * dal_hw_factory_dcn10_init diff --git a/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_translate_dcn21.c b/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_translate_dcn21.c index ad7c43746291..fbb58fb8c318 100644 --- a/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_translate_dcn21.c +++ b/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_translate_dcn21.c @@ -58,7 +58,6 @@ #define SF_HPD(reg_name, field_name, post_fix)\ .field_name = reg_name ## __ ## field_name ## post_fix - /* macros to expend register list macro defined in HW object header file * end *********************/ @@ -71,7 +70,7 @@ static bool offset_to_id( { switch (offset) { /* GENERIC */ - case REG(DC_GENERICA): + case REG(DC_GPIO_GENERIC_A): *id = GPIO_ID_GENERIC; switch (mask) { case DC_GPIO_GENERIC_A__DC_GPIO_GENERICA_A_MASK: From 5813f97a5969bf1e7e723397a74e00b5de7278d6 Mon Sep 17 00:00:00 2001 From: Aaron Liu Date: Wed, 4 Sep 2019 11:47:48 +0800 Subject: [PATCH 0550/2880] drm/amdgpu: disable stutter mode for renoir With stutter mode enabled, NMI prints frequently. Disable stutter for the moment because NMI warning storm, and will enable it back till the issue is addressed Signed-off-by: Aaron Liu Acked-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index e1b09bb432bd..9590ca552a28 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2384,6 +2384,8 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev) if (adev->asic_type != CHIP_CARRIZO && adev->asic_type != CHIP_STONEY) dm->dc->debug.disable_stutter = amdgpu_pp_feature_mask & PP_STUTTER_MODE ? false : true; + if (adev->asic_type == CHIP_RENOIR) + dm->dc->debug.disable_stutter = true; return 0; fail: From cc204d01262a69218b2d0db5cdea371de85871d9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Sep 2019 13:01:35 -0400 Subject: [PATCH 0551/2880] SUNRPC: Dequeue the request from the receive queue while we're re-encoding Ensure that we dequeue the request from the transport receive queue while we're re-encoding to prevent issues like use-after-free when we release the bvec. Fixes: 7536908982047 ("SUNRPC: Ensure the bvecs are reset when we re-encode...") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/clnt.c | 6 ++--- net/sunrpc/xprt.c | 54 +++++++++++++++++++++---------------- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 13e108bcc9eb..d783e15ba898 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -352,6 +352,7 @@ bool xprt_prepare_transmit(struct rpc_task *task); void xprt_request_enqueue_transmit(struct rpc_task *task); void xprt_request_enqueue_receive(struct rpc_task *task); void xprt_request_wait_receive(struct rpc_task *task); +void xprt_request_dequeue_xprt(struct rpc_task *task); bool xprt_request_need_retransmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); void xprt_end_transmit(struct rpc_task *task); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d8679b6027e9..0359466947e2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1862,6 +1862,7 @@ rpc_xdr_encode(struct rpc_task *task) req->rq_rbuffer, req->rq_rcvsize); + req->rq_reply_bytes_recvd = 0; req->rq_snd_buf.head[0].iov_len = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, req->rq_snd_buf.head[0].iov_base, req); @@ -1881,6 +1882,8 @@ call_encode(struct rpc_task *task) if (!rpc_task_need_encode(task)) goto out; dprint_status(task); + /* Dequeue task from the receive queue while we're encoding */ + xprt_request_dequeue_xprt(task); /* Encode here so that rpcsec_gss can use correct sequence number. */ rpc_xdr_encode(task); /* Did the encode result in an error condition? */ @@ -2501,9 +2504,6 @@ call_decode(struct rpc_task *task) return; case -EAGAIN: task->tk_status = 0; - xdr_free_bvec(&req->rq_rcv_buf); - req->rq_reply_bytes_recvd = 0; - req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 783748dc5e6f..02d5b2125c07 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1323,6 +1323,36 @@ xprt_request_dequeue_transmit(struct rpc_task *task) spin_unlock(&xprt->queue_lock); } +/** + * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue + * @task: pointer to rpc_task + * + * Remove a task from the transmit and receive queues, and ensure that + * it is not pinned by the receive work item. + */ +void +xprt_request_dequeue_xprt(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || + test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || + xprt_is_pinned_rqst(req)) { + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); + while (xprt_is_pinned_rqst(req)) { + set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + xprt_wait_on_pinned_rqst(req); + spin_lock(&xprt->queue_lock); + clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + } + spin_unlock(&xprt->queue_lock); + } +} + /** * xprt_request_prepare - prepare an encoded request for transport * @req: pointer to rpc_rqst @@ -1754,28 +1784,6 @@ void xprt_retry_reserve(struct rpc_task *task) xprt_do_reserve(xprt, task); } -static void -xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req) -{ - struct rpc_xprt *xprt = req->rq_xprt; - - if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || - test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || - xprt_is_pinned_rqst(req)) { - spin_lock(&xprt->queue_lock); - xprt_request_dequeue_transmit_locked(task); - xprt_request_dequeue_receive_locked(task); - while (xprt_is_pinned_rqst(req)) { - set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); - spin_unlock(&xprt->queue_lock); - xprt_wait_on_pinned_rqst(req); - spin_lock(&xprt->queue_lock); - clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); - } - spin_unlock(&xprt->queue_lock); - } -} - /** * xprt_release - release an RPC request slot * @task: task which is finished with the slot @@ -1795,7 +1803,7 @@ void xprt_release(struct rpc_task *task) } xprt = req->rq_xprt; - xprt_request_dequeue_all(task, req); + xprt_request_dequeue_xprt(task); spin_lock(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) From 45835a63d039fc3bfb1d6c72cedaf785cd920e4a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 12 Sep 2019 08:04:25 -0400 Subject: [PATCH 0552/2880] SUNRPC: Don't receive TCP data into a request buffer that has been reset If we've removed the request from the receive list, and have added it back after resetting the request receive buffer, then we should only receive message data if it is a new reply (i.e. if transport->recv.copied is zero). Fixes: 277e4ab7d530b ("SUNRPC: Simplify TCP receive code by switching...") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e2176c167a57..9ac88722fa83 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -562,10 +562,14 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) printk(KERN_WARNING "Callback slot table overflowed\n"); return -ESHUTDOWN; } + if (transport->recv.copied && !req->rq_private_buf.len) + return -ESHUTDOWN; ret = xs_read_stream_request(transport, msg, flags, req); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_bc_request(req, transport->recv.copied); + else + req->rq_private_buf.len = transport->recv.copied; return ret; } @@ -587,7 +591,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) /* Look up and lock the request corresponding to the given XID */ spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, transport->recv.xid); - if (!req) { + if (!req || (transport->recv.copied && !req->rq_private_buf.len)) { msg->msg_flags |= MSG_TRUNC; goto out; } @@ -599,6 +603,8 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) spin_lock(&xprt->queue_lock); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_rqst(req->rq_task, transport->recv.copied); + else + req->rq_private_buf.len = transport->recv.copied; xprt_unpin_rqst(req); out: spin_unlock(&xprt->queue_lock); From 714fbc73888f59321854e7f6c2f224213923bcad Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 12 Sep 2019 08:06:51 -0400 Subject: [PATCH 0553/2880] SUNRPC: RPC level errors should always set task->tk_rpc_status Ensure that we set task->tk_rpc_status for all RPC level errors so that the caller can distinguish between those and server reply status errors. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 6 +++--- net/sunrpc/sched.c | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0359466947e2..44d5cf318813 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1837,7 +1837,7 @@ call_allocate(struct rpc_task *task) return; } - rpc_exit(task, -ERESTARTSYS); + rpc_call_rpcerror(task, -ERESTARTSYS); } static int @@ -2544,7 +2544,7 @@ rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr) return 0; out_fail: trace_rpc_bad_callhdr(task); - rpc_exit(task, error); + rpc_call_rpcerror(task, error); return error; } @@ -2611,7 +2611,7 @@ out_garbage: return -EAGAIN; } out_err: - rpc_exit(task, error); + rpc_call_rpcerror(task, error); return error; out_unparsable: diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f25c4b9ba185..360afe153193 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -911,8 +911,10 @@ static void __rpc_execute(struct rpc_task *task) /* * Signalled tasks should exit rather than sleep. */ - if (RPC_SIGNALLED(task)) + if (RPC_SIGNALLED(task)) { + task->tk_rpc_status = -ERESTARTSYS; rpc_exit(task, -ERESTARTSYS); + } /* * The queue->lock protects against races with @@ -948,6 +950,7 @@ static void __rpc_execute(struct rpc_task *task) */ dprintk("RPC: %5u got signal\n", task->tk_pid); set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); + task->tk_rpc_status = -ERESTARTSYS; rpc_exit(task, -ERESTARTSYS); } dprintk("RPC: %5u sync task resuming\n", task->tk_pid); From f79e06bd44e5be98b75f71ac0cde6c16be278180 Mon Sep 17 00:00:00 2001 From: Aaron Liu Date: Wed, 4 Sep 2019 13:21:42 +0800 Subject: [PATCH 0554/2880] drm/amd/display: update renoir_ip_offset.h This patch updates MP1_BASE in renoir_ip_offset.h Signed-off-by: Aaron Liu Acked-by: Roman Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/include/renoir_ip_offset.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/include/renoir_ip_offset.h b/drivers/gpu/drm/amd/include/renoir_ip_offset.h index 554714c8e000..094648cac392 100644 --- a/drivers/gpu/drm/amd/include/renoir_ip_offset.h +++ b/drivers/gpu/drm/amd/include/renoir_ip_offset.h @@ -155,7 +155,7 @@ static const struct IP_BASE MP0_BASE ={ { { { 0x00016000, 0x0243FC00, 0x00DC0000 { { 0, 0, 0, 0, 0 } }, { { 0, 0, 0, 0, 0 } }, { { 0, 0, 0, 0, 0 } } } }; -static const struct IP_BASE MP1_BASE ={ { { { 0x00016200, 0x02400400, 0x00E80000, 0x00EC0000, 0x00F00000 } }, +static const struct IP_BASE MP1_BASE ={ { { { 0x00016000, 0x02400400, 0x00E80000, 0x00EC0000, 0x00F00000 } }, { { 0, 0, 0, 0, 0 } }, { { 0, 0, 0, 0, 0 } }, { { 0, 0, 0, 0, 0 } }, From 1963b7c3beda608501deaf9010b53ec6cb6adbf1 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 3 Sep 2019 16:47:40 -0400 Subject: [PATCH 0555/2880] drm/amdgpu: Add smu lock around in pp_smu_i2c_bus_access Protect from concurrent SMU accesses. Signed-off-by: Andrey Grodzovsky Reviewed-by: Tao Zhou Reviewed-and-tested-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amd_powerplay.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c index fa636cb462c1..fa8ad7db2b3a 100644 --- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c +++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c @@ -1531,6 +1531,7 @@ static int pp_asic_reset_mode_2(void *handle) static int pp_smu_i2c_bus_access(void *handle, bool acquire) { struct pp_hwmgr *hwmgr = handle; + int ret = 0; if (!hwmgr || !hwmgr->pm_en) return -EINVAL; @@ -1540,7 +1541,11 @@ static int pp_smu_i2c_bus_access(void *handle, bool acquire) return -EINVAL; } - return hwmgr->hwmgr_func->smu_i2c_bus_access(hwmgr, acquire); + mutex_lock(&hwmgr->smu_lock); + ret = hwmgr->hwmgr_func->smu_i2c_bus_access(hwmgr, acquire); + mutex_unlock(&hwmgr->smu_lock); + + return ret; } static const struct amd_pm_funcs pp_dpm_funcs = { From 103efdc1eaed0579e13d1778372575561b6a6ad9 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 4 Sep 2019 11:16:24 -0400 Subject: [PATCH 0556/2880] drm/amdgpu: Remove clock gating restore. Restoring clock gating break SMU opeartion afterwards, avoid this until this further invistigated with SMU. Signed-off-by: Andrey Grodzovsky Reviewed-by: Tao Zhou Reviewed-and-tested-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c b/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c index 4a5951036927..c44723c267c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c +++ b/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c @@ -493,7 +493,15 @@ static void smu_v11_0_i2c_fini(struct i2c_adapter *control) } /* Restore clock gating */ - smu_v11_0_i2c_set_clock_gating(control, true); + + /* + * TODO Reenabling clock gating seems to break subsequent SMU operation + * on the I2C bus. My guess is that SMU doesn't disable clock gating like + * we do here before working with the bus. So for now just don't restore + * it but later work with SMU to see if they have this issue and can + * update their code appropriately + */ + /* smu_v11_0_i2c_set_clock_gating(control, true); */ } From df794f679bbaf3b8848b4d726bffc49653fa4cb6 Mon Sep 17 00:00:00 2001 From: Aaron Liu Date: Mon, 16 Sep 2019 09:26:28 +0800 Subject: [PATCH 0557/2880] drm/amdgpu: remove program of lbpw for renoir These is no LBPW on Renoir. So removing program of lbpw for renoir. Signed-off-by: Aaron Liu Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 83d45f98a461..dcadc73bffd2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -1650,7 +1650,6 @@ static int gfx_v9_0_rlc_init(struct amdgpu_device *adev) switch (adev->asic_type) { case CHIP_RAVEN: - case CHIP_RENOIR: gfx_v9_0_init_lbpw(adev); break; case CHIP_VEGA20: @@ -3026,7 +3025,6 @@ static int gfx_v9_0_rlc_resume(struct amdgpu_device *adev) switch (adev->asic_type) { case CHIP_RAVEN: - case CHIP_RENOIR: if (amdgpu_lbpw == 0) gfx_v9_0_enable_lbpw(adev, false); else From c46e5df4ac898108da66a880c4e18f69c74f6c1b Mon Sep 17 00:00:00 2001 From: Charlene Liu Date: Tue, 20 Aug 2019 20:33:46 -0400 Subject: [PATCH 0558/2880] drm/amd/display: dce11.x /dce12 update formula input [Description] 1. OUTSTANDING_REQUEST_LIMIT update from 0xFF to 0x1F (HW doc update) 2. using memory type to convert UMC's MCLK to Yclk. Signed-off-by: Charlene Liu Reviewed-by: Dmytro Laktyushkin Acked-by: Bhawanpreet Lakha Signed-off-by: Alex Deucher --- .../display/dc/clk_mgr/dce110/dce110_clk_mgr.c | 7 +++++-- .../gpu/drm/amd/display/dc/dce/dce_mem_input.c | 4 ++-- .../drm/amd/display/dc/dce112/dce112_resource.c | 16 ++++++++++------ .../drm/amd/display/dc/dce120/dce120_resource.c | 11 ++++++++--- drivers/gpu/drm/amd/display/dc/inc/resource.h | 2 ++ 5 files changed, 27 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c index 5cc3acccda2a..ee32d2c19305 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c @@ -98,11 +98,14 @@ uint32_t dce110_get_min_vblank_time_us(const struct dc_state *context) struct dc_stream_state *stream = context->streams[j]; uint32_t vertical_blank_in_pixels = 0; uint32_t vertical_blank_time = 0; + uint32_t vertical_total_min = stream->timing.v_total; + struct dc_crtc_timing_adjust adjust = stream->adjust; + if (adjust.v_total_max != adjust.v_total_min) + vertical_total_min = adjust.v_total_min; vertical_blank_in_pixels = stream->timing.h_total * - (stream->timing.v_total + (vertical_total_min - stream->timing.v_addressable); - vertical_blank_time = vertical_blank_in_pixels * 10000 / stream->timing.pix_clk_100hz; diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_mem_input.c b/drivers/gpu/drm/amd/display/dc/dce/dce_mem_input.c index 1488ffddf4e3..31b698bf9cfc 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_mem_input.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_mem_input.c @@ -148,7 +148,7 @@ static void dce_mi_program_pte_vm( pte->min_pte_before_flip_horiz_scan; REG_UPDATE(GRPH_PIPE_OUTSTANDING_REQUEST_LIMIT, - GRPH_PIPE_OUTSTANDING_REQUEST_LIMIT, 0xff); + GRPH_PIPE_OUTSTANDING_REQUEST_LIMIT, 0x7f); REG_UPDATE_3(DVMM_PTE_CONTROL, DVMM_PAGE_WIDTH, page_width, @@ -157,7 +157,7 @@ static void dce_mi_program_pte_vm( REG_UPDATE_2(DVMM_PTE_ARB_CONTROL, DVMM_PTE_REQ_PER_CHUNK, pte->pte_req_per_chunk, - DVMM_MAX_PTE_REQ_OUTSTANDING, 0xff); + DVMM_MAX_PTE_REQ_OUTSTANDING, 0x7f); } static void program_urgency_watermark( diff --git a/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c b/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c index 3ac4c7e73050..2b3a2917c168 100644 --- a/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c @@ -987,6 +987,10 @@ static void bw_calcs_data_update_from_pplib(struct dc *dc) struct dm_pp_clock_levels_with_latency mem_clks = {0}; struct dm_pp_wm_sets_with_clock_ranges clk_ranges = {0}; struct dm_pp_clock_levels clks = {0}; + int memory_type_multiplier = MEMORY_TYPE_MULTIPLIER_CZ; + + if (dc->bw_vbios && dc->bw_vbios->memory_type == bw_def_hbm) + memory_type_multiplier = MEMORY_TYPE_HBM; /*do system clock TODO PPLIB: after PPLIB implement, * then remove old way @@ -1026,12 +1030,12 @@ static void bw_calcs_data_update_from_pplib(struct dc *dc) &clks); dc->bw_vbios->low_yclk = bw_frc_to_fixed( - clks.clocks_in_khz[0] * MEMORY_TYPE_MULTIPLIER_CZ, 1000); + clks.clocks_in_khz[0] * memory_type_multiplier, 1000); dc->bw_vbios->mid_yclk = bw_frc_to_fixed( - clks.clocks_in_khz[clks.num_levels>>1] * MEMORY_TYPE_MULTIPLIER_CZ, + clks.clocks_in_khz[clks.num_levels>>1] * memory_type_multiplier, 1000); dc->bw_vbios->high_yclk = bw_frc_to_fixed( - clks.clocks_in_khz[clks.num_levels-1] * MEMORY_TYPE_MULTIPLIER_CZ, + clks.clocks_in_khz[clks.num_levels-1] * memory_type_multiplier, 1000); return; @@ -1067,12 +1071,12 @@ static void bw_calcs_data_update_from_pplib(struct dc *dc) * YCLK = UMACLK*m_memoryTypeMultiplier */ dc->bw_vbios->low_yclk = bw_frc_to_fixed( - mem_clks.data[0].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ, 1000); + mem_clks.data[0].clocks_in_khz * memory_type_multiplier, 1000); dc->bw_vbios->mid_yclk = bw_frc_to_fixed( - mem_clks.data[mem_clks.num_levels>>1].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ, + mem_clks.data[mem_clks.num_levels>>1].clocks_in_khz * memory_type_multiplier, 1000); dc->bw_vbios->high_yclk = bw_frc_to_fixed( - mem_clks.data[mem_clks.num_levels-1].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ, + mem_clks.data[mem_clks.num_levels-1].clocks_in_khz * memory_type_multiplier, 1000); /* Now notify PPLib/SMU about which Watermarks sets they should select diff --git a/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c b/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c index 7d08154e9662..236c4c0324b1 100644 --- a/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c @@ -847,6 +847,8 @@ static void bw_calcs_data_update_from_pplib(struct dc *dc) int i; unsigned int clk; unsigned int latency; + /*original logic in dal3*/ + int memory_type_multiplier = MEMORY_TYPE_MULTIPLIER_CZ; /*do system clock*/ if (!dm_pp_get_clock_levels_by_type_with_latency( @@ -905,13 +907,16 @@ static void bw_calcs_data_update_from_pplib(struct dc *dc) * ALSO always convert UMA clock (from PPLIB) to YCLK (HW formula): * YCLK = UMACLK*m_memoryTypeMultiplier */ + if (dc->bw_vbios->memory_type == bw_def_hbm) + memory_type_multiplier = MEMORY_TYPE_HBM; + dc->bw_vbios->low_yclk = bw_frc_to_fixed( - mem_clks.data[0].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ, 1000); + mem_clks.data[0].clocks_in_khz * memory_type_multiplier, 1000); dc->bw_vbios->mid_yclk = bw_frc_to_fixed( - mem_clks.data[mem_clks.num_levels>>1].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ, + mem_clks.data[mem_clks.num_levels>>1].clocks_in_khz * memory_type_multiplier, 1000); dc->bw_vbios->high_yclk = bw_frc_to_fixed( - mem_clks.data[mem_clks.num_levels-1].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ, + mem_clks.data[mem_clks.num_levels-1].clocks_in_khz * memory_type_multiplier, 1000); /* Now notify PPLib/SMU about which Watermarks sets they should select diff --git a/drivers/gpu/drm/amd/display/dc/inc/resource.h b/drivers/gpu/drm/amd/display/dc/inc/resource.h index 1cc1c8ce633b..bef224bf803e 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/resource.h +++ b/drivers/gpu/drm/amd/display/dc/inc/resource.h @@ -31,6 +31,8 @@ #include "dm_pp_smu.h" #define MEMORY_TYPE_MULTIPLIER_CZ 4 +#define MEMORY_TYPE_HBM 2 + enum dce_version resource_parse_asic_id( struct hw_asic_id asic_id); From c02d6a161395dfc0c2fdabb9e976a229017288d8 Mon Sep 17 00:00:00 2001 From: Zhan Liu Date: Thu, 22 Aug 2019 14:54:18 -0400 Subject: [PATCH 0559/2880] drm/amd/display: Add missing HBM support and raise Vega20's uclk. [Why] When more than 2 displays are connected to the graphics card, only the minimum memory clock is needed. However, when more displays are connected, the minimum memory clock is not sufficient enough to support the overwhelming bandwidth. System will hang under this circumstance. Also, the old code didn't address HBM cards, which has 2 pseudo channels. We need to add the HBM part here. [How] When graphics card connects to 2 or more displays, switch to high memory clock. Also, choose memory multiplier based on whether its regular DRAM or HBM. Signed-off-by: Zhan Liu Reviewed-by: Roman Li Acked-by: Leo Li Signed-off-by: Alex Deucher --- .../display/dc/clk_mgr/dce110/dce110_clk_mgr.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c index ee32d2c19305..36277bca0326 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c @@ -174,6 +174,10 @@ void dce11_pplib_apply_display_requirements( struct dc_state *context) { struct dm_pp_display_configuration *pp_display_cfg = &context->pp_display_cfg; + int memory_type_multiplier = MEMORY_TYPE_MULTIPLIER_CZ; + + if (dc->bw_vbios && dc->bw_vbios->memory_type == bw_def_hbm) + memory_type_multiplier = MEMORY_TYPE_HBM; pp_display_cfg->all_displays_in_sync = context->bw_ctx.bw.dce.all_displays_in_sync; @@ -186,8 +190,18 @@ void dce11_pplib_apply_display_requirements( pp_display_cfg->cpu_pstate_separation_time = context->bw_ctx.bw.dce.blackout_recovery_time_us; - pp_display_cfg->min_memory_clock_khz = context->bw_ctx.bw.dce.yclk_khz - / MEMORY_TYPE_MULTIPLIER_CZ; + /* + * TODO: determine whether the bandwidth has reached memory's limitation + * , then change minimum memory clock based on real-time bandwidth + * limitation. + */ + if (ASICREV_IS_VEGA20_P(dc->ctx->asic_id.hw_internal_rev) && (context->stream_count >= 2)) { + pp_display_cfg->min_memory_clock_khz = max(pp_display_cfg->min_memory_clock_khz, + (uint32_t) (dc->bw_vbios->high_yclk.value / memory_type_multiplier / 10000)); + } else { + pp_display_cfg->min_memory_clock_khz = context->bw_ctx.bw.dce.yclk_khz + / memory_type_multiplier; + } pp_display_cfg->min_engine_clock_khz = determine_sclk_from_bounding_box( dc, From 9dbc88d013b79c62bd845cb9e7c0256e660967c5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 7 Sep 2019 22:32:38 +0200 Subject: [PATCH 0560/2880] drm/radeon: Bail earlier when radeon.cik_/si_support=0 is passed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bail from the pci_driver probe function instead of from the drm_driver load function. This avoid /dev/dri/card0 temporarily getting registered and then unregistered again, sending unwanted add / remove udev events to userspace. Specifically this avoids triggering the (userspace) bug fixed by this plymouth merge-request: https://gitlab.freedesktop.org/plymouth/plymouth/merge_requests/59 Note that despite that being an userspace bug, not sending unnecessary udev events is a good idea in general. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1490490 Reviewed-by: Michel Dänzer Signed-off-by: Hans de Goede Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_drv.c | 31 +++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_kms.c | 25 ----------------------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index 5838162f687f..4267cb55bc33 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -323,8 +323,39 @@ bool radeon_device_is_virtual(void); static int radeon_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { + unsigned long flags = 0; int ret; + if (!ent) + return -ENODEV; /* Avoid NULL-ptr deref in drm_get_pci_dev */ + + flags = ent->driver_data; + + if (!radeon_si_support) { + switch (flags & RADEON_FAMILY_MASK) { + case CHIP_TAHITI: + case CHIP_PITCAIRN: + case CHIP_VERDE: + case CHIP_OLAND: + case CHIP_HAINAN: + dev_info(&pdev->dev, + "SI support disabled by module param\n"); + return -ENODEV; + } + } + if (!radeon_cik_support) { + switch (flags & RADEON_FAMILY_MASK) { + case CHIP_KAVERI: + case CHIP_BONAIRE: + case CHIP_HAWAII: + case CHIP_KABINI: + case CHIP_MULLINS: + dev_info(&pdev->dev, + "CIK support disabled by module param\n"); + return -ENODEV; + } + } + if (vga_switcheroo_client_probe_defer(pdev)) return -EPROBE_DEFER; diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 07f7ace42c4b..e85c554eeaa9 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -100,31 +100,6 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags) struct radeon_device *rdev; int r, acpi_status; - if (!radeon_si_support) { - switch (flags & RADEON_FAMILY_MASK) { - case CHIP_TAHITI: - case CHIP_PITCAIRN: - case CHIP_VERDE: - case CHIP_OLAND: - case CHIP_HAINAN: - dev_info(dev->dev, - "SI support disabled by module param\n"); - return -ENODEV; - } - } - if (!radeon_cik_support) { - switch (flags & RADEON_FAMILY_MASK) { - case CHIP_KAVERI: - case CHIP_BONAIRE: - case CHIP_HAWAII: - case CHIP_KABINI: - case CHIP_MULLINS: - dev_info(dev->dev, - "CIK support disabled by module param\n"); - return -ENODEV; - } - } - rdev = kzalloc(sizeof(struct radeon_device), GFP_KERNEL); if (rdev == NULL) { return -ENOMEM; From 73d8e6c7b841d9bf298c8928f228fb433676635c Mon Sep 17 00:00:00 2001 From: Trek Date: Sat, 31 Aug 2019 21:25:36 +0200 Subject: [PATCH 0561/2880] drm/amdgpu: Check for valid number of registers to read Do not try to allocate any amount of memory requested by the user. Instead limit it to 128 registers. Actually the longest series of consecutive allowed registers are 48, mmGB_TILE_MODE0-31 and mmGB_MACROTILE_MODE0-15 (0x2644-0x2673). Bug: https://bugs.freedesktop.org/show_bug.cgi?id=111273 Signed-off-by: Trek Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 0e2ec608530b..f6147528be64 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -677,6 +677,9 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file if (sh_num == AMDGPU_INFO_MMR_SH_INDEX_MASK) sh_num = 0xffffffff; + if (info->read_mmr_reg.count > 128) + return -EINVAL; + regs = kmalloc_array(info->read_mmr_reg.count, sizeof(*regs), GFP_KERNEL); if (!regs) return -ENOMEM; From 4f3a2c10772581840d11b76b5c1147a3767710f7 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Wed, 11 Sep 2019 13:15:17 +0800 Subject: [PATCH 0562/2880] drm/amd/amdgpu: power up sdma engine when S3 resume back The sdma_v4 should be ungated when the IP resume back, otherwise it will hang up and resume time out error. Signed-off-by: Prike Liang Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c | 2 +- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 10 ++++++---- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 3 +++ 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c index 61bd10310604..5803fcbae22f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c @@ -948,6 +948,7 @@ int amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev, uint32_t block case AMD_IP_BLOCK_TYPE_UVD: case AMD_IP_BLOCK_TYPE_VCN: case AMD_IP_BLOCK_TYPE_VCE: + case AMD_IP_BLOCK_TYPE_SDMA: if (swsmu) ret = smu_dpm_set_power_gate(&adev->smu, block_type, gate); else @@ -956,7 +957,6 @@ int amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev, uint32_t block break; case AMD_IP_BLOCK_TYPE_GMC: case AMD_IP_BLOCK_TYPE_ACP: - case AMD_IP_BLOCK_TYPE_SDMA: ret = ((adev)->powerplay.pp_funcs->set_powergating_by_smu( (adev)->powerplay.pp_handle, block_type, gate)); break; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index ff18b3a57892..78452cf0115d 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1889,8 +1889,9 @@ static int sdma_v4_0_hw_init(void *handle) int r; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->set_powergating_by_smu) + if ((adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs && + adev->powerplay.pp_funcs->set_powergating_by_smu) || + adev->asic_type == CHIP_RENOIR) amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_SDMA, false); if (!amdgpu_sriov_vf(adev)) @@ -1917,8 +1918,9 @@ static int sdma_v4_0_hw_fini(void *handle) sdma_v4_0_ctx_switch_enable(adev, false); sdma_v4_0_enable(adev, false); - if (adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs - && adev->powerplay.pp_funcs->set_powergating_by_smu) + if ((adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs + && adev->powerplay.pp_funcs->set_powergating_by_smu) || + adev->asic_type == CHIP_RENOIR) amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_SDMA, true); return 0; diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 22f3c60d380f..33960fb38a5d 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -354,6 +354,9 @@ int smu_dpm_set_power_gate(struct smu_context *smu, uint32_t block_type, case AMD_IP_BLOCK_TYPE_GFX: ret = smu_gfx_off_control(smu, gate); break; + case AMD_IP_BLOCK_TYPE_SDMA: + ret = smu_powergate_sdma(smu, gate); + break; default: break; } From dcafbd50f2e4d5cc964aae409fb5691b743fba23 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Thu, 5 Sep 2019 19:22:02 -0400 Subject: [PATCH 0563/2880] drm/amdgpu: Fix KFD-related kernel oops on Hawaii MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hawaii needs to flush caches explicitly, submitting an IB in a user VMID from kernel mode. There is no s_fence in this case. Fixes: eb3961a57424 ("drm/amdgpu: remove fence context from the job") Signed-off-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index 7850084a05e3..60655834d649 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -143,7 +143,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, /* ring tests don't use a job */ if (job) { vm = job->vm; - fence_ctx = job->base.s_fence->scheduled.context; + fence_ctx = job->base.s_fence ? + job->base.s_fence->scheduled.context : 0; } else { vm = NULL; fence_ctx = 0; From 8ad050e6a67822a4adf1e22f396e141b3deaf8ef Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Fri, 6 Sep 2019 16:31:21 -0400 Subject: [PATCH 0564/2880] drm/amd/display: add Asic ID for Dali Dali is a new asic revision based on raven2 Add the ID and ASICREV_IS_DALI define Signed-off-by: Bhawanpreet Lakha Reviewed-by: Huang Rui Acked-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/include/dal_asic_id.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h b/drivers/gpu/drm/amd/display/include/dal_asic_id.h index 1f16892f0add..1be6c44fd32f 100644 --- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h +++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h @@ -137,10 +137,13 @@ #define RAVEN1_F0 0xF0 #define RAVEN_UNKNOWN 0xFF +#define PICASSO_15D8_REV_E3 0xE3 +#define PICASSO_15D8_REV_E4 0xE4 + #define ASICREV_IS_RAVEN(eChipRev) ((eChipRev >= RAVEN_A0) && eChipRev < RAVEN_UNKNOWN) #define ASICREV_IS_PICASSO(eChipRev) ((eChipRev >= PICASSO_A0) && (eChipRev < RAVEN2_A0)) -#define ASICREV_IS_RAVEN2(eChipRev) ((eChipRev >= RAVEN2_A0) && (eChipRev < 0xF0)) - +#define ASICREV_IS_RAVEN2(eChipRev) ((eChipRev >= RAVEN2_A0) && (eChipRev < PICASSO_15D8_REV_E3)) +#define ASICREV_IS_DALI(eChipRev) ((eChipRev >= PICASSO_15D8_REV_E3) && (eChipRev < RAVEN1_F0)) #define ASICREV_IS_RV1_F0(eChipRev) ((eChipRev >= RAVEN1_F0) && (eChipRev < RAVEN_UNKNOWN)) From e42a34dec68950ebad7904f235ed2dfff5bb27b5 Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Fri, 6 Sep 2019 16:32:44 -0400 Subject: [PATCH 0565/2880] drm/amd/display: Implement voltage limitation for dali [Why] we only want the lowest voltage to be available for dali. [How] Use the get_highest_allowed_voltage_level function to return 0 for dali Signed-off-by: Bhawanpreet Lakha Reviewed-by: Huang Rui Acked-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c index 383f4f8db8f4..9b2cb57bf2ba 100644 --- a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c +++ b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c @@ -708,6 +708,10 @@ static void hack_bounding_box(struct dcn_bw_internal_vars *v, unsigned int get_highest_allowed_voltage_level(uint32_t hw_internal_rev) { + /* for dali, the highest voltage level we want is 0 */ + if (ASICREV_IS_DALI(hw_internal_rev)) + return 0; + /* we are ok with all levels */ return 4; } From bb264220d9316f6bd7c1fd84b8da398c93912931 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 2 Sep 2019 16:33:42 +0800 Subject: [PATCH 0566/2880] drm/amd/display: Restore backlight brightness after system resume Laptops with AMD APU doesn't restore display backlight brightness after system resume. This issue started when DC was introduced. Let's use BL_CORE_SUSPENDRESUME so the backlight core calls update_status callback after system resume to restore the backlight level. Tested on Dell Inspiron 3180 (Stoney Ridge) and Dell Latitude 5495 (Raven Ridge). Cc: Signed-off-by: Kai-Heng Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 9590ca552a28..d3f404f097eb 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2113,6 +2113,7 @@ static int amdgpu_dm_backlight_get_brightness(struct backlight_device *bd) } static const struct backlight_ops amdgpu_dm_backlight_ops = { + .options = BL_CORE_SUSPENDRESUME, .get_brightness = amdgpu_dm_backlight_get_brightness, .update_status = amdgpu_dm_backlight_update_status, }; From 8b8031703bd75ebedfcae59d64f8c0c006300bcb Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Wed, 4 Sep 2019 16:34:39 +0800 Subject: [PATCH 0567/2880] drm/amd/powerplay: implement sysfs for getting dpm clock With the common interface print_clk_levels can get the following dpm clock: -pp_dpm_dcefclk -pp_dpm_fclk -pp_dpm_mclk -pp_dpm_sclk -pp_dpm_socclk Signed-off-by: Prike Liang Reviewed-by: Evan Quan Reviewed-by: Kevin Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/renoir_ppt.c | 70 ++++++++++++++++++++++ drivers/gpu/drm/amd/powerplay/renoir_ppt.h | 25 ++++++++ 2 files changed, 95 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c index 2a6da546fb55..e62bfba51562 100644 --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c @@ -177,12 +177,82 @@ static int renoir_get_dpm_uclk_limited(struct smu_context *smu, uint32_t *clock, } +static int renoir_print_clk_levels(struct smu_context *smu, + enum smu_clk_type clk_type, char *buf) +{ + int i, size = 0, ret = 0; + uint32_t cur_value = 0, value = 0, count = 0, min = 0, max = 0; + DpmClocks_t *clk_table = smu->smu_table.clocks_table; + SmuMetrics_t metrics = {0}; + + if (!clk_table || clk_type >= SMU_CLK_COUNT) + return -EINVAL; + + ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, + (void *)&metrics, false); + if (ret) + return ret; + + switch (clk_type) { + case SMU_GFXCLK: + case SMU_SCLK: + /* retirve table returned paramters unit is MHz */ + cur_value = metrics.ClockFrequency[CLOCK_GFXCLK]; + ret = smu_get_dpm_freq_range(smu, SMU_GFXCLK, &min, &max); + if (!ret) { + /* driver only know min/max gfx_clk, Add level 1 for all other gfx clks */ + if (cur_value == max) + i = 2; + else if (cur_value == min) + i = 0; + else + i = 1; + + size += sprintf(buf + size, "0: %uMhz %s\n", min, + i == 0 ? "*" : ""); + size += sprintf(buf + size, "1: %uMhz %s\n", + i == 1 ? cur_value : RENOIR_UMD_PSTATE_GFXCLK, + i == 1 ? "*" : ""); + size += sprintf(buf + size, "2: %uMhz %s\n", max, + i == 2 ? "*" : ""); + } + return size; + case SMU_SOCCLK: + count = NUM_SOCCLK_DPM_LEVELS; + cur_value = metrics.ClockFrequency[CLOCK_SOCCLK]; + break; + case SMU_MCLK: + count = NUM_MEMCLK_DPM_LEVELS; + cur_value = metrics.ClockFrequency[CLOCK_UMCCLK]; + break; + case SMU_DCEFCLK: + count = NUM_DCFCLK_DPM_LEVELS; + cur_value = metrics.ClockFrequency[CLOCK_DCFCLK]; + break; + case SMU_FCLK: + count = NUM_FCLK_DPM_LEVELS; + cur_value = metrics.ClockFrequency[CLOCK_FCLK]; + break; + default: + return -EINVAL; + } + + for (i = 0; i < count; i++) { + GET_DPM_CUR_FREQ(clk_table, clk_type, i, value); + size += sprintf(buf + size, "%d: %uMhz %s\n", i, value, + cur_value == value ? "*" : ""); + } + + return size; +} + static const struct pptable_funcs renoir_ppt_funcs = { .get_smu_msg_index = renoir_get_smu_msg_index, .get_smu_table_index = renoir_get_smu_table_index, .tables_init = renoir_tables_init, .set_power_state = NULL, .get_dpm_uclk_limited = renoir_get_dpm_uclk_limited, + .print_clk_levels = renoir_print_clk_levels, }; void renoir_set_ppt_funcs(struct smu_context *smu) diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h index e9b7237c0f7f..2a390ddd37dd 100644 --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h @@ -25,4 +25,29 @@ extern void renoir_set_ppt_funcs(struct smu_context *smu); +/* UMD PState Renoir Msg Parameters in MHz */ +#define RENOIR_UMD_PSTATE_GFXCLK 700 +#define RENOIR_UMD_PSTATE_SOCCLK 678 +#define RENOIR_UMD_PSTATE_FCLK 800 + +#define GET_DPM_CUR_FREQ(table, clk_type, dpm_level, freq) \ + do { \ + switch (clk_type) { \ + case SMU_SOCCLK: \ + freq = table->SocClocks[dpm_level].Freq; \ + break; \ + case SMU_MCLK: \ + freq = table->MemClocks[dpm_level].Freq; \ + break; \ + case SMU_DCEFCLK: \ + freq = table->DcfClocks[dpm_level].Freq; \ + break; \ + case SMU_FCLK: \ + freq = table->FClocks[dpm_level].Freq; \ + break; \ + default: \ + break; \ + } \ + } while (0) + #endif From 8fde7784ecd35120a5ace851bcf384e90fb2b64b Mon Sep 17 00:00:00 2001 From: Jay Cornwall Date: Thu, 12 Sep 2019 14:03:41 -0500 Subject: [PATCH 0568/2880] drm/amdkfd: Swap trap temporary registers in gfx10 trap handler ttmp[4:5] hold information useful to the debugger. Use ttmp[14:15] instead, aligning implementation with gfx9 trap handler. Signed-off-by: Jay Cornwall Reviewed-by: shaoyun liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 6 +++--- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index a8cf82d46109..901fe3590165 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -694,10 +694,10 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x003f8000, 0x8f6f896f, 0x88776f77, 0x8a6eff6e, 0x023f8000, 0xb9eef807, - 0xb970f812, 0xb971f813, - 0x8ff08870, 0xf4051bb8, + 0xb97af812, 0xb97bf813, + 0x8ffa887a, 0xf4051bbd, 0xfa000000, 0xbf8cc07f, - 0xf4051c38, 0xfa000008, + 0xf4051ebd, 0xfa000008, 0xbf8cc07f, 0x87ee6e6e, 0xbf840001, 0xbe80206e, 0xb971f803, 0x8771ff71, diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 35986219ce5f..cdaa523ce6be 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -187,12 +187,12 @@ L_FETCH_2ND_TRAP: // Read second-level TBA/TMA from first-level TMA and jump if available. // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) // ttmp12 holds SQ_WAVE_STATUS - s_getreg_b32 ttmp4, hwreg(HW_REG_SHADER_TMA_LO) - s_getreg_b32 ttmp5, hwreg(HW_REG_SHADER_TMA_HI) - s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 - s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA + s_getreg_b32 ttmp14, hwreg(HW_REG_SHADER_TMA_LO) + s_getreg_b32 ttmp15, hwreg(HW_REG_SHADER_TMA_HI) + s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA s_waitcnt lgkmcnt(0) - s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA s_waitcnt lgkmcnt(0) s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set From a82e163bca624c7c222b33a95fb6ed7c9bcee801 Mon Sep 17 00:00:00 2001 From: "Tianci.Yin" Date: Thu, 5 Sep 2019 15:28:57 +0800 Subject: [PATCH 0569/2880] drm/amdgpu: add navi14 PCI ID for work station SKU Add the navi14 PCI device id. Reviewed-by: Feifei Xu Signed-off-by: Tianci.Yin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 48a2070e72f2..22a8d4c4c591 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1012,6 +1012,8 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x731F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, /* Navi14 */ {0x1002, 0x7340, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14}, + {0x1002, 0x7341, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14}, + {0x1002, 0x7347, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14}, /* Renoir */ {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU|AMD_EXP_HW_SUPPORT}, From 10e85054f986ce9d899cdb3efc8529f8016bf435 Mon Sep 17 00:00:00 2001 From: "Tianci.Yin" Date: Tue, 17 Sep 2019 10:35:34 +0800 Subject: [PATCH 0570/2880] drm/amdgpu: add navi12 pci id Add Navi12 PCI id support. Reviewed-by: Hawking Zhang Signed-off-by: Tianci.Yin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 22a8d4c4c591..c68e54a27a2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1018,6 +1018,9 @@ static const struct pci_device_id pciidlist[] = { /* Renoir */ {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU|AMD_EXP_HW_SUPPORT}, + /* Navi12 */ + {0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12}, + {0, 0, 0} }; From 314eed30ede02fa925990f535652254b5bad6b65 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Sep 2019 11:00:25 -0700 Subject: [PATCH 0571/2880] usercopy: Avoid HIGHMEM pfn warning When running on a system with >512MB RAM with a 32-bit kernel built with: CONFIG_DEBUG_VIRTUAL=y CONFIG_HIGHMEM=y CONFIG_HARDENED_USERCOPY=y all execve()s will fail due to argv copying into kmap()ed pages, and on usercopy checking the calls ultimately of virt_to_page() will be looking for "bad" kmap (highmem) pointers due to CONFIG_DEBUG_VIRTUAL=y: ------------[ cut here ]------------ kernel BUG at ../arch/x86/mm/physaddr.c:83! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc8 #6 Hardware name: Dell Inc. Inspiron 1318/0C236D, BIOS A04 01/15/2009 EIP: __phys_addr+0xaf/0x100 ... Call Trace: __check_object_size+0xaf/0x3c0 ? __might_sleep+0x80/0xa0 copy_strings+0x1c2/0x370 copy_strings_kernel+0x2b/0x40 __do_execve_file+0x4ca/0x810 ? kmem_cache_alloc+0x1c7/0x370 do_execve+0x1b/0x20 ... The check is from arch/x86/mm/physaddr.c: VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); Due to the kmap() in fs/exec.c: kaddr = kmap(kmapped_page); ... if (copy_from_user(kaddr+offset, str, bytes_to_copy)) ... Now we can fetch the correct page to avoid the pfn check. In both cases, hardened usercopy will need to walk the page-span checker (if enabled) to do sanity checking. Reported-by: Randy Dunlap Tested-by: Randy Dunlap Fixes: f5509cc18daa ("mm: Hardened usercopy") Cc: Matthew Wilcox Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/201909171056.7F2FFD17@keescook --- mm/usercopy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 98e924864554..660717a1ea5c 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -227,7 +228,12 @@ static inline void check_heap_object(const void *ptr, unsigned long n, if (!virt_addr_valid(ptr)) return; - page = virt_to_head_page(ptr); + /* + * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the + * highmem page or fallback to virt_to_page(). The following + * is effectively a highmem-aware virt_to_head_page(). + */ + page = compound_head(kmap_to_page((void *)ptr)); if (PageSlab(page)) { /* Check slab allocator for flags and size. */ From 5eaed68dd38c14907be2ce5a45c73a5f2acb75f4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 16 Sep 2019 18:44:28 +0300 Subject: [PATCH 0572/2880] block: use symbolic constants for t10_pi type Replace all hard-coded values with T10_PI_TYPES to make the code more readable. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Max Gurtovoy Signed-off-by: Jens Axboe --- block/t10-pi.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/t10-pi.c b/block/t10-pi.c index 0c0094609dd6..7fed58705d1a 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -27,7 +27,7 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len) * tag. */ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, - csum_fn *fn, unsigned int type) + csum_fn *fn, enum t10_dif_type type) { unsigned int i; @@ -37,7 +37,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, pi->guard_tag = fn(iter->data_buf, iter->interval); pi->app_tag = 0; - if (type == 1) + if (type == T10_PI_TYPE1_PROTECTION) pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); else pi->ref_tag = 0; @@ -51,7 +51,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, } static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - csum_fn *fn, unsigned int type) + csum_fn *fn, enum t10_dif_type type) { unsigned int i; @@ -60,8 +60,8 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, __be16 csum; switch (type) { - case 1: - case 2: + case T10_PI_TYPE1_PROTECTION: + case T10_PI_TYPE2_PROTECTION: if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -74,7 +74,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, return BLK_STS_PROTECTION; } break; - case 3: + case T10_PI_TYPE3_PROTECTION: if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; @@ -102,42 +102,42 @@ next: static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_crc_fn, 1); + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_ip_fn, 1); + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_crc_fn, 1); + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_ip_fn, 1); + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_crc_fn, 3); + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); } static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_ip_fn, 3); + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); } static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_crc_fn, 3); + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); } static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_ip_fn, 3); + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); } const struct blk_integrity_profile t10_pi_type1_crc = { From 54d4e6ab91eb24b47a58403d8561206e916f0242 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 16 Sep 2019 18:44:29 +0300 Subject: [PATCH 0573/2880] block: centralize PI remapping logic to the block layer Currently t10_pi_prepare/t10_pi_complete functions are called during the NVMe and SCSi layers command preparetion/completion, but their actual place should be the block layer since T10-PI is a general data integrity feature that is used by block storage protocols. Introduce .prepare_fn and .complete_fn callbacks within the integrity profile that each type can implement according to its needs. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Suggested-by: Martin K. Petersen Reviewed-by: Martin K. Petersen Signed-off-by: Max Gurtovoy Fixed to not call queue integrity functions if BLK_DEV_INTEGRITY isn't defined in the config. Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ++ block/blk-integrity.c | 11 +++ block/blk-mq.c | 6 ++ block/t10-pi.c | 144 ++++++++++++++++++++------------------ drivers/md/dm-integrity.c | 10 +++ drivers/nvme/host/core.c | 9 --- drivers/scsi/sd.c | 8 --- include/linux/blkdev.h | 4 ++ include/linux/t10-pi.h | 14 ---- 9 files changed, 114 insertions(+), 99 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 875e8d105067..d5e668ec751b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1436,6 +1437,12 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET))) print_req_error(req, error, __func__); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ca39b4624cf8..ff1070edbb40 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -368,10 +368,21 @@ static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) return BLK_STS_OK; } +static void blk_integrity_nop_prepare(struct request *rq) +{ +} + +static void blk_integrity_nop_complete(struct request *rq, + unsigned int nr_bytes) +{ +} + static const struct blk_integrity_profile nop_profile = { .name = "nop", .generate_fn = blk_integrity_nop_fn, .verify_fn = blk_integrity_nop_fn, + .prepare_fn = blk_integrity_nop_prepare, + .complete_fn = blk_integrity_nop_complete, }; /** diff --git a/block/blk-mq.c b/block/blk-mq.c index 20a49be536b5..29275f5a996f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -30,6 +30,7 @@ #include #include +#include #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -700,6 +701,11 @@ void blk_mq_start_request(struct request *rq) */ rq->nr_phys_segments++; } + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) + q->integrity.profile->prepare_fn(rq); +#endif } EXPORT_SYMBOL(blk_mq_start_request); diff --git a/block/t10-pi.c b/block/t10-pi.c index 7fed58705d1a..0c0120a672f9 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -120,76 +120,22 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } -static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -const struct blk_integrity_profile t10_pi_type1_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = t10_pi_type1_generate_crc, - .verify_fn = t10_pi_type1_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type1_crc); - -const struct blk_integrity_profile t10_pi_type1_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = t10_pi_type1_generate_ip, - .verify_fn = t10_pi_type1_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type1_ip); - -const struct blk_integrity_profile t10_pi_type3_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = t10_pi_type3_generate_crc, - .verify_fn = t10_pi_type3_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type3_crc); - -const struct blk_integrity_profile t10_pi_type3_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = t10_pi_type3_generate_ip, - .verify_fn = t10_pi_type3_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type3_ip); - /** - * t10_pi_prepare - prepare PI prior submitting request to device + * t10_pi_type1_prepare - prepare PI prior submitting request to device * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to * partitioning, MD/DM cloning, etc. the actual physical start sector is * likely to be different. Remap protection information to match the * physical LBA. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_prepare(struct request *rq, u8 protection_type) +static void t10_pi_type1_prepare(struct request *rq) { const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -222,13 +168,11 @@ void t10_pi_prepare(struct request *rq, u8 protection_type) bip->bip_flags |= BIP_MAPPED_INTEGRITY; } } -EXPORT_SYMBOL(t10_pi_prepare); /** - * t10_pi_complete - prepare PI prior returning request to the block layer + * t10_pi_type1_complete - prepare PI prior returning request to the blk layer * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) - * @intervals: total elements to prepare + * @nr_bytes: total bytes to prepare * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to @@ -236,19 +180,14 @@ EXPORT_SYMBOL(t10_pi_prepare); * likely to be different. Since the physical start sector was submitted * to the device, we should remap it back to virtual values expected by the * block layer. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals) +static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { + unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -276,4 +215,73 @@ void t10_pi_complete(struct request *rq, u8 protection_type, } } } -EXPORT_SYMBOL(t10_pi_complete); + +static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +/** + * Type 3 does not have a reference tag so no remapping is required. + */ +static void t10_pi_type3_prepare(struct request *rq) +{ +} + +/** + * Type 3 does not have a reference tag so no remapping is required. + */ +static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) +{ +} + +const struct blk_integrity_profile t10_pi_type1_crc = { + .name = "T10-DIF-TYPE1-CRC", + .generate_fn = t10_pi_type1_generate_crc, + .verify_fn = t10_pi_type1_verify_crc, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_crc); + +const struct blk_integrity_profile t10_pi_type1_ip = { + .name = "T10-DIF-TYPE1-IP", + .generate_fn = t10_pi_type1_generate_ip, + .verify_fn = t10_pi_type1_verify_ip, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_ip); + +const struct blk_integrity_profile t10_pi_type3_crc = { + .name = "T10-DIF-TYPE3-CRC", + .generate_fn = t10_pi_type3_generate_crc, + .verify_fn = t10_pi_type3_verify_crc, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_crc); + +const struct blk_integrity_profile t10_pi_type3_ip = { + .name = "T10-DIF-TYPE3-IP", + .generate_fn = t10_pi_type3_generate_ip, + .verify_fn = t10_pi_type3_verify_ip, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_ip); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 9118ab85cb3a..dab4446fe7d8 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -345,6 +345,14 @@ static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) #endif +static void dm_integrity_prepare(struct request *rq) +{ +} + +static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ +} + /* * DM Integrity profile, protection is performed layer above (dm-crypt) */ @@ -352,6 +360,8 @@ static const struct blk_integrity_profile dm_integrity_profile = { .name = "DM-DIF-EXT-TAG", .generate_fn = NULL, .verify_fn = NULL, + .prepare_fn = dm_integrity_prepare, + .complete_fn = dm_integrity_complete, }; static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1ede1763a5ee..108f60b46804 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -666,8 +666,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) return BLK_STS_NOTSUPP; control |= NVME_RW_PRINFO_PRACT; - } else if (req_op(req) == REQ_OP_WRITE) { - t10_pi_prepare(req, ns->pi_type); } switch (ns->pi_type) { @@ -690,13 +688,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, void nvme_cleanup_cmd(struct request *req) { - if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && - nvme_req(req)->status == 0) { - struct nvme_ns *ns = req->rq_disk->private_data; - - t10_pi_complete(req, ns->pi_type, - blk_rq_bytes(req) >> ns->lba_shift); - } if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { struct nvme_ns *ns = req->rq_disk->private_data; struct page *page = req->special_vec.bv_page; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 4b925552458f..0e8941caaa0d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1211,9 +1211,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); - if (write && dix) - t10_pi_prepare(cmd->request, sdkp->protection_type); - if (dif || dix) protect = sd_setup_protect_cmnd(cmd, dix, dif); else @@ -2054,11 +2051,6 @@ static int sd_done(struct scsi_cmnd *SCpnt) "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); - if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) && - good_bytes) - t10_pi_complete(SCpnt->request, sdkp->protection_type, - good_bytes / scsi_prot_interval(SCpnt)); - return good_bytes; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3094f2d513b2..6032bb740cf4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1522,10 +1522,14 @@ struct blk_integrity_iter { }; typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); +typedef void (integrity_prepare_fn) (struct request *); +typedef void (integrity_complete_fn) (struct request *, unsigned int); struct blk_integrity_profile { integrity_processing_fn *generate_fn; integrity_processing_fn *verify_fn; + integrity_prepare_fn *prepare_fn; + integrity_complete_fn *complete_fn; const char *name; }; diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 3e2a80cc7b56..96305a64a5a7 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -53,18 +53,4 @@ extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; extern const struct blk_integrity_profile t10_pi_type3_ip; -#ifdef CONFIG_BLK_DEV_INTEGRITY -extern void t10_pi_prepare(struct request *rq, u8 protection_type); -extern void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals); -#else -static inline void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals) -{ -} -static inline void t10_pi_prepare(struct request *rq, u8 protection_type) -{ -} -#endif - #endif From 23ed570accc94cf9de9036b71102aa7c0e0e8dc6 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 22 Aug 2019 17:20:34 +0200 Subject: [PATCH 0574/2880] block, bfq: update inject limit only after injection occurred BFQ updates the injection limit of each bfq_queue as a function of how much the limit inflates the service times experienced by the I/O requests of the queue. So only service times affected by injection must be taken into account. Unfortunately, in the current implementation of this update scheme, the service time of an I/O request rq not affected by injection may happen to be considered in the following case: there is no I/O request in service when rq arrives. This commit fixes this issue by making sure that only service times affected by injection are considered for updating the injection limit. In particular, the service time of an I/O request rq is now considered only if at least one of the following two conditions holds: - the destination bfq_queue for rq underwent injection before rq arrival, and there is still I/O in service in the drive on rq arrival (the service of such unfinished I/O may delay the service of rq); - injection occurs between the arrival and the completion time of rq. Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b33be928d164..5a2bbd8613a8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2025,7 +2025,21 @@ static void bfq_add_request(struct request *rq) * be set when rq will be dispatched. */ bfqd->wait_dispatch = true; - bfqd->rqs_injected = false; + /* + * If there is no I/O in service in the drive, + * then possible injection occurred before the + * arrival of rq will not affect the total + * service time of rq. So the injection limit + * must not be updated as a function of such + * total service time, unless new injection + * occurs before rq is completed. To have the + * injection limit updated only in the latter + * case, reset rqs_injected here (rqs_injected + * will be set in case injection is performed + * on bfqq before rq is completed). + */ + if (bfqd->rq_in_driver == 0) + bfqd->rqs_injected = false; } } @@ -5784,7 +5798,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; unsigned int old_limit = bfqq->inject_limit; - if (bfqq->last_serv_time_ns > 0) { + if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { u64 threshold = (bfqq->last_serv_time_ns * 3)>>1; if (tot_time_ns >= threshold && old_limit > 0) { @@ -5830,6 +5844,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, /* update complete, not waiting for any request completion any longer */ bfqd->waited_rq = NULL; + bfqd->rqs_injected = false; } /* From c1e0a18228822258cbeb49a923d93f8f01c74a76 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 22 Aug 2019 17:20:35 +0200 Subject: [PATCH 0575/2880] block, bfq: reduce upper bound for inject limit to max_rq_in_driver+1 Upon an increment attempt of the injection limit, the latter is constrained not to become higher than twice the maximum number max_rq_in_driver of I/O requests that have happened to be in service in the drive. This high bound allows the injection limit to grow beyond max_rq_in_driver, which may then cause max_rq_in_driver itself to grow. However, since the limit is incremented by only one unit at a time, there is no need for such a high bound, and just max_rq_in_driver+1 is enough. Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 5a2bbd8613a8..e114282204f6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5805,7 +5805,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, bfqq->inject_limit--; bfqq->decrease_time_jif = jiffies; } else if (tot_time_ns < threshold && - old_limit < bfqd->max_rq_in_driver<<1) + old_limit <= bfqd->max_rq_in_driver) bfqq->inject_limit++; } From 17c3d2660268501d36b75c7c7083dc8f2bbb93c3 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 22 Aug 2019 17:20:36 +0200 Subject: [PATCH 0576/2880] block, bfq: increase update frequency of inject limit The update period of the injection limit has been tentatively set to 100 ms, to reduce fluctuations. This value however proved to cause, occasionally, the limit to be decremented for some bfq_queue only after the queue underwent excessive injection for a lot of time. This commit reduces the period to 10 ms. Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e114282204f6..ddac93e910fa 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2016,7 +2016,7 @@ static void bfq_add_request(struct request *rq) (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + - msecs_to_jiffies(100))) { + msecs_to_jiffies(10))) { bfqd->last_empty_occupied_ns = ktime_get_ns(); /* * Start the state machine for measuring the From 58494c980f40274c465ebfdece02d401def088bf Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 22 Aug 2019 17:20:37 +0200 Subject: [PATCH 0577/2880] block, bfq: push up injection only after setting service time If equal to 0, the injection limit for a bfq_queue is pushed to 1 after a first sample of the total service time of the I/O requests of the queue is computed (to allow injection to start). Yet, because of a mistake in the branch that performs this action, the push may happen also in some other case. This commit fixes this issue. Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ddac93e910fa..0319d6339822 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5823,12 +5823,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, */ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || tot_time_ns < bfqq->last_serv_time_ns) { + if (bfqq->last_serv_time_ns == 0) { + /* + * Now we certainly have a base value: make sure we + * start trying injection. + */ + bfqq->inject_limit = max_t(unsigned int, 1, old_limit); + } bfqq->last_serv_time_ns = tot_time_ns; - /* - * Now we certainly have a base value: make sure we - * start trying injection. - */ - bfqq->inject_limit = max_t(unsigned int, 1, old_limit); } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) /* * No I/O injected and no request still in service in From ec76a7b922e42df1437e39b44c564ba892676f0e Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 17 Sep 2019 17:26:05 +0530 Subject: [PATCH 0578/2880] nbd: rename the runtime flags as NBD_RT_ prefixed Preparing for the destory when disconnecting crash fixing. Reviewed-by: Josef Bacik Signed-off-by: Xiubo Li Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 74 ++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a8e3815295fe..7e0501c47153 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -71,14 +71,14 @@ struct link_dead_args { int index; }; -#define NBD_TIMEDOUT 0 -#define NBD_DISCONNECT_REQUESTED 1 -#define NBD_DISCONNECTED 2 -#define NBD_HAS_PID_FILE 3 -#define NBD_HAS_CONFIG_REF 4 -#define NBD_BOUND 5 -#define NBD_DESTROY_ON_DISCONNECT 6 -#define NBD_DISCONNECT_ON_CLOSE 7 +#define NBD_RT_TIMEDOUT 0 +#define NBD_RT_DISCONNECT_REQUESTED 1 +#define NBD_RT_DISCONNECTED 2 +#define NBD_RT_HAS_PID_FILE 3 +#define NBD_RT_HAS_CONFIG_REF 4 +#define NBD_RT_BOUND 5 +#define NBD_RT_DESTROY_ON_DISCONNECT 6 +#define NBD_RT_DISCONNECT_ON_CLOSE 7 struct nbd_config { u32 flags; @@ -238,8 +238,8 @@ static void nbd_put(struct nbd_device *nbd) static int nbd_disconnected(struct nbd_config *config) { - return test_bit(NBD_DISCONNECTED, &config->runtime_flags) || - test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); + return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) || + test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); } static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, @@ -257,9 +257,9 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, if (!nsock->dead) { kernel_sock_shutdown(nsock->sock, SHUT_RDWR); if (atomic_dec_return(&nbd->config->live_connections) == 0) { - if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED, + if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED, &nbd->config->runtime_flags)) { - set_bit(NBD_DISCONNECTED, + set_bit(NBD_RT_DISCONNECTED, &nbd->config->runtime_flags); dev_info(nbd_to_dev(nbd), "Disconnected due to user request.\n"); @@ -333,7 +333,7 @@ static void sock_shutdown(struct nbd_device *nbd) if (config->num_connections == 0) return; - if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return; for (i = 0; i < config->num_connections; i++) { @@ -427,7 +427,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); - set_bit(NBD_TIMEDOUT, &config->runtime_flags); + set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); sock_shutdown(nbd); @@ -795,7 +795,7 @@ static int find_fallback(struct nbd_device *nbd, int index) struct nbd_sock *nsock = config->socks[index]; int fallback = nsock->fallback_index; - if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return new_index; if (config->num_connections <= 1) { @@ -836,7 +836,7 @@ static int wait_for_reconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; if (!config->dead_conn_timeout) return 0; - if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return 0; return wait_event_timeout(config->conn_wait, atomic_read(&config->live_connections) > 0, @@ -969,12 +969,12 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, return err; if (!netlink && !nbd->task_setup && - !test_bit(NBD_BOUND, &config->runtime_flags)) + !test_bit(NBD_RT_BOUND, &config->runtime_flags)) nbd->task_setup = current; if (!netlink && (nbd->task_setup != current || - test_bit(NBD_BOUND, &config->runtime_flags))) { + test_bit(NBD_RT_BOUND, &config->runtime_flags))) { dev_err(disk_to_dev(nbd->disk), "Device being setup by another task"); sockfd_put(sock); @@ -1053,7 +1053,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) mutex_unlock(&nsock->tx_lock); sockfd_put(old); - clear_bit(NBD_DISCONNECTED, &config->runtime_flags); + clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); /* We take the tx_mutex in an error path in the recv_work, so we * need to queue_work outside of the tx_mutex. @@ -1124,7 +1124,7 @@ static int nbd_disconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); - set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); + set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); send_disconnects(nbd); return 0; } @@ -1143,7 +1143,7 @@ static void nbd_config_put(struct nbd_device *nbd) struct nbd_config *config = nbd->config; nbd_dev_dbg_close(nbd); nbd_size_clear(nbd); - if (test_and_clear_bit(NBD_HAS_PID_FILE, + if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags)) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); nbd->task_recv = NULL; @@ -1209,7 +1209,7 @@ static int nbd_start_device(struct nbd_device *nbd) dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); return error; } - set_bit(NBD_HAS_PID_FILE, &config->runtime_flags); + set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags); nbd_dev_dbg_init(nbd); for (i = 0; i < num_connections; i++) { @@ -1256,9 +1256,9 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b mutex_lock(&nbd->config_lock); nbd_bdev_reset(bdev); /* user requested, ignore socket errors */ - if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) ret = 0; - if (test_bit(NBD_TIMEDOUT, &config->runtime_flags)) + if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags)) ret = -ETIMEDOUT; return ret; } @@ -1269,7 +1269,7 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd, sock_shutdown(nbd); __invalidate_device(bdev, true); nbd_bdev_reset(bdev); - if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); } @@ -1364,7 +1364,7 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode, /* Don't allow ioctl operations on a nbd device that was created with * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine. */ - if (!test_bit(NBD_BOUND, &config->runtime_flags) || + if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK)) error = __nbd_ioctl(bdev, nbd, cmd, arg); else @@ -1435,7 +1435,7 @@ static void nbd_release(struct gendisk *disk, fmode_t mode) struct nbd_device *nbd = disk->private_data; struct block_device *bdev = bdget_disk(disk, 0); - if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && + if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && bdev->bd_openers == 0) nbd_disconnect_and_put(nbd); @@ -1833,7 +1833,7 @@ again: return -ENOMEM; } refcount_set(&nbd->config_refs, 1); - set_bit(NBD_BOUND, &config->runtime_flags); + set_bit(NBD_RT_BOUND, &config->runtime_flags); ret = nbd_genl_size_set(info, nbd); if (ret) @@ -1853,12 +1853,12 @@ again: if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - set_bit(NBD_DESTROY_ON_DISCONNECT, + set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags); put_dev = true; } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { - set_bit(NBD_DISCONNECT_ON_CLOSE, + set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } @@ -1897,7 +1897,7 @@ again: out: mutex_unlock(&nbd->config_lock); if (!ret) { - set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags); + set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags); refcount_inc(&nbd->config_refs); nbd_connect_reply(info, nbd->index); } @@ -1919,7 +1919,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) * queue. */ flush_workqueue(nbd->recv_workq); - if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); } @@ -2003,7 +2003,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) mutex_lock(&nbd->config_lock); config = nbd->config; - if (!test_bit(NBD_BOUND, &config->runtime_flags) || + if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || !nbd->task_recv) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); @@ -2026,20 +2026,20 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, + if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) put_dev = true; } else { - if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, + if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) refcount_inc(&nbd->refs); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { - set_bit(NBD_DISCONNECT_ON_CLOSE, + set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } else { - clear_bit(NBD_DISCONNECT_ON_CLOSE, + clear_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } From 8454d68563d400fa09b63dc636361b6702ceb8af Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 17 Sep 2019 17:26:06 +0530 Subject: [PATCH 0579/2880] nbd: fix possible page fault for nbd disk When the NBD_CFLAG_DESTROY_ON_DISCONNECT flag is set and at the same time when the socket is closed due to the server daemon is restarted, just before the last DISCONNET is totally done if we start a new connection by using the old nbd_index, there will be crashing randomly, like: <3>[ 110.151949] block nbd1: Receive control failed (result -32) <1>[ 110.152024] BUG: unable to handle page fault for address: 0000058000000840 <1>[ 110.152063] #PF: supervisor read access in kernel mode <1>[ 110.152083] #PF: error_code(0x0000) - not-present page <6>[ 110.152094] PGD 0 P4D 0 <4>[ 110.152106] Oops: 0000 [#1] SMP PTI <4>[ 110.152120] CPU: 0 PID: 6698 Comm: kworker/u5:1 Kdump: loaded Not tainted 5.3.0-rc4+ #2 <4>[ 110.152136] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 <4>[ 110.152166] Workqueue: knbd-recv recv_work [nbd] <4>[ 110.152187] RIP: 0010:__dev_printk+0xd/0x67 <4>[ 110.152206] Code: 10 e8 c5 fd ff ff 48 8b 4c 24 18 65 48 33 0c 25 28 00 [...] <4>[ 110.152244] RSP: 0018:ffffa41581f13d18 EFLAGS: 00010206 <4>[ 110.152256] RAX: ffffa41581f13d30 RBX: ffff96dd7374e900 RCX: 0000000000000000 <4>[ 110.152271] RDX: ffffa41581f13d20 RSI: 00000580000007f0 RDI: ffffffff970ec24f <4>[ 110.152285] RBP: ffffa41581f13d80 R08: ffff96dd7fc17908 R09: 0000000000002e56 <4>[ 110.152299] R10: ffffffff970ec24f R11: 0000000000000003 R12: ffff96dd7374e900 <4>[ 110.152313] R13: 0000000000000000 R14: ffff96dd7374e9d8 R15: ffff96dd6e3b02c8 <4>[ 110.152329] FS: 0000000000000000(0000) GS:ffff96dd7fc00000(0000) knlGS:0000000000000000 <4>[ 110.152362] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4>[ 110.152383] CR2: 0000058000000840 CR3: 0000000067cc6002 CR4: 00000000001606f0 <4>[ 110.152401] Call Trace: <4>[ 110.152422] _dev_err+0x6c/0x83 <4>[ 110.152435] nbd_read_stat.cold+0xda/0x578 [nbd] <4>[ 110.152448] ? __switch_to_asm+0x34/0x70 <4>[ 110.152468] ? __switch_to_asm+0x40/0x70 <4>[ 110.152478] ? __switch_to_asm+0x34/0x70 <4>[ 110.152491] ? __switch_to_asm+0x40/0x70 <4>[ 110.152501] ? __switch_to_asm+0x34/0x70 <4>[ 110.152511] ? __switch_to_asm+0x40/0x70 <4>[ 110.152522] ? __switch_to_asm+0x34/0x70 <4>[ 110.152533] recv_work+0x35/0x9e [nbd] <4>[ 110.152547] process_one_work+0x19d/0x340 <4>[ 110.152558] worker_thread+0x50/0x3b0 <4>[ 110.152568] kthread+0xfb/0x130 <4>[ 110.152577] ? process_one_work+0x340/0x340 <4>[ 110.152609] ? kthread_park+0x80/0x80 <4>[ 110.152637] ret_from_fork+0x35/0x40 This is very easy to reproduce by running the nbd-runner. Reviewed-by: Josef Bacik Signed-off-by: Xiubo Li Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7e0501c47153..ac07e8c94c79 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,9 @@ struct link_dead_args { #define NBD_RT_DESTROY_ON_DISCONNECT 6 #define NBD_RT_DISCONNECT_ON_CLOSE 7 +#define NBD_DESTROY_ON_DISCONNECT 0 +#define NBD_DISCONNECT_REQUESTED 1 + struct nbd_config { u32 flags; unsigned long runtime_flags; @@ -113,6 +117,9 @@ struct nbd_device { struct list_head list; struct task_struct *task_recv; struct task_struct *task_setup; + + struct completion *destroy_complete; + unsigned long flags; }; #define NBD_CMD_REQUEUED 1 @@ -223,6 +230,16 @@ static void nbd_dev_remove(struct nbd_device *nbd) disk->private_data = NULL; put_disk(disk); } + + /* + * Place this in the last just before the nbd is freed to + * make sure that the disk and the related kobject are also + * totally removed to avoid duplicate creation of the same + * one. + */ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete) + complete(nbd->destroy_complete); + kfree(nbd); } @@ -1125,6 +1142,7 @@ static int nbd_disconnect(struct nbd_device *nbd) dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); + set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags); send_disconnects(nbd); return 0; } @@ -1636,6 +1654,7 @@ static int nbd_dev_add(int index) nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; + nbd->destroy_complete = NULL; err = blk_mq_alloc_tag_set(&nbd->tag_set); if (err) @@ -1750,6 +1769,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { + DECLARE_COMPLETION_ONSTACK(destroy_complete); struct nbd_device *nbd = NULL; struct nbd_config *config; int index = -1; @@ -1801,6 +1821,17 @@ again: mutex_unlock(&nbd_index_mutex); return -EINVAL; } + + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { + nbd->destroy_complete = &destroy_complete; + mutex_unlock(&nbd_index_mutex); + + /* Wait untill the the nbd stuff is totally destroyed */ + wait_for_completion(&destroy_complete); + goto again; + } + if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); if (index == -1) @@ -1855,7 +1886,10 @@ again: if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags); + set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); put_dev = true; + } else { + clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { set_bit(NBD_RT_DISCONNECT_ON_CLOSE, @@ -2029,10 +2063,12 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) put_dev = true; + set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } else { if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) refcount_inc(&nbd->refs); + clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { From 6a090e97972dc7bb6f2661eda3eabd8ae21e9b6d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Sep 2019 18:39:44 -0700 Subject: [PATCH 0580/2880] arch/microblaze: support get_user() of size 8 bytes arch/microblaze/ is missing support for get_user() of size 8 bytes, so add it by using __copy_from_user(). While there, also drop a lot of the code duplication. Fixes these build errors: drivers/infiniband/core/uverbs_main.o: In function `ib_uverbs_write': drivers/infiniband/core/.tmp_gl_uverbs_main.o:(.text+0x13a4): undefined reference to `__user_bad' drivers/android/binder.o: In function `binder_thread_write': drivers/android/.tmp_gl_binder.o:(.text+0xda6c): undefined reference to `__user_bad' drivers/android/.tmp_gl_binder.o:(.text+0xda98): undefined reference to `__user_bad' drivers/android/.tmp_gl_binder.o:(.text+0xdf10): undefined reference to `__user_bad' drivers/android/.tmp_gl_binder.o:(.text+0xe498): undefined reference to `__user_bad' drivers/android/binder.o:drivers/android/.tmp_gl_binder.o:(.text+0xea78): more undefined references to `__user_bad' follow 'make allmodconfig' now builds successfully for arch/microblaze/. Fixes: 538722ca3b76 ("microblaze: fix get_user/put_user side-effects") Reported-by: kbuild test robot Signed-off-by: Randy Dunlap Acked-by: Linus Torvalds Cc: Al Viro Cc: Steven J. Magnani Cc: Michal Simek Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Andrew Morton Cc: Doug Ledford Signed-off-by: Michal Simek --- arch/microblaze/include/asm/uaccess.h | 42 ++++++--------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index bff2a71c828a..a1f206b90753 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -163,44 +163,15 @@ extern long __user_bad(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ -#define get_user(x, ptr) \ - __get_user_check((x), (ptr), sizeof(*(ptr))) - -#define __get_user_check(x, ptr, size) \ -({ \ - unsigned long __gu_val = 0; \ - const typeof(*(ptr)) __user *__gu_addr = (ptr); \ - int __gu_err = 0; \ - \ - if (access_ok(__gu_addr, size)) { \ - switch (size) { \ - case 1: \ - __get_user_asm("lbu", __gu_addr, __gu_val, \ - __gu_err); \ - break; \ - case 2: \ - __get_user_asm("lhu", __gu_addr, __gu_val, \ - __gu_err); \ - break; \ - case 4: \ - __get_user_asm("lw", __gu_addr, __gu_val, \ - __gu_err); \ - break; \ - default: \ - __gu_err = __user_bad(); \ - break; \ - } \ - } else { \ - __gu_err = -EFAULT; \ - } \ - x = (__force typeof(*(ptr)))__gu_val; \ - __gu_err; \ +#define get_user(x, ptr) ({ \ + const typeof(*(ptr)) __user *__gu_ptr = (ptr); \ + access_ok(__gu_ptr, sizeof(*__gu_ptr)) ? \ + __get_user(x, __gu_ptr) : -EFAULT; \ }) #define __get_user(x, ptr) \ ({ \ unsigned long __gu_val = 0; \ - /*unsigned long __gu_ptr = (unsigned long)(ptr);*/ \ long __gu_err; \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -212,6 +183,11 @@ extern long __user_bad(void); case 4: \ __get_user_asm("lw", (ptr), __gu_val, __gu_err); \ break; \ + case 8: \ + __gu_err = __copy_from_user(&__gu_val, ptr, 8); \ + if (__gu_err) \ + __gu_err = -EFAULT; \ + break; \ default: \ /* __gu_val = 0; __gu_err = -EINVAL;*/ __gu_err = __user_bad();\ } \ From 6be76fd94ba31dae3f62e77d687614ddb75d6b3a Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 4 Sep 2019 10:16:22 +0200 Subject: [PATCH 0581/2880] microblaze: Enable Xilinx AXI emac driver by default Enable Xilinx AXI emac ethernet driver for Microblaze. Signed-off-by: Michal Simek --- arch/microblaze/configs/mmu_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 92fd4e95b488..990f23d5fe50 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -41,6 +41,7 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_NETDEVICES=y CONFIG_XILINX_EMACLITE=y +CONFIG_XILINX_AXI_EMAC=y CONFIG_XILINX_LL_TEMAC=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set From 1c62ed908363e249f3b0e1879d9d982e8431fb3f Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 4 Sep 2019 10:42:12 +0200 Subject: [PATCH 0582/2880] microblaze: defconfig synchronization Kconfig structure has changed that's why make sense to do synchronization. Signed-off-by: Michal Simek --- arch/microblaze/configs/mmu_defconfig | 19 ++++++++----------- arch/microblaze/configs/nommu_defconfig | 23 +++++++++++------------ 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 990f23d5fe50..b83aff1e86cd 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -5,15 +5,10 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y -CONFIG_KALLSYMS_ALL=y # CONFIG_BASE_FULL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y -# CONFIG_EFI_PARTITION is not set CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_BARREL=1 @@ -25,14 +20,19 @@ CONFIG_MMU=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE_FORCE=y CONFIG_HIGHMEM=y -CONFIG_PCI=y CONFIG_PCI_XILINX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_EFI_PARTITION is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set CONFIG_BRIDGE=m +CONFIG_PCI=y CONFIG_MTD=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y @@ -75,8 +75,8 @@ CONFIG_CRAMFS=y CONFIG_ROMFS_FS=y CONFIG_NFS_FS=y CONFIG_CIFS=y -CONFIG_CIFS_STATS=y CONFIG_CIFS_STATS2=y +CONFIG_ENCRYPTED_KEYS=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_SLAB=y CONFIG_DETECT_HUNG_TASK=y @@ -84,6 +84,3 @@ CONFIG_DEBUG_SPINLOCK=y CONFIG_KGDB=y CONFIG_KGDB_TESTS=y CONFIG_KGDB_KDB=y -CONFIG_EARLY_PRINTK=y -CONFIG_KEYS=y -CONFIG_ENCRYPTED_KEYS=y diff --git a/arch/microblaze/configs/nommu_defconfig b/arch/microblaze/configs/nommu_defconfig index 06d69a6e192d..dc102eb65fca 100644 --- a/arch/microblaze/configs/nommu_defconfig +++ b/arch/microblaze/configs/nommu_defconfig @@ -7,15 +7,10 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y -CONFIG_KALLSYMS_ALL=y # CONFIG_BASE_FULL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y -# CONFIG_EFI_PARTITION is not set CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_BARREL=1 @@ -25,13 +20,18 @@ CONFIG_XILINX_MICROBLAZE0_USE_FPU=2 CONFIG_HZ_100=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE_FORCE=y -CONFIG_PCI=y CONFIG_PCI_XILINX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_EFI_PARTITION is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set +CONFIG_PCI=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y @@ -75,11 +75,6 @@ CONFIG_ROMFS_FS=y CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_NLS=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_SLAB=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_EARLY_PRINTK=y CONFIG_KEYS=y CONFIG_ENCRYPTED_KEYS=y CONFIG_CRYPTO_ECB=y @@ -87,3 +82,7 @@ CONFIG_CRYPTO_MD4=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_ARC4=y CONFIG_CRYPTO_DES=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_SLAB=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEBUG_SPINLOCK=y From e0f32f78e51b9989ee89f608fd0dd10e9c230652 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 17 Sep 2019 14:09:35 +0200 Subject: [PATCH 0583/2880] drm/kms: Duct-tape for mode object lifetime checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4f5368b5541a902f6596558b05f5c21a9770dd32 Author: Daniel Vetter Date: Fri Jun 14 08:17:23 2019 +0200 drm/kms: Catch mode_object lifetime errors uncovered a bit a mess in dp drivers. Most drivers (from a quick look, all except i915) register all the dp stuff in their init code, which is too early. With CONFIG_DRM_DP_AUX_CHARDEV this will blow up, because drm_dp_aux_register tries to add a child to a device in sysfs (the connector) which doesn't even exist yet. No one seems to have cared thus far. But with the above change I also moved the setting of dev->registered after the ->load callback, in an attempt to keep old drivers from hitting any WARN_ON backtraces. But that moved radeon.ko from the "working, by accident" to "now also broken" category. Since this is a huge mess I figured a revert would be simplest. But this check has already caught issues in i915: commit 1b9bd09630d4db4827cc04d358a41a16a6bc2cb0 Author: Ville Syrjälä Date: Tue Aug 20 19:16:57 2019 +0300 drm/i915: Do not create a new max_bpc prop for MST connectors Hence I'd like to retain it. Fix the radeon regression by moving the setting of dev->registered back to were it was, and stop the backtraces with an explicit check for dev->driver->load. Everyone else will stay as broken with CONFIG_DRM_DP_AUX_CHARDEV. The next patch will improve the kerneldoc and add a todo entry for this. Fixes: 4f5368b5541a ("drm/kms: Catch mode_object lifetime errors") Cc: Sean Paul Cc: Maarten Lankhorst Reported-by: Michel Dänzer Reviewed-by: Michel Dänzer Tested-by: Michel Dänzer Cc: Michel Dänzer Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20190917120936.7501-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/drm_drv.c | 4 ++-- drivers/gpu/drm/drm_mode_object.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index c456c3d3def2..769feefeeeef 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -976,14 +976,14 @@ int drm_dev_register(struct drm_device *dev, unsigned long flags) if (ret) goto err_minors; + dev->registered = true; + if (dev->driver->load) { ret = dev->driver->load(dev, flags); if (ret) goto err_minors; } - dev->registered = true; - if (drm_core_check_feature(dev, DRIVER_MODESET)) drm_modeset_register_all(dev); diff --git a/drivers/gpu/drm/drm_mode_object.c b/drivers/gpu/drm/drm_mode_object.c index c355ba8e6d5d..6a23e36ed4fe 100644 --- a/drivers/gpu/drm/drm_mode_object.c +++ b/drivers/gpu/drm/drm_mode_object.c @@ -42,7 +42,7 @@ int __drm_mode_object_add(struct drm_device *dev, struct drm_mode_object *obj, { int ret; - WARN_ON(dev->registered && !obj_free_cb); + WARN_ON(!dev->driver->load && dev->registered && !obj_free_cb); mutex_lock(&dev->mode_config.idr_mutex); ret = idr_alloc(&dev->mode_config.object_idr, register_obj ? obj : NULL, @@ -104,7 +104,7 @@ void drm_mode_object_register(struct drm_device *dev, void drm_mode_object_unregister(struct drm_device *dev, struct drm_mode_object *object) { - WARN_ON(dev->registered && !object->free_cb); + WARN_ON(!dev->driver->load && dev->registered && !object->free_cb); mutex_lock(&dev->mode_config.idr_mutex); if (object->id) { From 0dce49efc70536a8c3b4bb5354a71b727ba31b80 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Wed, 18 Sep 2019 12:03:44 +0200 Subject: [PATCH 0584/2880] ASoC: atmel_ssc_dai: Remove wrong spinlock usage A potential bug was reported in the email "[BUG] atmel_ssc_dai: a possible sleep-in-atomic bug in atmel_ssc_shutdown"[1] Indeed in the function atmel_ssc_shutdown() free_irq() was called in a critical section protected by spinlock. However this spinlock is only used in atmel_ssc_shutdown() and atmel_ssc_startup() functions. After further analysis, it occurred that the call to these function are already protected by mutex used on the calling functions. Then we can remove the spinlock which will fix this bug as a side effect. Thanks to this patch the following message disappears: "BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909" [1]: https://www.spinics.net/lists/alsa-devel/msg71286.html Reviewed-by: Alexandre Belloni Signed-off-by: Gregory CLEMENT Link: https://lore.kernel.org/r/20190918100344.23629-1-gregory.clement@bootlin.com Signed-off-by: Mark Brown --- sound/soc/atmel/atmel_ssc_dai.c | 12 ++---------- sound/soc/atmel/atmel_ssc_dai.h | 1 - 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/sound/soc/atmel/atmel_ssc_dai.c b/sound/soc/atmel/atmel_ssc_dai.c index 48e9eef34c0f..ca603397651c 100644 --- a/sound/soc/atmel/atmel_ssc_dai.c +++ b/sound/soc/atmel/atmel_ssc_dai.c @@ -116,19 +116,16 @@ static struct atmel_pcm_dma_params ssc_dma_params[NUM_SSC_DEVICES][2] = { static struct atmel_ssc_info ssc_info[NUM_SSC_DEVICES] = { { .name = "ssc0", - .lock = __SPIN_LOCK_UNLOCKED(ssc_info[0].lock), .dir_mask = SSC_DIR_MASK_UNUSED, .initialized = 0, }, { .name = "ssc1", - .lock = __SPIN_LOCK_UNLOCKED(ssc_info[1].lock), .dir_mask = SSC_DIR_MASK_UNUSED, .initialized = 0, }, { .name = "ssc2", - .lock = __SPIN_LOCK_UNLOCKED(ssc_info[2].lock), .dir_mask = SSC_DIR_MASK_UNUSED, .initialized = 0, }, @@ -317,13 +314,10 @@ static int atmel_ssc_startup(struct snd_pcm_substream *substream, snd_soc_dai_set_dma_data(dai, substream, dma_params); - spin_lock_irq(&ssc_p->lock); - if (ssc_p->dir_mask & dir_mask) { - spin_unlock_irq(&ssc_p->lock); + if (ssc_p->dir_mask & dir_mask) return -EBUSY; - } + ssc_p->dir_mask |= dir_mask; - spin_unlock_irq(&ssc_p->lock); return 0; } @@ -355,7 +349,6 @@ static void atmel_ssc_shutdown(struct snd_pcm_substream *substream, dir_mask = 1 << dir; - spin_lock_irq(&ssc_p->lock); ssc_p->dir_mask &= ~dir_mask; if (!ssc_p->dir_mask) { if (ssc_p->initialized) { @@ -369,7 +362,6 @@ static void atmel_ssc_shutdown(struct snd_pcm_substream *substream, ssc_p->cmr_div = ssc_p->tcmr_period = ssc_p->rcmr_period = 0; ssc_p->forced_divider = 0; } - spin_unlock_irq(&ssc_p->lock); /* Shutdown the SSC clock. */ pr_debug("atmel_ssc_dai: Stopping clock\n"); diff --git a/sound/soc/atmel/atmel_ssc_dai.h b/sound/soc/atmel/atmel_ssc_dai.h index ae764cb541c7..3470b966e449 100644 --- a/sound/soc/atmel/atmel_ssc_dai.h +++ b/sound/soc/atmel/atmel_ssc_dai.h @@ -93,7 +93,6 @@ struct atmel_ssc_state { struct atmel_ssc_info { char *name; struct ssc_device *ssc; - spinlock_t lock; /* lock for dir_mask */ unsigned short dir_mask; /* 0=unused, 1=playback, 2=capture */ unsigned short initialized; /* true if SSC has been initialized */ unsigned short daifmt; From 42fd8baab31f53bed2952485fcf0e92f244c5e55 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 17 Sep 2019 10:34:54 -0400 Subject: [PATCH 0585/2880] sched/core: Convert vcpu_is_preempted() from macro to an inline function Clang reports this warning: kernel/locking/osq_lock.c:25:19: warning: unused function 'node_cpu' [-Wunused-function] due to osq_lock() calling vcpu_is_preempted(node_cpu(node->prev))), but vcpu_is_preempted() is compiled away. Fix it by converting the dummy vcpu_is_preempted() from a macro to a proper static inline function. Signed-off-by: Qian Cai Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: rostedt@goodmis.org Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/1568730894-10483-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f0edee94834a..e2e91960d79f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1856,7 +1856,10 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) * running or not. */ #ifndef vcpu_is_preempted -# define vcpu_is_preempted(cpu) false +static inline bool vcpu_is_preempted(int cpu) +{ + return false; +} #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); From c107d613f9204ff9c7624c229938153d7492c56e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 18 Sep 2019 06:57:30 +0000 Subject: [PATCH 0586/2880] irqchip/gic-v3: Fix GIC_LINE_NR accessor As per GIC spec, ITLinesNumber indicates the maximum SPI INTID that the GIC implementation supports. And the maximum SPI INTID an implementation might support is 1019 (field value 11111). max(GICD_TYPER_SPIS(...), 1020) is not what we actually want for GIC_LINE_NR. Fix it to min(GICD_TYPER_SPIS(...), 1020). Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1568789850-14080-1-git-send-email-yuzenghui@huawei.com --- drivers/irqchip/irq-gic-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 422664ac5f53..1edc99335a94 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -59,7 +59,7 @@ static struct gic_chip_data gic_data __read_mostly; static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key); #define GIC_ID_NR (1U << GICD_TYPER_ID_BITS(gic_data.rdists.gicd_typer)) -#define GIC_LINE_NR max(GICD_TYPER_SPIS(gic_data.rdists.gicd_typer), 1020U) +#define GIC_LINE_NR min(GICD_TYPER_SPIS(gic_data.rdists.gicd_typer), 1020U) #define GIC_ESPI_NR GICD_TYPER_ESPIS(gic_data.rdists.gicd_typer) /* From bb0fed1c60cccbe4063b455a7228818395dac86e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 15 Sep 2019 15:17:45 +0100 Subject: [PATCH 0587/2880] irqchip/sifive-plic: Switch to fasteoi flow The SiFive PLIC interrupt controller seems to have all the HW features to support the fasteoi flow, but the driver seems to be stuck in a distant past. Bring it into the 21st century. Signed-off-by: Marc Zyngier Tested-by: Palmer Dabbelt (QEMU Boot) Tested-by: Darius Rad (on 2 HW PLIC implementations) Tested-by: Paul Walmsley (HiFive Unleashed) Reviewed-by: Palmer Dabbelt Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/8636gxskmj.wl-maz@kernel.org --- drivers/irqchip/irq-sifive-plic.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index cf755964f2f8..3e51deedbcc8 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -97,7 +97,7 @@ static inline void plic_irq_toggle(const struct cpumask *mask, } } -static void plic_irq_enable(struct irq_data *d) +static void plic_irq_unmask(struct irq_data *d) { unsigned int cpu = cpumask_any_and(irq_data_get_affinity_mask(d), cpu_online_mask); @@ -106,7 +106,7 @@ static void plic_irq_enable(struct irq_data *d) plic_irq_toggle(cpumask_of(cpu), d->hwirq, 1); } -static void plic_irq_disable(struct irq_data *d) +static void plic_irq_mask(struct irq_data *d) { plic_irq_toggle(cpu_possible_mask, d->hwirq, 0); } @@ -125,10 +125,8 @@ static int plic_set_affinity(struct irq_data *d, if (cpu >= nr_cpu_ids) return -EINVAL; - if (!irqd_irq_disabled(d)) { - plic_irq_toggle(cpu_possible_mask, d->hwirq, 0); - plic_irq_toggle(cpumask_of(cpu), d->hwirq, 1); - } + plic_irq_toggle(cpu_possible_mask, d->hwirq, 0); + plic_irq_toggle(cpumask_of(cpu), d->hwirq, 1); irq_data_update_effective_affinity(d, cpumask_of(cpu)); @@ -136,14 +134,18 @@ static int plic_set_affinity(struct irq_data *d, } #endif +static void plic_irq_eoi(struct irq_data *d) +{ + struct plic_handler *handler = this_cpu_ptr(&plic_handlers); + + writel(d->hwirq, handler->hart_base + CONTEXT_CLAIM); +} + static struct irq_chip plic_chip = { .name = "SiFive PLIC", - /* - * There is no need to mask/unmask PLIC interrupts. They are "masked" - * by reading claim and "unmasked" when writing it back. - */ - .irq_enable = plic_irq_enable, - .irq_disable = plic_irq_disable, + .irq_mask = plic_irq_mask, + .irq_unmask = plic_irq_unmask, + .irq_eoi = plic_irq_eoi, #ifdef CONFIG_SMP .irq_set_affinity = plic_set_affinity, #endif @@ -152,7 +154,7 @@ static struct irq_chip plic_chip = { static int plic_irqdomain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) { - irq_set_chip_and_handler(irq, &plic_chip, handle_simple_irq); + irq_set_chip_and_handler(irq, &plic_chip, handle_fasteoi_irq); irq_set_chip_data(irq, NULL); irq_set_noprobe(irq); return 0; @@ -188,7 +190,6 @@ static void plic_handle_irq(struct pt_regs *regs) hwirq); else generic_handle_irq(irq); - writel(hwirq, claim); } csr_set(sie, SIE_SEIE); } From 11ed5cf064beac3ca345bb75fdf2429e03ac106a Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 9 Sep 2019 16:21:30 +0100 Subject: [PATCH 0588/2880] firmware: arm_scmi: reset: fix reset_state assignment in scmi_domain_reset Fix the copy paste typo that incorrectly assigns domain_id with the passed 'state' parameter instead of reset_state. Fixes: 95a15d80aa0d ("firmware: arm_scmi: Add RESET protocol in SCMI v2.0") Reported-by: Etienne Carriere Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/reset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/reset.c b/drivers/firmware/arm_scmi/reset.c index 64cc81915581..ab42c21c5517 100644 --- a/drivers/firmware/arm_scmi/reset.c +++ b/drivers/firmware/arm_scmi/reset.c @@ -150,7 +150,7 @@ static int scmi_domain_reset(const struct scmi_handle *handle, u32 domain, dom = t->tx.buf; dom->domain_id = cpu_to_le32(domain); dom->flags = cpu_to_le32(flags); - dom->domain_id = cpu_to_le32(state); + dom->reset_state = cpu_to_le32(state); if (rdom->async_reset) ret = scmi_do_xfer_with_response(handle, t); From 61423712dbb86e02af4aa5de65b9041493c92cac Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 9 Sep 2019 16:21:07 +0100 Subject: [PATCH 0589/2880] reset: reset-scmi: add missing handle initialisation scmi_reset_data->handle needs to be initialised at probe, so that it can be later used to access scmi reset protocol APIs using the same. Since it was tested with a module that obtained handle elsewhere, it was missed easily. Add the missing scmi_reset_data->handle initialisation to fix the issue. Fixes: c8ae9c2da1cc ("reset: Add support for resets provided by SCMI") Acked-by: Philipp Zabel Reported-by: Etienne Carriere Signed-off-by: Sudeep Holla --- drivers/reset/reset-scmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/reset/reset-scmi.c b/drivers/reset/reset-scmi.c index c6d3c8427f14..b46df80ec6c3 100644 --- a/drivers/reset/reset-scmi.c +++ b/drivers/reset/reset-scmi.c @@ -102,6 +102,7 @@ static int scmi_reset_probe(struct scmi_device *sdev) data->rcdev.owner = THIS_MODULE; data->rcdev.of_node = np; data->rcdev.nr_resets = handle->reset_ops->num_domains_get(handle); + data->handle = handle; return devm_reset_controller_register(dev, &data->rcdev); } From 2d1d25d0a224dcd2021004d52342fc1727ccd85f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 29 Aug 2019 14:41:04 +0100 Subject: [PATCH 0590/2880] virtio-fs: add Documentation/filesystems/virtiofs.rst Add information about the new "virtiofs" file system. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- Documentation/filesystems/index.rst | 10 +++++ Documentation/filesystems/virtiofs.rst | 60 ++++++++++++++++++++++++++ MAINTAINERS | 12 ++++++ 3 files changed, 82 insertions(+) create mode 100644 Documentation/filesystems/virtiofs.rst diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 2de2fe2ab078..56e94bfc580f 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -32,3 +32,13 @@ filesystem implementations. journalling fscrypt + +Filesystems +=========== + +Documentation for filesystem implementations. + +.. toctree:: + :maxdepth: 2 + + virtiofs diff --git a/Documentation/filesystems/virtiofs.rst b/Documentation/filesystems/virtiofs.rst new file mode 100644 index 000000000000..4f338e3cb3f7 --- /dev/null +++ b/Documentation/filesystems/virtiofs.rst @@ -0,0 +1,60 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================================== +virtiofs: virtio-fs host<->guest shared file system +=================================================== + +- Copyright (C) 2019 Red Hat, Inc. + +Introduction +============ +The virtiofs file system for Linux implements a driver for the paravirtualized +VIRTIO "virtio-fs" device for guest<->host file system sharing. It allows a +guest to mount a directory that has been exported on the host. + +Guests often require access to files residing on the host or remote systems. +Use cases include making files available to new guests during installation, +booting from a root file system located on the host, persistent storage for +stateless or ephemeral guests, and sharing a directory between guests. + +Although it is possible to use existing network file systems for some of these +tasks, they require configuration steps that are hard to automate and they +expose the storage network to the guest. The virtio-fs device was designed to +solve these problems by providing file system access without networking. + +Furthermore the virtio-fs device takes advantage of the co-location of the +guest and host to increase performance and provide semantics that are not +possible with network file systems. + +Usage +===== +Mount file system with tag ``myfs`` on ``/mnt``: + +.. code-block:: sh + + guest# mount -t virtiofs myfs /mnt + +Please see https://virtio-fs.gitlab.io/ for details on how to configure QEMU +and the virtiofsd daemon. + +Internals +========= +Since the virtio-fs device uses the FUSE protocol for file system requests, the +virtiofs file system for Linux is integrated closely with the FUSE file system +client. The guest acts as the FUSE client while the host acts as the FUSE +server. The /dev/fuse interface between the kernel and userspace is replaced +with the virtio-fs device interface. + +FUSE requests are placed into a virtqueue and processed by the host. The +response portion of the buffer is filled in by the host and the guest handles +the request completion. + +Mapping /dev/fuse to virtqueues requires solving differences in semantics +between /dev/fuse and virtqueues. Each time the /dev/fuse device is read, the +FUSE client may choose which request to transfer, making it possible to +prioritize certain requests over others. Virtqueues have queue semantics and +it is not possible to change the order of requests that have been enqueued. +This is especially important if the virtqueue becomes full since it is then +impossible to add high priority requests. In order to address this difference, +the virtio-fs device uses a "hiprio" virtqueue specifically for requests that +have priority over normal requests. diff --git a/MAINTAINERS b/MAINTAINERS index 9cbcf167bdd0..953e8c60d1c0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17117,6 +17117,18 @@ S: Supported F: drivers/s390/virtio/ F: arch/s390/include/uapi/asm/virtio-ccw.h +VIRTIO FILE SYSTEM +M: Vivek Goyal +M: Stefan Hajnoczi +M: Miklos Szeredi +L: virtualization@lists.linux-foundation.org +L: linux-fsdevel@vger.kernel.org +W: https://virtio-fs.gitlab.io/ +S: Supported +F: fs/fuse/virtio_fs.c +F: include/uapi/linux/virtio_fs.h +F: Documentation/filesystems/virtiofs.rst + VIRTIO GPU DRIVER M: David Airlie M: Gerd Hoffmann From e16a7cbced7110866dcde84e504909ea85099bbd Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Sep 2019 14:49:53 -0500 Subject: [PATCH 0591/2880] drm/amdgpu: flag navi12 and 14 as experimental for 5.4 We can remove this later as things get closer to launch. Reviewed-by: Xiaojie Yuan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index c68e54a27a2c..5da72ca6f3e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1011,15 +1011,15 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x731B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, {0x1002, 0x731F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10}, /* Navi14 */ - {0x1002, 0x7340, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14}, - {0x1002, 0x7341, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14}, - {0x1002, 0x7347, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14}, + {0x1002, 0x7340, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14|AMD_EXP_HW_SUPPORT}, + {0x1002, 0x7341, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14|AMD_EXP_HW_SUPPORT}, + {0x1002, 0x7347, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14|AMD_EXP_HW_SUPPORT}, /* Renoir */ {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU|AMD_EXP_HW_SUPPORT}, /* Navi12 */ - {0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12}, + {0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12|AMD_EXP_HW_SUPPORT}, {0, 0, 0} }; From 26b1d3b527e7bf3e24b814d617866ac5199ce68d Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 5 Sep 2019 20:53:18 +0200 Subject: [PATCH 0592/2880] drm/atomic: Take the atomic toys away from X MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The -modesetting ddx has a totally broken idea of how atomic works: - doesn't disable old connectors, assuming they get auto-disable like with the legacy setcrtc - assumes ASYNC_FLIP is wired through for the atomic ioctl - not a single call to TEST_ONLY Iow the implementation is a 1:1 translation of legacy ioctls to atomic, which is a) broken b) pointless. We already have bugs in both i915 and amdgpu-DC where this prevents us from enabling neat features. If anyone ever cares about atomic in X we can easily add a new atomic level (req->value == 2) for X to get back the shiny toys. Since these broken versions of -modesetting have been shipping, there's really no other way to get out of this bind. v2: - add an informational dmesg output (Rob, Ajax) - reorder after the DRIVER_ATOMIC check to avoid useless noise (Ilia) - allow req->value > 2 so that X can do another attempt at atomic in the future v3: Go with paranoid, insist that the X should be first (suggested by Rob) Cc: Ilia Mirkin References: https://gitlab.freedesktop.org/xorg/xserver/issues/629 References: https://gitlab.freedesktop.org/xorg/xserver/merge_requests/180 References: abbc0697d5fb ("drm/fb: revert the i915 Actually configure untiled displays from master") Cc: Maarten Lankhorst Reviewed-by: Maarten Lankhorst (v1) Reviewed-by: Nicholas Kazlauskas (v1) Cc: Michel Dänzer Cc: Alex Deucher Cc: Adam Jackson Acked-by: Adam Jackson Cc: Sean Paul Cc: David Airlie Cc: Rob Clark Acked-by: Rob Clark Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20190905185318.31363-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/drm_ioctl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index f675a3bb2c88..fcd728d7cf72 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -336,7 +336,12 @@ drm_setclientcap(struct drm_device *dev, void *data, struct drm_file *file_priv) case DRM_CLIENT_CAP_ATOMIC: if (!drm_core_check_feature(dev, DRIVER_ATOMIC)) return -EOPNOTSUPP; - if (req->value > 1) + /* The modesetting DDX has a totally broken idea of atomic. */ + if (current->comm[0] == 'X' && req->value == 1) { + pr_info("broken atomic modeset userspace detected, disabling atomic\n"); + return -EOPNOTSUPP; + } + if (req->value > 2) return -EINVAL; file_priv->atomic = req->value; file_priv->universal_planes = req->value; From f2cbda2dba11de868759cae9c0d2bab5b8411406 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 3 Sep 2019 21:06:41 +0200 Subject: [PATCH 0593/2880] drm/atomic: Reject FLIP_ASYNC unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's never been wired up. Only userspace that tried to use it (and didn't actually check whether anything works, but hey it builds) is the -modesetting atomic implementation. And we just shut that up. If there's anyone else then we need to silently accept this flag no matter what, and find a new one. Because once a flag is tainted, it's lost. Reviewed-by: Maarten Lankhorst Reviewed-by: Nicholas Kazlauskas Cc: Maarten Lankhorst Cc: Michel Dänzer Cc: Alex Deucher Cc: Adam Jackson Cc: Sean Paul Cc: David Airlie Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20190903190642.32588-2-daniel.vetter@ffwll.ch --- drivers/gpu/drm/drm_atomic_uapi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c index 5a5b42db6f2a..7a26bfb5329c 100644 --- a/drivers/gpu/drm/drm_atomic_uapi.c +++ b/drivers/gpu/drm/drm_atomic_uapi.c @@ -1305,8 +1305,7 @@ int drm_mode_atomic_ioctl(struct drm_device *dev, if (arg->reserved) return -EINVAL; - if ((arg->flags & DRM_MODE_PAGE_FLIP_ASYNC) && - !dev->mode_config.async_page_flip) + if (arg->flags & DRM_MODE_PAGE_FLIP_ASYNC) return -EINVAL; /* can't test and expect an event at the same time. */ From 954dab193d19cbbff8f83b58c9360bf00ddb273c Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 18 Sep 2019 10:37:52 +0800 Subject: [PATCH 0594/2880] io_uring: use kmemdup instead of kmalloc and memcpy Just clean up the code, no function changes. Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0dadbdbead0f..42a684ef578a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2098,13 +2098,11 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (sqe_copy) { struct async_list *list; - memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy)); s->sqe = sqe_copy; - memcpy(&req->submit, s, sizeof(*s)); list = io_async_list_from_sqe(ctx, s->sqe); if (!io_add_to_prev_work(list, req)) { From 5f5ad9ced33621d353be6429c3900f8a526fcae8 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 18 Sep 2019 10:37:53 +0800 Subject: [PATCH 0595/2880] io_uring: fix use-after-free of shadow_req There is a potential dangling pointer problem. we never clean shadow_req, if there are multiple link lists in this series of sqes, then the shadow_req will not reallocate, and continue to use the last one. but in the previous, his memory has been released, thus forming a dangling pointer. let's clean up him and make sure that every new link list can reapply for a new shadow_req. Fixes: 4fe2c963154c ("io_uring: add support for link with drain") Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 42a684ef578a..1621243da1ea 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2357,6 +2357,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, io_queue_link_head(ctx, link, &link->submit, shadow_req, true); link = NULL; + shadow_req = NULL; } prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; @@ -2543,6 +2544,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, io_queue_link_head(ctx, link, &link->submit, shadow_req, force_nonblock); link = NULL; + shadow_req = NULL; } prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; From 6cc47d1d2a9b631f62405f56df651975c7587a97 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Sep 2019 11:18:23 -0600 Subject: [PATCH 0596/2880] io_uring: ensure poll commands clear ->sqe If we end up getting woken in poll (due to a signal), then we may need to punt the poll request to an async worker. When we do that, we look up the list to queue at, deferefencing req->submit.sqe, however that is only set for requests we initially decided to queue async. This fixes a crash with poll command usage and wakeups that need to punt to async context. Fixes: 54a91f3bb9b9 ("io_uring: limit parallelism of buffered writes") Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1621243da1ea..f9139f5bd158 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -446,16 +446,15 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) static inline void io_queue_async_work(struct io_ring_ctx *ctx, struct io_kiocb *req) { - int rw; + int rw = 0; - switch (req->submit.sqe->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - rw = !(req->rw.ki_flags & IOCB_DIRECT); - break; - default: - rw = 0; - break; + if (req->submit.sqe) { + switch (req->submit.sqe->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + rw = !(req->rw.ki_flags & IOCB_DIRECT); + break; + } } queue_work(ctx->sqo_wq[rw], &req->work); @@ -1714,6 +1713,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!poll->file) return -EBADF; + req->submit.sqe = NULL; INIT_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; From a1041c27b64ce744632147e19701c95fed14fab1 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 18 Sep 2019 17:25:52 +0800 Subject: [PATCH 0597/2880] io_uring: fix potential crash issue due to io_get_req failure Sometimes io_get_req will return a NUL, then we need to do the correct error handling, otherwise it will cause the kernel null pointer exception. Fixes: 4fe2c963154c ("io_uring: add support for link with drain") Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index f9139f5bd158..854dedd885fa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2364,12 +2364,15 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } shadow_req->sequence = sqes[i].sequence; } +out: if (unlikely(mm_fault)) { io_cqring_add_event(ctx, sqes[i].sqe->user_data, -EFAULT); @@ -2551,12 +2554,15 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } shadow_req->sequence = s.sequence; } +out: s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; From 9831a90ce64362f8429e8fd23838a9db2cdf7803 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Sep 2019 09:48:55 -0600 Subject: [PATCH 0598/2880] io_uring: use cond_resched() in sqthread If preempt isn't enabled in the kernel, we can run into hang issues with sqthread submissions. Use cond_resched() to play nice instead of cpu_relax(), if we end up starting the loop and not having any events pending for submissions. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 854dedd885fa..05a299e80159 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2438,7 +2438,7 @@ static int io_sq_thread(void *data) * to sleep. */ if (inflight || !time_after(jiffies, timeout)) { - cpu_relax(); + cond_resched(); continue; } From 5262f567987d3c30052b22e78c35c2313d07b230 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Sep 2019 12:26:57 -0600 Subject: [PATCH 0599/2880] io_uring: IORING_OP_TIMEOUT support There's been a few requests for functionality similar to io_getevents() and epoll_wait(), where the user can specify a timeout for waiting on events. I deliberately did not add support for this through the system call initially to avoid overloading the args, but I can see that the use cases for this are valid. This adds support for IORING_OP_TIMEOUT. If a user wants to get woken when waiting for events, simply submit one of these timeout commands with your wait call (or before). This ensures that the application sleeping on the CQ ring waiting for events will get woken. The timeout command is passed in as a pointer to a struct timespec. Timeouts are relative. The timeout command also includes a way to auto-cancel after N events has passed. Signed-off-by: Jens Axboe --- fs/io_uring.c | 149 ++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 2 + 2 files changed, 146 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 05a299e80159..9d8e703bc851 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -200,6 +200,7 @@ struct io_ring_ctx { struct io_uring_sqe *sq_sqes; struct list_head defer_list; + struct list_head timeout_list; } ____cacheline_aligned_in_smp; /* IO offload */ @@ -216,6 +217,7 @@ struct io_ring_ctx { struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; + atomic_t cq_timeouts; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -283,6 +285,11 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_timeout { + struct file *file; + struct hrtimer timer; +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -294,6 +301,7 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; + struct io_timeout timeout; }; struct sqe_submit submit; @@ -313,6 +321,7 @@ struct io_kiocb { #define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ +#define REQ_F_TIMEOUT 1024 /* timeout request */ u64 user_data; u32 result; u32 sequence; @@ -344,6 +353,8 @@ struct io_submit_state { }; static void io_sq_wq_submit_work(struct work_struct *work); +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res); static void __io_free_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -400,26 +411,30 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); INIT_LIST_HEAD(&ctx->defer_list); + INIT_LIST_HEAD(&ctx->timeout_list); return ctx; } static inline bool io_sequence_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) + /* timeout requests always honor sequence */ + if (!(req->flags & REQ_F_TIMEOUT) && + (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; } -static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) +static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx, + struct list_head *list) { struct io_kiocb *req; - if (list_empty(&ctx->defer_list)) + if (list_empty(list)) return NULL; - req = list_first_entry(&ctx->defer_list, struct io_kiocb, list); + req = list_first_entry(list, struct io_kiocb, list); if (!io_sequence_defer(ctx, req)) { list_del_init(&req->list); return req; @@ -428,6 +443,16 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) return NULL; } +static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) +{ + return __io_get_deferred_req(ctx, &ctx->defer_list); +} + +static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) +{ + return __io_get_deferred_req(ctx, &ctx->timeout_list); +} + static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; @@ -460,10 +485,36 @@ static inline void io_queue_async_work(struct io_ring_ctx *ctx, queue_work(ctx->sqo_wq[rw], &req->work); } +static void io_kill_timeout(struct io_kiocb *req) +{ + int ret; + + ret = hrtimer_try_to_cancel(&req->timeout.timer); + if (ret != -1) { + atomic_inc(&req->ctx->cq_timeouts); + list_del(&req->list); + io_cqring_fill_event(req->ctx, req->user_data, 0); + __io_free_req(req); + } +} + +static void io_kill_timeouts(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req, *tmp; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + io_kill_timeout(req); + spin_unlock_irq(&ctx->completion_lock); +} + static void io_commit_cqring(struct io_ring_ctx *ctx) { struct io_kiocb *req; + while ((req = io_get_timeout_req(ctx)) != NULL) + io_kill_timeout(req); + __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { @@ -1765,6 +1816,81 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ipt.error; } +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) +{ + struct io_ring_ctx *ctx; + struct io_kiocb *req; + unsigned long flags; + + req = container_of(timer, struct io_kiocb, timeout.timer); + ctx = req->ctx; + atomic_inc(&ctx->cq_timeouts); + + spin_lock_irqsave(&ctx->completion_lock, flags); + list_del(&req->list); + + io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + + io_put_req(req); + return HRTIMER_NORESTART; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned count, req_dist, tail_index; + struct io_ring_ctx *ctx = req->ctx; + struct list_head *entry; + struct timespec ts; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || + sqe->len != 1) + return -EINVAL; + if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr, + sizeof(ts))) + return -EFAULT; + + /* + * sqe->off holds how many events that need to occur for this + * timeout event to be satisfied. + */ + count = READ_ONCE(sqe->off); + if (!count) + count = 1; + + req->sequence = ctx->cached_sq_head + count - 1; + req->flags |= REQ_F_TIMEOUT; + + /* + * Insertion sort, ensuring the first entry in the list is always + * the one we need first. + */ + tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped; + req_dist = req->sequence - tail_index; + spin_lock_irq(&ctx->completion_lock); + list_for_each_prev(entry, &ctx->timeout_list) { + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + unsigned dist; + + dist = nxt->sequence - tail_index; + if (req_dist >= dist) + break; + } + list_add(&req->list, entry); + spin_unlock_irq(&ctx->completion_lock); + + hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + req->timeout.timer.function = io_timeout_fn; + hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts), + HRTIMER_MODE_REL); + return 0; +} + static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1842,6 +1968,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_RECVMSG: ret = io_recvmsg(req, s->sqe, force_nonblock); break; + case IORING_OP_TIMEOUT: + ret = io_timeout(req, s->sqe); + break; default: ret = -EINVAL; break; @@ -2599,6 +2728,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { struct io_rings *rings = ctx->rings; + unsigned nr_timeouts; int ret; if (io_cqring_events(rings) >= min_events) @@ -2617,7 +2747,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events); + nr_timeouts = atomic_read(&ctx->cq_timeouts); + /* + * Return if we have enough events, or if a timeout occured since + * we started waiting. For timeouts, we always want to return to + * userspace. + */ + ret = wait_event_interruptible(ctx->wait, + io_cqring_events(rings) >= min_events || + atomic_read(&ctx->cq_timeouts) != nr_timeouts); restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; @@ -3288,6 +3426,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_kill_timeouts(ctx); io_poll_remove_all(ctx); io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 96ee9d94b73e..ea57526a5b89 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -28,6 +28,7 @@ struct io_uring_sqe { __u16 poll_events; __u32 sync_range_flags; __u32 msg_flags; + __u32 timeout_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -61,6 +62,7 @@ struct io_uring_sqe { #define IORING_OP_SYNC_FILE_RANGE 8 #define IORING_OP_SENDMSG 9 #define IORING_OP_RECVMSG 10 +#define IORING_OP_TIMEOUT 11 /* * sqe->fsync_flags From 4d85f45c73a22bc0ee900c7505b7210a87a7966d Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 3 Sep 2019 21:06:42 +0200 Subject: [PATCH 0600/2880] drm/atomic: Rename crtc_state->pageflip_flags to async_flip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's the only flag anyone actually cares about. Plus if we're unlucky, the atomic ioctl might need a different flag for async flips. So better to abstract this away from the uapi a bit. Reviewed-by: Maarten Lankhorst Reviewed-by: Nicholas Kazlauskas Cc: Maarten Lankhorst Cc: Michel Dänzer Cc: Alex Deucher Cc: Adam Jackson Cc: Sean Paul Cc: David Airlie Signed-off-by: Daniel Vetter Cc: Maxime Ripard Cc: Daniel Vetter Cc: Nicholas Kazlauskas Cc: Leo Li Cc: Harry Wentland Cc: David Francis Cc: Mario Kleiner Cc: Bhawanpreet Lakha Cc: Ben Skeggs Cc: "Christian König" Cc: Ilia Mirkin Cc: Sam Ravnborg Cc: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20190903190642.32588-3-daniel.vetter@ffwll.ch --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 5 ++--- drivers/gpu/drm/drm_atomic_helper.c | 2 +- drivers/gpu/drm/drm_atomic_state_helper.c | 2 +- drivers/gpu/drm/nouveau/dispnv50/wndw.c | 4 ++-- include/drm/drm_crtc.h | 8 ++++---- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 0a71ed1e7762..2f0ef0820f00 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -5756,8 +5756,7 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, * change FB pitch, DCC state, rotation or mirroing. */ bundle->flip_addrs[planes_count].flip_immediate = - (crtc->state->pageflip_flags & - DRM_MODE_PAGE_FLIP_ASYNC) != 0 && + crtc->state->async_flip && acrtc_state->update_type == UPDATE_TYPE_FAST; timestamp_ns = ktime_get_ns(); @@ -6334,7 +6333,7 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) amdgpu_dm_enable_crtc_interrupts(dev, state, true); for_each_new_crtc_in_state(state, crtc, new_crtc_state, j) - if (new_crtc_state->pageflip_flags & DRM_MODE_PAGE_FLIP_ASYNC) + if (new_crtc_state->async_flip) wait_for_vblank = false; /* update planes when needed per crtc*/ diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index aa16ea17ff9b..3a67f63c3146 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -3275,7 +3275,7 @@ static int page_flip_common(struct drm_atomic_state *state, return PTR_ERR(crtc_state); crtc_state->event = event; - crtc_state->pageflip_flags = flags; + crtc_state->async_flip = flags & DRM_MODE_PAGE_FLIP_ASYNC; plane_state = drm_atomic_get_plane_state(state, plane); if (IS_ERR(plane_state)) diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c index 46dc264a248b..d0a937fb0c56 100644 --- a/drivers/gpu/drm/drm_atomic_state_helper.c +++ b/drivers/gpu/drm/drm_atomic_state_helper.c @@ -128,7 +128,7 @@ void __drm_atomic_helper_crtc_duplicate_state(struct drm_crtc *crtc, state->zpos_changed = false; state->commit = NULL; state->event = NULL; - state->pageflip_flags = 0; + state->async_flip = false; /* Self refresh should be canceled when a new update is available */ state->active = drm_atomic_crtc_effectively_active(state); diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c index 2db029371c91..5193b6257061 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c @@ -267,7 +267,7 @@ nv50_wndw_atomic_check_acquire(struct nv50_wndw *wndw, bool modeset, asyw->image.pitch[0] = fb->base.pitches[0]; } - if (!(asyh->state.pageflip_flags & DRM_MODE_PAGE_FLIP_ASYNC)) + if (!asyh->state.async_flip) asyw->image.interval = 1; else asyw->image.interval = 0; @@ -383,7 +383,7 @@ nv50_wndw_atomic_check_lut(struct nv50_wndw *wndw, } /* Can't do an immediate flip while changing the LUT. */ - asyh->state.pageflip_flags &= ~DRM_MODE_PAGE_FLIP_ASYNC; + asyh->state.async_flip = false; } static int diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index 7d14c11bdc0a..c4528eb5d168 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -285,12 +285,12 @@ struct drm_crtc_state { u32 target_vblank; /** - * @pageflip_flags: + * @async_flip: * - * DRM_MODE_PAGE_FLIP_* flags, as passed to the page flip ioctl. - * Zero in any other case. + * This is set when DRM_MODE_PAGE_FLIP_ASYNC is set in the legacy + * PAGE_FLIP IOCTL. It's not wired up for the atomic IOCTL itself yet. */ - u32 pageflip_flags; + bool async_flip; /** * @vrr_enabled: From 947ec14c7369a87625f03abaab5b3f4d33ac73ba Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 18 Sep 2019 15:02:38 +0900 Subject: [PATCH 0601/2880] ASoC: rsnd: do error check after rsnd_channel_normalization() SSI need to use rsnd_channel_normalization() for TDM-split mode, thus, channel check need to do after that. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/874l1aw39d.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/sh/rcar/ssi.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c index f6a7466622ea..fc5d089868df 100644 --- a/sound/soc/sh/rcar/ssi.c +++ b/sound/soc/sh/rcar/ssi.c @@ -286,6 +286,11 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, if (rsnd_ssi_is_multi_slave(mod, io)) return 0; + if (rsnd_runtime_is_tdm_split(io)) + chan = rsnd_io_converted_chan(io); + + chan = rsnd_channel_normalization(chan); + if (ssi->usrcnt > 0) { if (ssi->rate != rate) { dev_err(dev, "SSI parent/child should use same rate\n"); @@ -300,11 +305,6 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, return 0; } - if (rsnd_runtime_is_tdm_split(io)) - chan = rsnd_io_converted_chan(io); - - chan = rsnd_channel_normalization(chan); - main_rate = rsnd_ssi_clk_query(rdai, rate, chan, &idx); if (!main_rate) { dev_err(dev, "unsupported clock rate\n"); From a62a8ef9d97da23762a588592c8b8eb50a8deb6a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 12 Jun 2018 09:41:17 +0100 Subject: [PATCH 0602/2880] virtio-fs: add virtiofs filesystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a basic file system module for virtio-fs. This does not yet contain shared data support between host and guest or metadata coherency speedups. However it is already significantly faster than virtio-9p. Design Overview =============== With the goal of designing something with better performance and local file system semantics, a bunch of ideas were proposed. - Use fuse protocol (instead of 9p) for communication between guest and host. Guest kernel will be fuse client and a fuse server will run on host to serve the requests. - For data access inside guest, mmap portion of file in QEMU address space and guest accesses this memory using dax. That way guest page cache is bypassed and there is only one copy of data (on host). This will also enable mmap(MAP_SHARED) between guests. - For metadata coherency, there is a shared memory region which contains version number associated with metadata and any guest changing metadata updates version number and other guests refresh metadata on next access. This is yet to be implemented. How virtio-fs differs from existing approaches ============================================== The unique idea behind virtio-fs is to take advantage of the co-location of the virtual machine and hypervisor to avoid communication (vmexits). DAX allows file contents to be accessed without communication with the hypervisor. The shared memory region for metadata avoids communication in the common case where metadata is unchanged. By replacing expensive communication with cheaper shared memory accesses, we expect to achieve better performance than approaches based on network file system protocols. In addition, this also makes it easier to achieve local file system semantics (coherency). These techniques are not applicable to network file system protocols since the communications channel is bypassed by taking advantage of shared memory on a local machine. This is why we decided to build virtio-fs rather than focus on 9P or NFS. Caching Modes ============= Like virtio-9p, different caching modes are supported which determine the coherency level as well. The “cache=FOO” and “writeback” options control the level of coherence between the guest and host filesystems. - cache=none metadata, data and pathname lookup are not cached in guest. They are always fetched from host and any changes are immediately pushed to host. - cache=always metadata, data and pathname lookup are cached in guest and never expire. - cache=auto metadata and pathname lookup cache expires after a configured amount of time (default is 1 second). Data is cached while the file is open (close to open consistency). - writeback/no_writeback These options control the writeback strategy. If writeback is disabled, then normal writes will immediately be synchronized with the host fs. If writeback is enabled, then writes may be cached in the guest until the file is closed or an fsync(2) performed. This option has no effect on mmap-ed writes or writes going through the DAX mechanism. Signed-off-by: Stefan Hajnoczi Signed-off-by: Vivek Goyal Acked-by: Michael S. Tsirkin Signed-off-by: Miklos Szeredi --- fs/fuse/Kconfig | 11 + fs/fuse/Makefile | 1 + fs/fuse/fuse_i.h | 9 + fs/fuse/inode.c | 4 + fs/fuse/virtio_fs.c | 1195 +++++++++++++++++++++++++++++++ include/uapi/linux/virtio_fs.h | 19 + include/uapi/linux/virtio_ids.h | 1 + 7 files changed, 1240 insertions(+) create mode 100644 fs/fuse/virtio_fs.c create mode 100644 include/uapi/linux/virtio_fs.h diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 24fc5a5c1b97..0635cba19971 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -27,3 +27,14 @@ config CUSE If you want to develop or use a userspace character device based on CUSE, answer Y or M. + +config VIRTIO_FS + tristate "Virtio Filesystem" + depends on FUSE_FS + select VIRTIO + help + The Virtio Filesystem allows guests to mount file systems from the + host. + + If you want to share files between guests or with the host, answer Y + or M. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 9485019c2a14..6419a2b3510d 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o +obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fc89cb40e874..956aeaf961ae 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -353,6 +353,10 @@ struct fuse_req { /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; +#if IS_ENABLED(CONFIG_VIRTIO_FS) + /** virtio-fs's physically contiguous buffer for in and out args */ + void *argbuf; +#endif }; struct fuse_iqueue; @@ -383,6 +387,11 @@ struct fuse_iqueue_ops { */ void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) __releases(fiq->lock); + + /** + * Clean up when fuse_iqueue is destroyed + */ + void (*release)(struct fuse_iqueue *fiq); }; /** /dev/fuse input queue operations */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 10d193b24fb8..3d598a5bb5b5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -630,6 +630,10 @@ EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { + struct fuse_iqueue *fiq = &fc->iq; + + if (fiq->ops->release) + fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); fc->release(fc); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c new file mode 100644 index 000000000000..6af3f131e468 --- /dev/null +++ b/fs/fuse/virtio_fs.c @@ -0,0 +1,1195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio-fs: Virtio Filesystem + * Copyright (C) 2018 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "fuse_i.h" + +/* List of virtio-fs device instances and a lock for the list. Also provides + * mutual exclusion in device removal and mounting path + */ +static DEFINE_MUTEX(virtio_fs_mutex); +static LIST_HEAD(virtio_fs_instances); + +enum { + VQ_HIPRIO, + VQ_REQUEST +}; + +/* Per-virtqueue state */ +struct virtio_fs_vq { + spinlock_t lock; + struct virtqueue *vq; /* protected by ->lock */ + struct work_struct done_work; + struct list_head queued_reqs; + struct delayed_work dispatch_work; + struct fuse_dev *fud; + bool connected; + long in_flight; + char name[24]; +} ____cacheline_aligned_in_smp; + +/* A virtio-fs device instance */ +struct virtio_fs { + struct kref refcount; + struct list_head list; /* on virtio_fs_instances */ + char *tag; + struct virtio_fs_vq *vqs; + unsigned int nvqs; /* number of virtqueues */ + unsigned int num_request_queues; /* number of request queues */ +}; + +struct virtio_fs_forget { + struct fuse_in_header ih; + struct fuse_forget_in arg; + /* This request can be temporarily queued on virt queue */ + struct list_head list; +}; + +static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) +{ + struct virtio_fs *fs = vq->vdev->priv; + + return &fs->vqs[vq->index]; +} + +static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) +{ + return &vq_to_fsvq(vq)->fud->pq; +} + +static void release_virtio_fs_obj(struct kref *ref) +{ + struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); + + kfree(vfs->vqs); + kfree(vfs); +} + +/* Make sure virtiofs_mutex is held */ +static void virtio_fs_put(struct virtio_fs *fs) +{ + kref_put(&fs->refcount, release_virtio_fs_obj); +} + +static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) +{ + struct virtio_fs *vfs = fiq->priv; + + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(vfs); + mutex_unlock(&virtio_fs_mutex); +} + +static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) +{ + WARN_ON(fsvq->in_flight < 0); + + /* Wait for in flight requests to finish.*/ + while (1) { + spin_lock(&fsvq->lock); + if (!fsvq->in_flight) { + spin_unlock(&fsvq->lock); + break; + } + spin_unlock(&fsvq->lock); + /* TODO use completion instead of timeout */ + usleep_range(1000, 2000); + } + + flush_work(&fsvq->done_work); + flush_delayed_work(&fsvq->dispatch_work); +} + +static inline void drain_hiprio_queued_reqs(struct virtio_fs_vq *fsvq) +{ + struct virtio_fs_forget *forget; + + spin_lock(&fsvq->lock); + while (1) { + forget = list_first_entry_or_null(&fsvq->queued_reqs, + struct virtio_fs_forget, list); + if (!forget) + break; + list_del(&forget->list); + kfree(forget); + } + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + if (i == VQ_HIPRIO) + drain_hiprio_queued_reqs(fsvq); + + virtio_fs_drain_queue(fsvq); + } +} + +static void virtio_fs_start_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + spin_lock(&fsvq->lock); + fsvq->connected = true; + spin_unlock(&fsvq->lock); + } +} + +/* Add a new instance to the list or return -EEXIST if tag name exists*/ +static int virtio_fs_add_instance(struct virtio_fs *fs) +{ + struct virtio_fs *fs2; + bool duplicate = false; + + mutex_lock(&virtio_fs_mutex); + + list_for_each_entry(fs2, &virtio_fs_instances, list) { + if (strcmp(fs->tag, fs2->tag) == 0) + duplicate = true; + } + + if (!duplicate) + list_add_tail(&fs->list, &virtio_fs_instances); + + mutex_unlock(&virtio_fs_mutex); + + if (duplicate) + return -EEXIST; + return 0; +} + +/* Return the virtio_fs with a given tag, or NULL */ +static struct virtio_fs *virtio_fs_find_instance(const char *tag) +{ + struct virtio_fs *fs; + + mutex_lock(&virtio_fs_mutex); + + list_for_each_entry(fs, &virtio_fs_instances, list) { + if (strcmp(fs->tag, tag) == 0) { + kref_get(&fs->refcount); + goto found; + } + } + + fs = NULL; /* not found */ + +found: + mutex_unlock(&virtio_fs_mutex); + + return fs; +} + +static void virtio_fs_free_devs(struct virtio_fs *fs) +{ + unsigned int i; + + for (i = 0; i < fs->nvqs; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + if (!fsvq->fud) + continue; + + fuse_dev_free(fsvq->fud); + fsvq->fud = NULL; + } +} + +/* Read filesystem name from virtio config into fs->tag (must kfree()). */ +static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) +{ + char tag_buf[sizeof_field(struct virtio_fs_config, tag)]; + char *end; + size_t len; + + virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag), + &tag_buf, sizeof(tag_buf)); + end = memchr(tag_buf, '\0', sizeof(tag_buf)); + if (end == tag_buf) + return -EINVAL; /* empty tag */ + if (!end) + end = &tag_buf[sizeof(tag_buf)]; + + len = end - tag_buf; + fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL); + if (!fs->tag) + return -ENOMEM; + memcpy(fs->tag, tag_buf, len); + fs->tag[len] = '\0'; + return 0; +} + +/* Work function for hiprio completion */ +static void virtio_fs_hiprio_done_work(struct work_struct *work) +{ + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + done_work); + struct virtqueue *vq = fsvq->vq; + + /* Free completed FUSE_FORGET requests */ + spin_lock(&fsvq->lock); + do { + unsigned int len; + void *req; + + virtqueue_disable_cb(vq); + + while ((req = virtqueue_get_buf(vq, &len)) != NULL) { + kfree(req); + fsvq->in_flight--; + } + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_dummy_dispatch_work(struct work_struct *work) +{ +} + +static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) +{ + struct virtio_fs_forget *forget; + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + dispatch_work.work); + struct virtqueue *vq = fsvq->vq; + struct scatterlist sg; + struct scatterlist *sgs[] = {&sg}; + bool notify; + int ret; + + pr_debug("virtio-fs: worker %s called.\n", __func__); + while (1) { + spin_lock(&fsvq->lock); + forget = list_first_entry_or_null(&fsvq->queued_reqs, + struct virtio_fs_forget, list); + if (!forget) { + spin_unlock(&fsvq->lock); + return; + } + + list_del(&forget->list); + if (!fsvq->connected) { + spin_unlock(&fsvq->lock); + kfree(forget); + continue; + } + + sg_init_one(&sg, forget, sizeof(*forget)); + + /* Enqueue the request */ + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", + ret); + list_add_tail(&forget->list, + &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + kfree(forget); + } + spin_unlock(&fsvq->lock); + return; + } + + fsvq->in_flight++; + notify = virtqueue_kick_prepare(vq); + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + pr_debug("virtio-fs: worker %s dispatched one forget request.\n", + __func__); + } +} + +/* Allocate and copy args into req->argbuf */ +static int copy_args_to_argbuf(struct fuse_req *req) +{ + struct fuse_args *args = req->args; + unsigned int offset = 0; + unsigned int num_in; + unsigned int num_out; + unsigned int len; + unsigned int i; + + num_in = args->in_numargs - args->in_pages; + num_out = args->out_numargs - args->out_pages; + len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + + fuse_len_args(num_out, args->out_args); + + req->argbuf = kmalloc(len, GFP_ATOMIC); + if (!req->argbuf) + return -ENOMEM; + + for (i = 0; i < num_in; i++) { + memcpy(req->argbuf + offset, + args->in_args[i].value, + args->in_args[i].size); + offset += args->in_args[i].size; + } + + return 0; +} + +/* Copy args out of and free req->argbuf */ +static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) +{ + unsigned int remaining; + unsigned int offset; + unsigned int num_in; + unsigned int num_out; + unsigned int i; + + remaining = req->out.h.len - sizeof(req->out.h); + num_in = args->in_numargs - args->in_pages; + num_out = args->out_numargs - args->out_pages; + offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args); + + for (i = 0; i < num_out; i++) { + unsigned int argsize = args->out_args[i].size; + + if (args->out_argvar && + i == args->out_numargs - 1 && + argsize > remaining) { + argsize = remaining; + } + + memcpy(args->out_args[i].value, req->argbuf + offset, argsize); + offset += argsize; + + if (i != args->out_numargs - 1) + remaining -= argsize; + } + + /* Store the actual size of the variable-length arg */ + if (args->out_argvar) + args->out_args[args->out_numargs - 1].size = remaining; + + kfree(req->argbuf); + req->argbuf = NULL; +} + +/* Work function for request completion */ +static void virtio_fs_requests_done_work(struct work_struct *work) +{ + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + done_work); + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct fuse_conn *fc = fsvq->fud->fc; + struct virtqueue *vq = fsvq->vq; + struct fuse_req *req; + struct fuse_args_pages *ap; + struct fuse_req *next; + struct fuse_args *args; + unsigned int len, i, thislen; + struct page *page; + LIST_HEAD(reqs); + + /* Collect completed requests off the virtqueue */ + spin_lock(&fsvq->lock); + do { + virtqueue_disable_cb(vq); + + while ((req = virtqueue_get_buf(vq, &len)) != NULL) { + spin_lock(&fpq->lock); + list_move_tail(&req->list, &reqs); + spin_unlock(&fpq->lock); + } + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + spin_unlock(&fsvq->lock); + + /* End requests */ + list_for_each_entry_safe(req, next, &reqs, list) { + /* + * TODO verify that server properly follows FUSE protocol + * (oh.uniq, oh.len) + */ + args = req->args; + copy_args_from_argbuf(args, req); + + if (args->out_pages && args->page_zeroing) { + len = args->out_args[args->out_numargs - 1].size; + ap = container_of(args, typeof(*ap), args); + for (i = 0; i < ap->num_pages; i++) { + thislen = ap->descs[i].length; + if (len < thislen) { + WARN_ON(ap->descs[i].offset); + page = ap->pages[i]; + zero_user_segment(page, len, thislen); + len = 0; + } else { + len -= thislen; + } + } + } + + spin_lock(&fpq->lock); + clear_bit(FR_SENT, &req->flags); + list_del_init(&req->list); + spin_unlock(&fpq->lock); + + fuse_request_end(fc, req); + spin_lock(&fsvq->lock); + fsvq->in_flight--; + spin_unlock(&fsvq->lock); + } +} + +/* Virtqueue interrupt handler */ +static void virtio_fs_vq_done(struct virtqueue *vq) +{ + struct virtio_fs_vq *fsvq = vq_to_fsvq(vq); + + dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name); + + schedule_work(&fsvq->done_work); +} + +/* Initialize virtqueues */ +static int virtio_fs_setup_vqs(struct virtio_device *vdev, + struct virtio_fs *fs) +{ + struct virtqueue **vqs; + vq_callback_t **callbacks; + const char **names; + unsigned int i; + int ret = 0; + + virtio_cread(vdev, struct virtio_fs_config, num_request_queues, + &fs->num_request_queues); + if (fs->num_request_queues == 0) + return -EINVAL; + + fs->nvqs = 1 + fs->num_request_queues; + fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); + if (!fs->vqs) + return -ENOMEM; + + vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); + callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), + GFP_KERNEL); + names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); + if (!vqs || !callbacks || !names) { + ret = -ENOMEM; + goto out; + } + + callbacks[VQ_HIPRIO] = virtio_fs_vq_done; + snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), + "hiprio"); + names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; + INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); + INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); + INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, + virtio_fs_hiprio_dispatch_work); + spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); + + /* Initialize the requests virtqueues */ + for (i = VQ_REQUEST; i < fs->nvqs; i++) { + spin_lock_init(&fs->vqs[i].lock); + INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); + INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, + virtio_fs_dummy_dispatch_work); + INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); + snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), + "requests.%u", i - VQ_REQUEST); + callbacks[i] = virtio_fs_vq_done; + names[i] = fs->vqs[i].name; + } + + ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL); + if (ret < 0) + goto out; + + for (i = 0; i < fs->nvqs; i++) + fs->vqs[i].vq = vqs[i]; + + virtio_fs_start_all_queues(fs); +out: + kfree(names); + kfree(callbacks); + kfree(vqs); + if (ret) + kfree(fs->vqs); + return ret; +} + +/* Free virtqueues (device must already be reset) */ +static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, + struct virtio_fs *fs) +{ + vdev->config->del_vqs(vdev); +} + +static int virtio_fs_probe(struct virtio_device *vdev) +{ + struct virtio_fs *fs; + int ret; + + fs = kzalloc(sizeof(*fs), GFP_KERNEL); + if (!fs) + return -ENOMEM; + kref_init(&fs->refcount); + vdev->priv = fs; + + ret = virtio_fs_read_tag(vdev, fs); + if (ret < 0) + goto out; + + ret = virtio_fs_setup_vqs(vdev, fs); + if (ret < 0) + goto out; + + /* TODO vq affinity */ + + /* Bring the device online in case the filesystem is mounted and + * requests need to be sent before we return. + */ + virtio_device_ready(vdev); + + ret = virtio_fs_add_instance(fs); + if (ret < 0) + goto out_vqs; + + return 0; + +out_vqs: + vdev->config->reset(vdev); + virtio_fs_cleanup_vqs(vdev, fs); + +out: + vdev->priv = NULL; + kfree(fs); + return ret; +} + +static void virtio_fs_stop_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + spin_lock(&fsvq->lock); + fsvq->connected = false; + spin_unlock(&fsvq->lock); + } +} + +static void virtio_fs_remove(struct virtio_device *vdev) +{ + struct virtio_fs *fs = vdev->priv; + + mutex_lock(&virtio_fs_mutex); + /* This device is going away. No one should get new reference */ + list_del_init(&fs->list); + virtio_fs_stop_all_queues(fs); + virtio_fs_drain_all_queues(fs); + vdev->config->reset(vdev); + virtio_fs_cleanup_vqs(vdev, fs); + + vdev->priv = NULL; + /* Put device reference on virtio_fs object */ + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); +} + +#ifdef CONFIG_PM_SLEEP +static int virtio_fs_freeze(struct virtio_device *vdev) +{ + /* TODO need to save state here */ + pr_warn("virtio-fs: suspend/resume not yet supported\n"); + return -EOPNOTSUPP; +} + +static int virtio_fs_restore(struct virtio_device *vdev) +{ + /* TODO need to restore state here */ + return 0; +} +#endif /* CONFIG_PM_SLEEP */ + +const static struct virtio_device_id id_table[] = { + { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, + {}, +}; + +const static unsigned int feature_table[] = {}; + +static struct virtio_driver virtio_fs_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .feature_table = feature_table, + .feature_table_size = ARRAY_SIZE(feature_table), + .probe = virtio_fs_probe, + .remove = virtio_fs_remove, +#ifdef CONFIG_PM_SLEEP + .freeze = virtio_fs_freeze, + .restore = virtio_fs_restore, +#endif +}; + +static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + struct fuse_forget_link *link; + struct virtio_fs_forget *forget; + struct scatterlist sg; + struct scatterlist *sgs[] = {&sg}; + struct virtio_fs *fs; + struct virtqueue *vq; + struct virtio_fs_vq *fsvq; + bool notify; + u64 unique; + int ret; + + link = fuse_dequeue_forget(fiq, 1, NULL); + unique = fuse_get_unique(fiq); + + fs = fiq->priv; + fsvq = &fs->vqs[VQ_HIPRIO]; + spin_unlock(&fiq->lock); + + /* Allocate a buffer for the request */ + forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); + + forget->ih = (struct fuse_in_header){ + .opcode = FUSE_FORGET, + .nodeid = link->forget_one.nodeid, + .unique = unique, + .len = sizeof(*forget), + }; + forget->arg = (struct fuse_forget_in){ + .nlookup = link->forget_one.nlookup, + }; + + sg_init_one(&sg, forget, sizeof(*forget)); + + /* Enqueue the request */ + spin_lock(&fsvq->lock); + + if (!fsvq->connected) { + kfree(forget); + spin_unlock(&fsvq->lock); + goto out; + } + + vq = fsvq->vq; + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n", + ret); + list_add_tail(&forget->list, &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + kfree(forget); + } + spin_unlock(&fsvq->lock); + goto out; + } + + fsvq->in_flight++; + notify = virtqueue_kick_prepare(vq); + + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); +out: + kfree(link); +} + +static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + /* + * TODO interrupts. + * + * Normal fs operations on a local filesystems aren't interruptible. + * Exceptions are blocking lock operations; for example fcntl(F_SETLKW) + * with shared lock between host and guest. + */ + spin_unlock(&fiq->lock); +} + +/* Return the number of scatter-gather list elements required */ +static unsigned int sg_count_fuse_req(struct fuse_req *req) +{ + struct fuse_args *args = req->args; + struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); + unsigned int total_sgs = 1 /* fuse_in_header */; + + if (args->in_numargs - args->in_pages) + total_sgs += 1; + + if (args->in_pages) + total_sgs += ap->num_pages; + + if (!test_bit(FR_ISREPLY, &req->flags)) + return total_sgs; + + total_sgs += 1 /* fuse_out_header */; + + if (args->out_numargs - args->out_pages) + total_sgs += 1; + + if (args->out_pages) + total_sgs += ap->num_pages; + + return total_sgs; +} + +/* Add pages to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_pages(struct scatterlist *sg, + struct page **pages, + struct fuse_page_desc *page_descs, + unsigned int num_pages, + unsigned int total_len) +{ + unsigned int i; + unsigned int this_len; + + for (i = 0; i < num_pages && total_len; i++) { + sg_init_table(&sg[i], 1); + this_len = min(page_descs[i].length, total_len); + sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset); + total_len -= this_len; + } + + return i; +} + +/* Add args to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_args(struct scatterlist *sg, + struct fuse_req *req, + struct fuse_arg *args, + unsigned int numargs, + bool argpages, + void *argbuf, + unsigned int *len_used) +{ + struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); + unsigned int total_sgs = 0; + unsigned int len; + + len = fuse_len_args(numargs - argpages, args); + if (len) + sg_init_one(&sg[total_sgs++], argbuf, len); + + if (argpages) + total_sgs += sg_init_fuse_pages(&sg[total_sgs], + ap->pages, ap->descs, + ap->num_pages, + args[numargs - 1].size); + + if (len_used) + *len_used = len; + + return total_sgs; +} + +/* Add a request to a virtqueue and kick the device */ +static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, + struct fuse_req *req) +{ + /* requests need at least 4 elements */ + struct scatterlist *stack_sgs[6]; + struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; + struct scatterlist **sgs = stack_sgs; + struct scatterlist *sg = stack_sg; + struct virtqueue *vq; + struct fuse_args *args = req->args; + unsigned int argbuf_used = 0; + unsigned int out_sgs = 0; + unsigned int in_sgs = 0; + unsigned int total_sgs; + unsigned int i; + int ret; + bool notify; + + /* Does the sglist fit on the stack? */ + total_sgs = sg_count_fuse_req(req); + if (total_sgs > ARRAY_SIZE(stack_sgs)) { + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); + sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); + if (!sgs || !sg) { + ret = -ENOMEM; + goto out; + } + } + + /* Use a bounce buffer since stack args cannot be mapped */ + ret = copy_args_to_argbuf(req); + if (ret < 0) + goto out; + + /* Request elements */ + sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); + out_sgs += sg_init_fuse_args(&sg[out_sgs], req, + (struct fuse_arg *)args->in_args, + args->in_numargs, args->in_pages, + req->argbuf, &argbuf_used); + + /* Reply elements */ + if (test_bit(FR_ISREPLY, &req->flags)) { + sg_init_one(&sg[out_sgs + in_sgs++], + &req->out.h, sizeof(req->out.h)); + in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, + args->out_args, args->out_numargs, + args->out_pages, + req->argbuf + argbuf_used, NULL); + } + + WARN_ON(out_sgs + in_sgs != total_sgs); + + for (i = 0; i < total_sgs; i++) + sgs[i] = &sg[i]; + + spin_lock(&fsvq->lock); + + if (!fsvq->connected) { + spin_unlock(&fsvq->lock); + ret = -ENOTCONN; + goto out; + } + + vq = fsvq->vq; + ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); + if (ret < 0) { + spin_unlock(&fsvq->lock); + goto out; + } + + fsvq->in_flight++; + notify = virtqueue_kick_prepare(vq); + + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + +out: + if (ret < 0 && req->argbuf) { + kfree(req->argbuf); + req->argbuf = NULL; + } + if (sgs != stack_sgs) { + kfree(sgs); + kfree(sg); + } + + return ret; +} + +static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ + struct virtio_fs *fs; + struct fuse_conn *fc; + struct fuse_req *req; + struct fuse_pqueue *fpq; + int ret; + + WARN_ON(list_empty(&fiq->pending)); + req = list_last_entry(&fiq->pending, struct fuse_req, list); + clear_bit(FR_PENDING, &req->flags); + list_del_init(&req->list); + WARN_ON(!list_empty(&fiq->pending)); + spin_unlock(&fiq->lock); + + fs = fiq->priv; + fc = fs->vqs[queue_id].fud->fc; + + pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", + __func__, req->in.h.opcode, req->in.h.unique, + req->in.h.nodeid, req->in.h.len, + fuse_len_args(req->args->out_numargs, req->args->out_args)); + + fpq = &fs->vqs[queue_id].fud->pq; + spin_lock(&fpq->lock); + if (!fpq->connected) { + spin_unlock(&fpq->lock); + req->out.h.error = -ENODEV; + pr_err("virtio-fs: %s disconnected\n", __func__); + fuse_request_end(fc, req); + return; + } + list_add_tail(&req->list, fpq->processing); + spin_unlock(&fpq->lock); + set_bit(FR_SENT, &req->flags); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + +retry: + ret = virtio_fs_enqueue_req(&fs->vqs[queue_id], req); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + /* Virtqueue full. Retry submission */ + /* TODO use completion instead of timeout */ + usleep_range(20, 30); + goto retry; + } + req->out.h.error = ret; + pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); + spin_lock(&fpq->lock); + clear_bit(FR_SENT, &req->flags); + list_del_init(&req->list); + spin_unlock(&fpq->lock); + fuse_request_end(fc, req); + return; + } +} + +const static struct fuse_iqueue_ops virtio_fs_fiq_ops = { + .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, + .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, + .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, + .release = virtio_fs_fiq_release, +}; + +static int virtio_fs_fill_super(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct virtio_fs *fs = fc->iq.priv; + unsigned int i; + int err; + struct fuse_fs_context ctx = { + .rootmode = S_IFDIR, + .default_permissions = 1, + .allow_other = 1, + .max_read = UINT_MAX, + .blksize = 512, + .destroy = true, + .no_control = true, + .no_force_umount = true, + }; + + mutex_lock(&virtio_fs_mutex); + + /* After holding mutex, make sure virtiofs device is still there. + * Though we are holding a reference to it, drive ->remove might + * still have cleaned up virtual queues. In that case bail out. + */ + err = -EINVAL; + if (list_empty(&fs->list)) { + pr_info("virtio-fs: tag <%s> not found\n", fs->tag); + goto err; + } + + err = -ENOMEM; + /* Allocate fuse_dev for hiprio and notification queues */ + for (i = 0; i < VQ_REQUEST; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + fsvq->fud = fuse_dev_alloc(); + if (!fsvq->fud) + goto err_free_fuse_devs; + } + + ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud; + err = fuse_fill_super_common(sb, &ctx); + if (err < 0) + goto err_free_fuse_devs; + + fc = fs->vqs[VQ_REQUEST].fud->fc; + + for (i = 0; i < fs->nvqs; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + if (i == VQ_REQUEST) + continue; /* already initialized */ + fuse_dev_install(fsvq->fud, fc); + } + + /* Previous unmount will stop all queues. Start these again */ + virtio_fs_start_all_queues(fs); + fuse_send_init(fc); + mutex_unlock(&virtio_fs_mutex); + return 0; + +err_free_fuse_devs: + virtio_fs_free_devs(fs); +err: + mutex_unlock(&virtio_fs_mutex); + return err; +} + +static void virtio_kill_sb(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct virtio_fs *vfs; + struct virtio_fs_vq *fsvq; + + /* If mount failed, we can still be called without any fc */ + if (!fc) + return fuse_kill_sb_anon(sb); + + vfs = fc->iq.priv; + fsvq = &vfs->vqs[VQ_HIPRIO]; + + /* Stop forget queue. Soon destroy will be sent */ + spin_lock(&fsvq->lock); + fsvq->connected = false; + spin_unlock(&fsvq->lock); + virtio_fs_drain_all_queues(vfs); + + fuse_kill_sb_anon(sb); + + /* fuse_kill_sb_anon() must have sent destroy. Stop all queues + * and drain one more time and free fuse devices. Freeing fuse + * devices will drop their reference on fuse_conn and that in + * turn will drop its reference on virtio_fs object. + */ + virtio_fs_stop_all_queues(vfs); + virtio_fs_drain_all_queues(vfs); + virtio_fs_free_devs(vfs); +} + +static int virtio_fs_test_super(struct super_block *sb, + struct fs_context *fsc) +{ + struct fuse_conn *fc = fsc->s_fs_info; + + return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv; +} + +static int virtio_fs_set_super(struct super_block *sb, + struct fs_context *fsc) +{ + int err; + + err = get_anon_bdev(&sb->s_dev); + if (!err) + fuse_conn_get(fsc->s_fs_info); + + return err; +} + +static int virtio_fs_get_tree(struct fs_context *fsc) +{ + struct virtio_fs *fs; + struct super_block *sb; + struct fuse_conn *fc; + int err; + + /* This gets a reference on virtio_fs object. This ptr gets installed + * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() + * to drop the reference to this object. + */ + fs = virtio_fs_find_instance(fsc->source); + if (!fs) { + pr_info("virtio-fs: tag <%s> not found\n", fsc->source); + return -EINVAL; + } + + fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); + if (!fc) { + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + return -ENOMEM; + } + + fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops, + fs); + fc->release = fuse_free_conn; + fc->delete_stale = true; + + fsc->s_fs_info = fc; + sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super); + fuse_conn_put(fc); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + err = virtio_fs_fill_super(sb); + if (err) { + deactivate_locked_super(sb); + return err; + } + + sb->s_flags |= SB_ACTIVE; + } + + WARN_ON(fsc->root); + fsc->root = dget(sb->s_root); + return 0; +} + +static const struct fs_context_operations virtio_fs_context_ops = { + .get_tree = virtio_fs_get_tree, +}; + +static int virtio_fs_init_fs_context(struct fs_context *fsc) +{ + fsc->ops = &virtio_fs_context_ops; + return 0; +} + +static struct file_system_type virtio_fs_type = { + .owner = THIS_MODULE, + .name = "virtiofs", + .init_fs_context = virtio_fs_init_fs_context, + .kill_sb = virtio_kill_sb, +}; + +static int __init virtio_fs_init(void) +{ + int ret; + + ret = register_virtio_driver(&virtio_fs_driver); + if (ret < 0) + return ret; + + ret = register_filesystem(&virtio_fs_type); + if (ret < 0) { + unregister_virtio_driver(&virtio_fs_driver); + return ret; + } + + return 0; +} +module_init(virtio_fs_init); + +static void __exit virtio_fs_exit(void) +{ + unregister_filesystem(&virtio_fs_type); + unregister_virtio_driver(&virtio_fs_driver); +} +module_exit(virtio_fs_exit); + +MODULE_AUTHOR("Stefan Hajnoczi "); +MODULE_DESCRIPTION("Virtio Filesystem"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS(KBUILD_MODNAME); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h new file mode 100644 index 000000000000..b02eb2ac3d99 --- /dev/null +++ b/include/uapi/linux/virtio_fs.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ + +#ifndef _UAPI_LINUX_VIRTIO_FS_H +#define _UAPI_LINUX_VIRTIO_FS_H + +#include +#include +#include +#include + +struct virtio_fs_config { + /* Filesystem name (UTF-8, not NUL-terminated, padded with NULs) */ + __u8 tag[36]; + + /* Number of request queues */ + __u32 num_request_queues; +} __attribute__((packed)); + +#endif /* _UAPI_LINUX_VIRTIO_FS_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 348fd0176f75..585e07b27333 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -44,6 +44,7 @@ #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ #define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ +#define VIRTIO_ID_FS 26 /* virtio filesystem */ #define VIRTIO_ID_PMEM 27 /* virtio pmem */ #endif /* _LINUX_VIRTIO_IDS_H */ From d2935de7e4fd3f873976f1872b505584b727574c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Mar 2019 14:58:36 +0000 Subject: [PATCH 0603/2880] vfs: Convert bpf to use the new mount API Convert the bpf filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells cc: Alexei Starovoitov cc: Daniel Borkmann cc: Martin KaFai Lau cc: Song Liu cc: Yonghong Song cc: netdev@vger.kernel.org cc: bpf@vger.kernel.org Signed-off-by: Al Viro --- kernel/bpf/inode.c | 92 +++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index cc0d0cf114e3..a70f7209cda3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -14,8 +14,9 @@ #include #include #include +#include +#include #include -#include #include #include #include @@ -583,58 +584,52 @@ static const struct super_operations bpf_super_ops = { enum { OPT_MODE, - OPT_ERR, }; -static const match_table_t bpf_mount_tokens = { - { OPT_MODE, "mode=%o" }, - { OPT_ERR, NULL }, +static const struct fs_parameter_spec bpf_param_specs[] = { + fsparam_u32oct ("mode", OPT_MODE), + {} +}; + +static const struct fs_parameter_description bpf_fs_parameters = { + .name = "bpf", + .specs = bpf_param_specs, }; struct bpf_mount_opts { umode_t mode; }; -static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { - substring_t args[MAX_OPT_ARGS]; - int option, token; - char *ptr; + struct bpf_mount_opts *opts = fc->fs_private; + struct fs_parse_result result; + int opt; - opts->mode = S_IRWXUGO; - - while ((ptr = strsep(&data, ",")) != NULL) { - if (!*ptr) - continue; - - token = match_token(ptr, bpf_mount_tokens, args); - switch (token) { - case OPT_MODE: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; + opt = fs_parse(fc, &bpf_fs_parameters, param, &result); + if (opt < 0) /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - } + return opt == -ENOPARAM ? 0 : opt; + + switch (opt) { + case OPT_MODE: + opts->mode = result.uint_32 & S_IALLUGO; + break; } return 0; } -static int bpf_fill_super(struct super_block *sb, void *data, int silent) +static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; - struct bpf_mount_opts opts; + struct bpf_mount_opts *opts = fc->fs_private; struct inode *inode; int ret; - ret = bpf_parse_options(data, &opts); - if (ret) - return ret; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -644,21 +639,50 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | opts.mode; + inode->i_mode |= S_ISVTX | opts->mode; return 0; } -static struct dentry *bpf_mount(struct file_system_type *type, int flags, - const char *dev_name, void *data) +static int bpf_get_tree(struct fs_context *fc) { - return mount_nodev(type, flags, data, bpf_fill_super); + return get_tree_nodev(fc, bpf_fill_super); +} + +static void bpf_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations bpf_context_ops = { + .free = bpf_free_fc, + .parse_param = bpf_parse_param, + .get_tree = bpf_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int bpf_init_fs_context(struct fs_context *fc) +{ + struct bpf_mount_opts *opts; + + opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + opts->mode = S_IRWXUGO; + + fc->fs_private = opts; + fc->ops = &bpf_context_ops; + return 0; } static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", - .mount = bpf_mount, + .init_fs_context = bpf_init_fs_context, + .parameters = &bpf_fs_parameters, .kill_sb = kill_litter_super, }; From dec90f61f125568088a6ada61832b366c1670e9f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Mar 2019 23:31:05 +0000 Subject: [PATCH 0604/2880] vfs: Convert functionfs to use the new mount API Convert the functionfs filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells Acked-by: Felipe Balbi Acked-by: Michal Nazarewicz cc: linux-usb@vger.kernel.org Signed-off-by: Al Viro --- drivers/usb/gadget/function/f_fs.c | 237 +++++++++++++++-------------- 1 file changed, 122 insertions(+), 115 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 213ff03c8a9f..59d9d512dcda 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1451,9 +1452,9 @@ struct ffs_sb_fill_data { struct ffs_data *ffs_data; }; -static int ffs_sb_fill(struct super_block *sb, void *_data, int silent) +static int ffs_sb_fill(struct super_block *sb, struct fs_context *fc) { - struct ffs_sb_fill_data *data = _data; + struct ffs_sb_fill_data *data = fc->fs_private; struct inode *inode; struct ffs_data *ffs = data->ffs_data; @@ -1486,147 +1487,152 @@ static int ffs_sb_fill(struct super_block *sb, void *_data, int silent) return 0; } -static int ffs_fs_parse_opts(struct ffs_sb_fill_data *data, char *opts) +enum { + Opt_no_disconnect, + Opt_rmode, + Opt_fmode, + Opt_mode, + Opt_uid, + Opt_gid, +}; + +static const struct fs_parameter_spec ffs_fs_param_specs[] = { + fsparam_bool ("no_disconnect", Opt_no_disconnect), + fsparam_u32 ("rmode", Opt_rmode), + fsparam_u32 ("fmode", Opt_fmode), + fsparam_u32 ("mode", Opt_mode), + fsparam_u32 ("uid", Opt_uid), + fsparam_u32 ("gid", Opt_gid), + {} +}; + +static const struct fs_parameter_description ffs_fs_fs_parameters = { + .name = "kAFS", + .specs = ffs_fs_param_specs, +}; + +static int ffs_fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { + struct ffs_sb_fill_data *data = fc->fs_private; + struct fs_parse_result result; + int opt; + ENTER(); - if (!opts || !*opts) - return 0; + opt = fs_parse(fc, &ffs_fs_fs_parameters, param, &result); + if (opt < 0) + return opt; - for (;;) { - unsigned long value; - char *eq, *comma; + switch (opt) { + case Opt_no_disconnect: + data->no_disconnect = result.boolean; + break; + case Opt_rmode: + data->root_mode = (result.uint_32 & 0555) | S_IFDIR; + break; + case Opt_fmode: + data->perms.mode = (result.uint_32 & 0666) | S_IFREG; + break; + case Opt_mode: + data->root_mode = (result.uint_32 & 0555) | S_IFDIR; + data->perms.mode = (result.uint_32 & 0666) | S_IFREG; + break; - /* Option limit */ - comma = strchr(opts, ','); - if (comma) - *comma = 0; + case Opt_uid: + data->perms.uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(data->perms.uid)) + goto unmapped_value; + break; + case Opt_gid: + data->perms.gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(data->perms.gid)) + goto unmapped_value; + break; - /* Value limit */ - eq = strchr(opts, '='); - if (unlikely(!eq)) { - pr_err("'=' missing in %s\n", opts); - return -EINVAL; - } - *eq = 0; - - /* Parse value */ - if (kstrtoul(eq + 1, 0, &value)) { - pr_err("%s: invalid value: %s\n", opts, eq + 1); - return -EINVAL; - } - - /* Interpret option */ - switch (eq - opts) { - case 13: - if (!memcmp(opts, "no_disconnect", 13)) - data->no_disconnect = !!value; - else - goto invalid; - break; - case 5: - if (!memcmp(opts, "rmode", 5)) - data->root_mode = (value & 0555) | S_IFDIR; - else if (!memcmp(opts, "fmode", 5)) - data->perms.mode = (value & 0666) | S_IFREG; - else - goto invalid; - break; - - case 4: - if (!memcmp(opts, "mode", 4)) { - data->root_mode = (value & 0555) | S_IFDIR; - data->perms.mode = (value & 0666) | S_IFREG; - } else { - goto invalid; - } - break; - - case 3: - if (!memcmp(opts, "uid", 3)) { - data->perms.uid = make_kuid(current_user_ns(), value); - if (!uid_valid(data->perms.uid)) { - pr_err("%s: unmapped value: %lu\n", opts, value); - return -EINVAL; - } - } else if (!memcmp(opts, "gid", 3)) { - data->perms.gid = make_kgid(current_user_ns(), value); - if (!gid_valid(data->perms.gid)) { - pr_err("%s: unmapped value: %lu\n", opts, value); - return -EINVAL; - } - } else { - goto invalid; - } - break; - - default: -invalid: - pr_err("%s: invalid option\n", opts); - return -EINVAL; - } - - /* Next iteration */ - if (!comma) - break; - opts = comma + 1; + default: + return -ENOPARAM; } return 0; + +unmapped_value: + return invalf(fc, "%s: unmapped value: %u", param->key, result.uint_32); } -/* "mount -t functionfs dev_name /dev/function" ends up here */ - -static struct dentry * -ffs_fs_mount(struct file_system_type *t, int flags, - const char *dev_name, void *opts) +/* + * Set up the superblock for a mount. + */ +static int ffs_fs_get_tree(struct fs_context *fc) { - struct ffs_sb_fill_data data = { - .perms = { - .mode = S_IFREG | 0600, - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, - }, - .root_mode = S_IFDIR | 0500, - .no_disconnect = false, - }; - struct dentry *rv; - int ret; + struct ffs_sb_fill_data *ctx = fc->fs_private; void *ffs_dev; struct ffs_data *ffs; ENTER(); - ret = ffs_fs_parse_opts(&data, opts); - if (unlikely(ret < 0)) - return ERR_PTR(ret); + if (!fc->source) + return invalf(fc, "No source specified"); - ffs = ffs_data_new(dev_name); + ffs = ffs_data_new(fc->source); if (unlikely(!ffs)) - return ERR_PTR(-ENOMEM); - ffs->file_perms = data.perms; - ffs->no_disconnect = data.no_disconnect; + return -ENOMEM; + ffs->file_perms = ctx->perms; + ffs->no_disconnect = ctx->no_disconnect; - ffs->dev_name = kstrdup(dev_name, GFP_KERNEL); + ffs->dev_name = kstrdup(fc->source, GFP_KERNEL); if (unlikely(!ffs->dev_name)) { ffs_data_put(ffs); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } - ffs_dev = ffs_acquire_dev(dev_name); + ffs_dev = ffs_acquire_dev(ffs->dev_name); if (IS_ERR(ffs_dev)) { ffs_data_put(ffs); - return ERR_CAST(ffs_dev); + return PTR_ERR(ffs_dev); } - ffs->private_data = ffs_dev; - data.ffs_data = ffs; - rv = mount_nodev(t, flags, &data, ffs_sb_fill); - if (IS_ERR(rv) && data.ffs_data) { - ffs_release_dev(data.ffs_data); - ffs_data_put(data.ffs_data); + ffs->private_data = ffs_dev; + ctx->ffs_data = ffs; + return get_tree_nodev(fc, ffs_sb_fill); +} + +static void ffs_fs_free_fc(struct fs_context *fc) +{ + struct ffs_sb_fill_data *ctx = fc->fs_private; + + if (ctx) { + if (ctx->ffs_data) { + ffs_release_dev(ctx->ffs_data); + ffs_data_put(ctx->ffs_data); + } + + kfree(ctx); } - return rv; +} + +static const struct fs_context_operations ffs_fs_context_ops = { + .free = ffs_fs_free_fc, + .parse_param = ffs_fs_parse_param, + .get_tree = ffs_fs_get_tree, +}; + +static int ffs_fs_init_fs_context(struct fs_context *fc) +{ + struct ffs_sb_fill_data *ctx; + + ctx = kzalloc(sizeof(struct ffs_sb_fill_data), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->perms.mode = S_IFREG | 0600; + ctx->perms.uid = GLOBAL_ROOT_UID; + ctx->perms.gid = GLOBAL_ROOT_GID; + ctx->root_mode = S_IFDIR | 0500; + ctx->no_disconnect = false; + + fc->fs_private = ctx; + fc->ops = &ffs_fs_context_ops; + return 0; } static void @@ -1644,7 +1650,8 @@ ffs_fs_kill_sb(struct super_block *sb) static struct file_system_type ffs_fs_type = { .owner = THIS_MODULE, .name = "functionfs", - .mount = ffs_fs_mount, + .init_fs_context = ffs_fs_init_fs_context, + .parameters = &ffs_fs_fs_parameters, .kill_sb = ffs_fs_kill_sb, }; MODULE_ALIAS_FS("functionfs"); From b54c64f7adeb241423cd46598f458b5486b0375e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Mar 2019 10:08:08 +0000 Subject: [PATCH 0605/2880] hypfs: Fix error number left in struct pointer member In hypfs_fill_super(), if hypfs_create_update_file() fails, sbi->update_file is left holding an error number. This is passed to hypfs_kill_super() which doesn't check for this. Fix this by not setting sbi->update_value until after we've checked for error. Fixes: 24bbb1faf3f0 ("[PATCH] s390_hypfs filesystem") Signed-off-by: David Howells cc: Martin Schwidefsky cc: Heiko Carstens cc: linux-s390@vger.kernel.org Signed-off-by: Al Viro --- arch/s390/hypfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index ccad1398abd4..b5cfcad953c2 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -269,7 +269,7 @@ static int hypfs_show_options(struct seq_file *s, struct dentry *root) static int hypfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *root_inode; - struct dentry *root_dentry; + struct dentry *root_dentry, *update_file; int rc = 0; struct hypfs_sb_info *sbi; @@ -300,9 +300,10 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent) rc = hypfs_diag_create_files(root_dentry); if (rc) return rc; - sbi->update_file = hypfs_create_update_file(root_dentry); - if (IS_ERR(sbi->update_file)) - return PTR_ERR(sbi->update_file); + update_file = hypfs_create_update_file(root_dentry); + if (IS_ERR(update_file)) + return PTR_ERR(update_file); + sbi->update_file = update_file; hypfs_update_update(sb); pr_info("Hypervisor filesystem mounted\n"); return 0; From 8f8d8d11f76e991516e558eda7639e7b6a760e41 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Mar 2019 09:35:44 +0000 Subject: [PATCH 0606/2880] vfs: Convert hypfs to use the new mount API Convert the hypfs filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells cc: Martin Schwidefsky cc: Heiko Carstens cc: linux-s390@vger.kernel.org Signed-off-by: Al Viro --- arch/s390/hypfs/inode.c | 124 ++++++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 56 deletions(-) diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index b5cfcad953c2..498966f6af59 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -12,17 +12,17 @@ #include #include #include +#include +#include #include #include #include #include #include -#include #include #include #include #include -#include #include #include #include "hypfs.h" @@ -207,52 +207,44 @@ static int hypfs_release(struct inode *inode, struct file *filp) return 0; } -enum { opt_uid, opt_gid, opt_err }; +enum { Opt_uid, Opt_gid, }; -static const match_table_t hypfs_tokens = { - {opt_uid, "uid=%u"}, - {opt_gid, "gid=%u"}, - {opt_err, NULL} +static const struct fs_parameter_spec hypfs_param_specs[] = { + fsparam_u32("gid", Opt_gid), + fsparam_u32("uid", Opt_uid), + {} }; -static int hypfs_parse_options(char *options, struct super_block *sb) +static const struct fs_parameter_description hypfs_fs_parameters = { + .name = "hypfs", + .specs = hypfs_param_specs, +}; + +static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *str; - substring_t args[MAX_OPT_ARGS]; + struct hypfs_sb_info *hypfs_info = fc->s_fs_info; + struct fs_parse_result result; kuid_t uid; kgid_t gid; + int opt; - if (!options) - return 0; - while ((str = strsep(&options, ",")) != NULL) { - int token, option; - struct hypfs_sb_info *hypfs_info = sb->s_fs_info; + opt = fs_parse(fc, &hypfs_fs_parameters, param, &result); + if (opt < 0) + return opt; - if (!*str) - continue; - token = match_token(str, hypfs_tokens, args); - switch (token) { - case opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) - return -EINVAL; - hypfs_info->uid = uid; - break; - case opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) - return -EINVAL; - hypfs_info->gid = gid; - break; - case opt_err: - default: - pr_err("%s is not a valid mount option\n", str); - return -EINVAL; - } + switch (opt) { + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return invalf(fc, "Unknown uid"); + hypfs_info->uid = uid; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return invalf(fc, "Unknown gid"); + hypfs_info->gid = gid; + break; } return 0; } @@ -266,26 +258,18 @@ static int hypfs_show_options(struct seq_file *s, struct dentry *root) return 0; } -static int hypfs_fill_super(struct super_block *sb, void *data, int silent) +static int hypfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct hypfs_sb_info *sbi = sb->s_fs_info; struct inode *root_inode; struct dentry *root_dentry, *update_file; - int rc = 0; - struct hypfs_sb_info *sbi; + int rc; - sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - mutex_init(&sbi->lock); - sbi->uid = current_uid(); - sbi->gid = current_gid(); - sb->s_fs_info = sbi; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = HYPFS_MAGIC; sb->s_op = &hypfs_s_ops; - if (hypfs_parse_options(data, sb)) - return -EINVAL; + root_inode = hypfs_make_inode(sb, S_IFDIR | 0755); if (!root_inode) return -ENOMEM; @@ -309,10 +293,37 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct dentry *hypfs_mount(struct file_system_type *fst, int flags, - const char *devname, void *data) +static int hypfs_get_tree(struct fs_context *fc) { - return mount_single(fst, flags, data, hypfs_fill_super); + return get_tree_single(fc, hypfs_fill_super); +} + +static void hypfs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hypfs_context_ops = { + .free = hypfs_free_fc, + .parse_param = hypfs_parse_param, + .get_tree = hypfs_get_tree, +}; + +static int hypfs_init_fs_context(struct fs_context *fc) +{ + struct hypfs_sb_info *sbi; + + sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + mutex_init(&sbi->lock); + sbi->uid = current_uid(); + sbi->gid = current_gid(); + + fc->s_fs_info = sbi; + fc->ops = &hypfs_context_ops; + return 0; } static void hypfs_kill_super(struct super_block *sb) @@ -443,7 +454,8 @@ static const struct file_operations hypfs_file_ops = { static struct file_system_type hypfs_type = { .owner = THIS_MODULE, .name = "s390_hypfs", - .mount = hypfs_mount, + .init_fs_context = hypfs_init_fs_context, + .parameters = &hypfs_fs_parameters, .kill_sb = hypfs_kill_super }; From d2e0981c3b9a7d31b59f81477c6acacb1bcc4513 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Mar 2019 10:34:14 +0000 Subject: [PATCH 0607/2880] vfs: Convert spufs to use the new mount API Convert the spufs filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells cc: Jeremy Kerr cc: Arnd Bergmann cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 203 ++++++++++++---------- 1 file changed, 114 insertions(+), 89 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 065ff14b76e1..1d93e55a2de1 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -10,6 +10,8 @@ #include #include +#include +#include #include #include #include @@ -20,7 +22,6 @@ #include #include #include -#include #include #include @@ -30,7 +31,7 @@ #include "spufs.h" struct spufs_sb_info { - int debug; + bool debug; }; static struct kmem_cache *spufs_inode_cache; @@ -574,16 +575,27 @@ long spufs_create(struct path *path, struct dentry *dentry, } /* File system initialization */ -enum { - Opt_uid, Opt_gid, Opt_mode, Opt_debug, Opt_err, +struct spufs_fs_context { + kuid_t uid; + kgid_t gid; + umode_t mode; }; -static const match_table_t spufs_tokens = { - { Opt_uid, "uid=%d" }, - { Opt_gid, "gid=%d" }, - { Opt_mode, "mode=%o" }, - { Opt_debug, "debug" }, - { Opt_err, NULL }, +enum { + Opt_uid, Opt_gid, Opt_mode, Opt_debug, +}; + +static const struct fs_parameter_spec spufs_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_u32oct ("mode", Opt_mode), + fsparam_u32 ("uid", Opt_uid), + fsparam_flag ("debug", Opt_debug), + {} +}; + +static const struct fs_parameter_description spufs_fs_parameters = { + .name = "spufs", + .specs = spufs_param_specs, }; static int spufs_show_options(struct seq_file *m, struct dentry *root) @@ -604,47 +616,41 @@ static int spufs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static int -spufs_parse_options(struct super_block *sb, char *options, struct inode *root) +static int spufs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; + struct spufs_fs_context *ctx = fc->fs_private; + struct spufs_sb_info *sbi = fc->s_fs_info; + struct fs_parse_result result; + kuid_t uid; + kgid_t gid; + int opt; - while ((p = strsep(&options, ",")) != NULL) { - int token, option; + opt = fs_parse(fc, &spufs_fs_parameters, param, &result); + if (opt < 0) + return opt; - if (!*p) - continue; - - token = match_token(p, spufs_tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return 0; - root->i_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(root->i_uid)) - return 0; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - root->i_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(root->i_gid)) - return 0; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return 0; - root->i_mode = option | S_IFDIR; - break; - case Opt_debug: - spufs_get_sb_info(sb)->debug = 1; - break; - default: - return 0; - } + switch (opt) { + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return invalf(fc, "Unknown uid"); + ctx->uid = uid; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return invalf(fc, "Unknown gid"); + ctx->gid = gid; + break; + case Opt_mode: + ctx->mode = result.uint_32 & S_IALLUGO; + break; + case Opt_debug: + sbi->debug = true; + break; } - return 1; + + return 0; } static void spufs_exit_isolated_loader(void) @@ -678,79 +684,98 @@ spufs_init_isolated_loader(void) printk(KERN_INFO "spufs: SPU isolation mode enabled\n"); } -static int -spufs_create_root(struct super_block *sb, void *data) +static int spufs_create_root(struct super_block *sb, struct fs_context *fc) { + struct spufs_fs_context *ctx = fc->fs_private; struct inode *inode; - int ret; - ret = -ENODEV; if (!spu_management_ops) - goto out; + return -ENODEV; - ret = -ENOMEM; - inode = spufs_new_inode(sb, S_IFDIR | 0775); + inode = spufs_new_inode(sb, S_IFDIR | ctx->mode); if (!inode) - goto out; + return -ENOMEM; + inode->i_uid = ctx->uid; + inode->i_gid = ctx->gid; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; SPUFS_I(inode)->i_ctx = NULL; inc_nlink(inode); - ret = -EINVAL; - if (!spufs_parse_options(sb, data, inode)) - goto out_iput; - - ret = -ENOMEM; sb->s_root = d_make_root(inode); if (!sb->s_root) - goto out; - + return -ENOMEM; return 0; -out_iput: - iput(inode); -out: - return ret; } -static int -spufs_fill_super(struct super_block *sb, void *data, int silent) +static const struct super_operations spufs_ops = { + .alloc_inode = spufs_alloc_inode, + .free_inode = spufs_free_inode, + .statfs = simple_statfs, + .evict_inode = spufs_evict_inode, + .show_options = spufs_show_options, +}; + +static int spufs_fill_super(struct super_block *sb, struct fs_context *fc) { - struct spufs_sb_info *info; - static const struct super_operations s_ops = { - .alloc_inode = spufs_alloc_inode, - .free_inode = spufs_free_inode, - .statfs = simple_statfs, - .evict_inode = spufs_evict_inode, - .show_options = spufs_show_options, - }; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = SPUFS_MAGIC; - sb->s_op = &s_ops; - sb->s_fs_info = info; + sb->s_op = &spufs_ops; - return spufs_create_root(sb, data); + return spufs_create_root(sb, fc); } -static struct dentry * -spufs_mount(struct file_system_type *fstype, int flags, - const char *name, void *data) +static int spufs_get_tree(struct fs_context *fc) { - return mount_single(fstype, flags, data, spufs_fill_super); + return get_tree_single(fc, spufs_fill_super); +} + +static void spufs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations spufs_context_ops = { + .free = spufs_free_fc, + .parse_param = spufs_parse_param, + .get_tree = spufs_get_tree, +}; + +static int spufs_init_fs_context(struct fs_context *fc) +{ + struct spufs_fs_context *ctx; + struct spufs_sb_info *sbi; + + ctx = kzalloc(sizeof(struct spufs_fs_context), GFP_KERNEL); + if (!ctx) + goto nomem; + + sbi = kzalloc(sizeof(struct spufs_sb_info), GFP_KERNEL); + if (!sbi) + goto nomem_ctx; + + ctx->uid = current_uid(); + ctx->gid = current_gid(); + ctx->mode = 0755; + + fc->s_fs_info = sbi; + fc->ops = &spufs_context_ops; + return 0; + +nomem_ctx: + kfree(ctx); +nomem: + return -ENOMEM; } static struct file_system_type spufs_type = { .owner = THIS_MODULE, .name = "spufs", - .mount = spufs_mount, + .init_fs_context = spufs_init_fs_context, + .parameters = &spufs_fs_parameters, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("spufs"); From 1f52aa08d12f8d359e71b4bfd73ca9d5d668e4da Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Wed, 27 Mar 2019 14:46:00 +0000 Subject: [PATCH 0608/2880] gfs2: Convert gfs2 to fs_context Convert gfs2 and gfs2meta to fs_context. Removes the duplicated vfs code from gfs2_mount and instead uses the new vfs_get_block_super() before switching the ->root to the appropriate dentry. The mount option parsing has been converted to the new API and error reporting for invalid options has been made more precise at the same time. All of the mount/remount code has been moved into ops_fstype.c Signed-off-by: Andrew Price Signed-off-by: David Howells cc: cluster-devel@redhat.com Signed-off-by: Al Viro --- fs/gfs2/incore.h | 8 +- fs/gfs2/ops_fstype.c | 497 ++++++++++++++++++++++++++++++++----------- fs/gfs2/super.c | 333 +---------------------------- fs/gfs2/super.h | 3 +- 4 files changed, 381 insertions(+), 460 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 7a993d7c022e..0662747353e7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -584,10 +584,10 @@ struct gfs2_args { unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ unsigned int ar_loccookie:1; /* use location based readdir cookies */ - int ar_commit; /* Commit interval */ - int ar_statfs_quantum; /* The fast statfs interval */ - int ar_quota_quantum; /* The quota interval */ - int ar_statfs_percent; /* The % change to force sync */ + s32 ar_commit; /* Commit interval */ + s32 ar_statfs_quantum; /* The fast statfs interval */ + s32 ar_quota_quantum; /* The quota interval */ + s32 ar_statfs_percent; /* The % change to force sync */ }; struct gfs2_tune { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4a8e5a7310f0..0856f3c6ed8b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -1030,16 +1031,17 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp) } /** - * fill_super - Read in superblock + * gfs2_fill_super - Read in superblock * @sb: The VFS superblock - * @data: Mount options + * @args: Mount options * @silent: Don't complain if it's not a GFS2 filesystem * - * Returns: errno + * Returns: -errno */ - -static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent) +static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) { + struct gfs2_args *args = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; int error; @@ -1204,161 +1206,411 @@ fail_debug: return error; } -static int set_gfs2_super(struct super_block *s, void *data) +/** + * gfs2_get_tree - Get the GFS2 superblock and root directory + * @fc: The filesystem context + * + * Returns: 0 or -errno on error + */ +static int gfs2_get_tree(struct fs_context *fc) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + struct gfs2_args *args = fc->fs_private; + struct gfs2_sbd *sdp; + int error; + + error = get_tree_bdev(fc, gfs2_fill_super); + if (error) + return error; + + sdp = fc->root->d_sb->s_fs_info; + dput(fc->root); + if (args->ar_meta) + fc->root = dget(sdp->sd_master_dir); + else + fc->root = dget(sdp->sd_root_dir); return 0; } -static int test_gfs2_super(struct super_block *s, void *ptr) +static void gfs2_fc_free(struct fs_context *fc) { - struct block_device *bdev = ptr; - return (bdev == s->s_bdev); + struct gfs2_args *args = fc->fs_private; + + kfree(args); } -/** - * gfs2_mount - Get the GFS2 superblock - * @fs_type: The GFS2 filesystem type - * @flags: Mount flags - * @dev_name: The name of the device - * @data: The mount arguments - * - * Q. Why not use get_sb_bdev() ? - * A. We need to select one of two root directories to mount, independent - * of whether this is the initial, or subsequent, mount of this sb - * - * Returns: 0 or -ve on error - */ +enum gfs2_param { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_upgrade, + Opt_acl, + Opt_quota, + Opt_suiddir, + Opt_data, + Opt_meta, + Opt_discard, + Opt_commit, + Opt_errors, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, + Opt_barrier, + Opt_rgrplvb, + Opt_loccookie, +}; -static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +enum opt_quota { + Opt_quota_unset = 0, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, +}; + +static const unsigned int opt_quota_values[] = { + [Opt_quota_off] = GFS2_QUOTA_OFF, + [Opt_quota_account] = GFS2_QUOTA_ACCOUNT, + [Opt_quota_on] = GFS2_QUOTA_ON, +}; + +enum opt_data { + Opt_data_writeback = GFS2_DATA_WRITEBACK, + Opt_data_ordered = GFS2_DATA_ORDERED, +}; + +enum opt_errors { + Opt_errors_withdraw = GFS2_ERRORS_WITHDRAW, + Opt_errors_panic = GFS2_ERRORS_PANIC, +}; + +static const struct fs_parameter_spec gfs2_param_specs[] = { + fsparam_string ("lockproto", Opt_lockproto), + fsparam_string ("locktable", Opt_locktable), + fsparam_string ("hostdata", Opt_hostdata), + fsparam_flag ("spectator", Opt_spectator), + fsparam_flag ("norecovery", Opt_spectator), + fsparam_flag ("ignore_local_fs", Opt_ignore_local_fs), + fsparam_flag ("localflocks", Opt_localflocks), + fsparam_flag ("localcaching", Opt_localcaching), + fsparam_flag_no("debug", Opt_debug), + fsparam_flag ("upgrade", Opt_upgrade), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("suiddir", Opt_suiddir), + fsparam_enum ("data", Opt_data), + fsparam_flag ("meta", Opt_meta), + fsparam_flag_no("discard", Opt_discard), + fsparam_s32 ("commit", Opt_commit), + fsparam_enum ("errors", Opt_errors), + fsparam_s32 ("statfs_quantum", Opt_statfs_quantum), + fsparam_s32 ("statfs_percent", Opt_statfs_percent), + fsparam_s32 ("quota_quantum", Opt_quota_quantum), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag_no("rgrplvb", Opt_rgrplvb), + fsparam_flag_no("loccookie", Opt_loccookie), + /* quota can be a flag or an enum so it gets special treatment */ + __fsparam(fs_param_is_enum, "quota", Opt_quota, fs_param_neg_with_no|fs_param_v_optional), + {} +}; + +static const struct fs_parameter_enum gfs2_param_enums[] = { + { Opt_quota, "off", Opt_quota_off }, + { Opt_quota, "account", Opt_quota_account }, + { Opt_quota, "on", Opt_quota_on }, + { Opt_data, "writeback", Opt_data_writeback }, + { Opt_data, "ordered", Opt_data_ordered }, + { Opt_errors, "withdraw", Opt_errors_withdraw }, + { Opt_errors, "panic", Opt_errors_panic }, + {} +}; + +const struct fs_parameter_description gfs2_fs_parameters = { + .name = "gfs2", + .specs = gfs2_param_specs, + .enums = gfs2_param_enums, +}; + +/* Parse a single mount parameter */ +static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct block_device *bdev; - struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; - int error; - struct gfs2_args args; - struct gfs2_sbd *sdp; + struct gfs2_args *args = fc->fs_private; + struct fs_parse_result result; + int o; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; + o = fs_parse(fc, &gfs2_fs_parameters, param, &result); + if (o < 0) + return o; - bdev = blkdev_get_by_path(dev_name, mode, fs_type); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); - - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = -EBUSY; - goto error_bdev; + switch (o) { + case Opt_lockproto: + strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + /* Retained for backwards compat only */ + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + /* Retained for backwards compat only */ + break; + case Opt_debug: + if (result.boolean && args->ar_errors == GFS2_ERRORS_PANIC) + return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive"); + args->ar_debug = result.boolean; + break; + case Opt_upgrade: + /* Retained for backwards compat only */ + break; + case Opt_acl: + args->ar_posix_acl = result.boolean; + break; + case Opt_quota: + /* The quota option can be a flag or an enum. A non-zero int_32 + result means that we have an enum index. Otherwise we have + to rely on the 'negated' flag to tell us whether 'quota' or + 'noquota' was specified. */ + if (result.negated) + args->ar_quota = GFS2_QUOTA_OFF; + else if (result.int_32 > 0) + args->ar_quota = opt_quota_values[result.int_32]; + else + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = result.boolean; + break; + case Opt_data: + /* The uint_32 result maps directly to GFS2_DATA_* */ + args->ar_data = result.uint_32; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = result.boolean; + break; + case Opt_commit: + if (result.int_32 <= 0) + return invalf(fc, "gfs2: commit mount option requires a positive numeric argument"); + args->ar_commit = result.int_32; + break; + case Opt_statfs_quantum: + if (result.int_32 < 0) + return invalf(fc, "gfs2: statfs_quantum mount option requires a non-negative numeric argument"); + args->ar_statfs_quantum = result.int_32; + break; + case Opt_quota_quantum: + if (result.int_32 <= 0) + return invalf(fc, "gfs2: quota_quantum mount option requires a positive numeric argument"); + args->ar_quota_quantum = result.int_32; + break; + case Opt_statfs_percent: + if (result.int_32 < 0 || result.int_32 > 100) + return invalf(fc, "gfs2: statfs_percent mount option requires a numeric argument between 0 and 100"); + args->ar_statfs_percent = result.int_32; + break; + case Opt_errors: + if (args->ar_debug && result.uint_32 == GFS2_ERRORS_PANIC) + return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive"); + args->ar_errors = result.uint_32; + break; + case Opt_barrier: + args->ar_nobarrier = result.boolean; + break; + case Opt_rgrplvb: + args->ar_rgrplvb = result.boolean; + break; + case Opt_loccookie: + args->ar_loccookie = result.boolean; + break; + default: + return invalf(fc, "gfs2: invalid mount option: %s", param->key); } - s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = PTR_ERR(s); - if (IS_ERR(s)) - goto error_bdev; + return 0; +} - if (s->s_root) { - /* - * s_umount nests inside bd_mutex during - * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. - */ - up_write(&s->s_umount); - blkdev_put(bdev, mode); - down_write(&s->s_umount); - } else { - /* s_mode must be set before deactivate_locked_super calls */ - s->s_mode = mode; - } +static int gfs2_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args *oldargs = &sdp->sd_args; + struct gfs2_args *newargs = fc->fs_private; + struct gfs2_tune *gt = &sdp->sd_tune; + int error = 0; - memset(&args, 0, sizeof(args)); - args.ar_quota = GFS2_QUOTA_DEFAULT; - args.ar_data = GFS2_DATA_DEFAULT; - args.ar_commit = 30; - args.ar_statfs_quantum = 30; - args.ar_quota_quantum = 60; - args.ar_errors = GFS2_ERRORS_DEFAULT; + sync_filesystem(sb); - error = gfs2_mount_args(&args, data); - if (error) { - pr_warn("can't parse mount arguments\n"); - goto error_super; - } - - if (s->s_root) { - error = -EBUSY; - if ((flags ^ s->s_flags) & SB_RDONLY) - goto error_super; - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0); - if (error) - goto error_super; - s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; - } - - sdp = s->s_fs_info; - if (args.ar_meta) - return dget(sdp->sd_master_dir); + spin_lock(>->gt_spin); + oldargs->ar_commit = gt->gt_logd_secs; + oldargs->ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + oldargs->ar_statfs_quantum = 0; else - return dget(sdp->sd_root_dir); + oldargs->ar_statfs_quantum = gt->gt_statfs_quantum; + spin_unlock(>->gt_spin); -error_super: - deactivate_locked_super(s); - return ERR_PTR(error); -error_bdev: - blkdev_put(bdev, mode); - return ERR_PTR(error); + if (strcmp(newargs->ar_lockproto, oldargs->ar_lockproto)) { + errorf(fc, "gfs2: reconfiguration of locking protocol not allowed"); + return -EINVAL; + } + if (strcmp(newargs->ar_locktable, oldargs->ar_locktable)) { + errorf(fc, "gfs2: reconfiguration of lock table not allowed"); + return -EINVAL; + } + if (strcmp(newargs->ar_hostdata, oldargs->ar_hostdata)) { + errorf(fc, "gfs2: reconfiguration of host data not allowed"); + return -EINVAL; + } + if (newargs->ar_spectator != oldargs->ar_spectator) { + errorf(fc, "gfs2: reconfiguration of spectator mode not allowed"); + return -EINVAL; + } + if (newargs->ar_localflocks != oldargs->ar_localflocks) { + errorf(fc, "gfs2: reconfiguration of localflocks not allowed"); + return -EINVAL; + } + if (newargs->ar_meta != oldargs->ar_meta) { + errorf(fc, "gfs2: switching between gfs2 and gfs2meta not allowed"); + return -EINVAL; + } + if (oldargs->ar_spectator) + fc->sb_flags |= SB_RDONLY; + + if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { + error = gfs2_make_fs_ro(sdp); + if (error) + errorf(fc, "gfs2: unable to remount read-only"); + } else { + error = gfs2_make_fs_rw(sdp); + if (error) + errorf(fc, "gfs2: unable to remount read-write"); + } + } + sdp->sd_args = *newargs; + + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); + spin_lock(>->gt_spin); + gt->gt_logd_secs = newargs->ar_commit; + gt->gt_quota_quantum = newargs->ar_quota_quantum; + if (newargs->ar_statfs_quantum) { + gt->gt_statfs_slow = 0; + gt->gt_statfs_quantum = newargs->ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; + } + spin_unlock(>->gt_spin); + + gfs2_online_uevent(sdp); + return error; } -static int set_meta_super(struct super_block *s, void *ptr) +static const struct fs_context_operations gfs2_context_ops = { + .free = gfs2_fc_free, + .parse_param = gfs2_parse_param, + .get_tree = gfs2_get_tree, + .reconfigure = gfs2_reconfigure, +}; + +/* Set up the filesystem mount context */ +static int gfs2_init_fs_context(struct fs_context *fc) +{ + struct gfs2_args *args; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (args == NULL) + return -ENOMEM; + + args->ar_quota = GFS2_QUOTA_DEFAULT; + args->ar_data = GFS2_DATA_DEFAULT; + args->ar_commit = 30; + args->ar_statfs_quantum = 30; + args->ar_quota_quantum = 60; + args->ar_errors = GFS2_ERRORS_DEFAULT; + + fc->fs_private = args; + fc->ops = &gfs2_context_ops; + return 0; +} + +static int set_meta_super(struct super_block *s, struct fs_context *fc) { return -EINVAL; } -static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int test_meta_super(struct super_block *s, struct fs_context *fc) +{ + return (fc->sget_key == s->s_bdev); +} + +static int gfs2_meta_get_tree(struct fs_context *fc) { struct super_block *s; struct gfs2_sbd *sdp; struct path path; int error; - if (!dev_name || !*dev_name) - return ERR_PTR(-EINVAL); + if (!fc->source || !*fc->source) + return -EINVAL; - error = kern_path(dev_name, LOOKUP_FOLLOW, &path); + error = kern_path(fc->source, LOOKUP_FOLLOW, &path); if (error) { pr_warn("path_lookup on %s returned error %d\n", - dev_name, error); - return ERR_PTR(error); + fc->source, error); + return error; } - s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, - path.dentry->d_sb->s_bdev); + fc->fs_type = &gfs2_fs_type; + fc->sget_key = path.dentry->d_sb->s_bdev; + s = sget_fc(fc, test_meta_super, set_meta_super); path_put(&path); if (IS_ERR(s)) { pr_warn("gfs2 mount does not exist\n"); - return ERR_CAST(s); + return PTR_ERR(s); } - if ((flags ^ s->s_flags) & SB_RDONLY) { + if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); - return ERR_PTR(-EBUSY); + return -EBUSY; } sdp = s->s_fs_info; - return dget(sdp->sd_master_dir); + fc->root = dget(sdp->sd_master_dir); + return 0; +} + +static const struct fs_context_operations gfs2_meta_context_ops = { + .get_tree = gfs2_meta_get_tree, +}; + +static int gfs2_meta_init_fs_context(struct fs_context *fc) +{ + int ret = gfs2_init_fs_context(fc); + + if (ret) + return ret; + + fc->ops = &gfs2_meta_context_ops; + return 0; } static void gfs2_kill_sb(struct super_block *sb) @@ -1382,7 +1634,8 @@ static void gfs2_kill_sb(struct super_block *sb) struct file_system_type gfs2_fs_type = { .name = "gfs2", .fs_flags = FS_REQUIRES_DEV, - .mount = gfs2_mount, + .init_fs_context = gfs2_init_fs_context, + .parameters = &gfs2_fs_parameters, .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; @@ -1391,7 +1644,7 @@ MODULE_ALIAS_FS("gfs2"); struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", .fs_flags = FS_REQUIRES_DEV, - .mount = gfs2_mount_meta, + .init_fs_context = gfs2_meta_init_fs_context, .owner = THIS_MODULE, }; MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 0acc5834f653..ffc77e5097cb 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -44,258 +44,6 @@ #include "xattr.h" #include "lops.h" -#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) - -enum { - Opt_lockproto, - Opt_locktable, - Opt_hostdata, - Opt_spectator, - Opt_ignore_local_fs, - Opt_localflocks, - Opt_localcaching, - Opt_debug, - Opt_nodebug, - Opt_upgrade, - Opt_acl, - Opt_noacl, - Opt_quota_off, - Opt_quota_account, - Opt_quota_on, - Opt_quota, - Opt_noquota, - Opt_suiddir, - Opt_nosuiddir, - Opt_data_writeback, - Opt_data_ordered, - Opt_meta, - Opt_discard, - Opt_nodiscard, - Opt_commit, - Opt_err_withdraw, - Opt_err_panic, - Opt_statfs_quantum, - Opt_statfs_percent, - Opt_quota_quantum, - Opt_barrier, - Opt_nobarrier, - Opt_rgrplvb, - Opt_norgrplvb, - Opt_loccookie, - Opt_noloccookie, - Opt_error, -}; - -static const match_table_t tokens = { - {Opt_lockproto, "lockproto=%s"}, - {Opt_locktable, "locktable=%s"}, - {Opt_hostdata, "hostdata=%s"}, - {Opt_spectator, "spectator"}, - {Opt_spectator, "norecovery"}, - {Opt_ignore_local_fs, "ignore_local_fs"}, - {Opt_localflocks, "localflocks"}, - {Opt_localcaching, "localcaching"}, - {Opt_debug, "debug"}, - {Opt_nodebug, "nodebug"}, - {Opt_upgrade, "upgrade"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_quota_off, "quota=off"}, - {Opt_quota_account, "quota=account"}, - {Opt_quota_on, "quota=on"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_suiddir, "suiddir"}, - {Opt_nosuiddir, "nosuiddir"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_meta, "meta"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_commit, "commit=%d"}, - {Opt_err_withdraw, "errors=withdraw"}, - {Opt_err_panic, "errors=panic"}, - {Opt_statfs_quantum, "statfs_quantum=%d"}, - {Opt_statfs_percent, "statfs_percent=%d"}, - {Opt_quota_quantum, "quota_quantum=%d"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_rgrplvb, "rgrplvb"}, - {Opt_norgrplvb, "norgrplvb"}, - {Opt_loccookie, "loccookie"}, - {Opt_noloccookie, "noloccookie"}, - {Opt_error, NULL} -}; - -/** - * gfs2_mount_args - Parse mount options - * @args: The structure into which the parsed options will be written - * @options: The options to parse - * - * Return: errno - */ - -int gfs2_mount_args(struct gfs2_args *args, char *options) -{ - char *o; - int token; - substring_t tmp[MAX_OPT_ARGS]; - int rv; - - /* Split the options into tokens with the "," character and - process them */ - - while (1) { - o = strsep(&options, ","); - if (o == NULL) - break; - if (*o == '\0') - continue; - - token = match_token(o, tokens, tmp); - switch (token) { - case Opt_lockproto: - match_strlcpy(args->ar_lockproto, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_locktable: - match_strlcpy(args->ar_locktable, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_hostdata: - match_strlcpy(args->ar_hostdata, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_spectator: - args->ar_spectator = 1; - break; - case Opt_ignore_local_fs: - /* Retained for backwards compat only */ - break; - case Opt_localflocks: - args->ar_localflocks = 1; - break; - case Opt_localcaching: - /* Retained for backwards compat only */ - break; - case Opt_debug: - if (args->ar_errors == GFS2_ERRORS_PANIC) { - pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); - return -EINVAL; - } - args->ar_debug = 1; - break; - case Opt_nodebug: - args->ar_debug = 0; - break; - case Opt_upgrade: - /* Retained for backwards compat only */ - break; - case Opt_acl: - args->ar_posix_acl = 1; - break; - case Opt_noacl: - args->ar_posix_acl = 0; - break; - case Opt_quota_off: - case Opt_noquota: - args->ar_quota = GFS2_QUOTA_OFF; - break; - case Opt_quota_account: - args->ar_quota = GFS2_QUOTA_ACCOUNT; - break; - case Opt_quota_on: - case Opt_quota: - args->ar_quota = GFS2_QUOTA_ON; - break; - case Opt_suiddir: - args->ar_suiddir = 1; - break; - case Opt_nosuiddir: - args->ar_suiddir = 0; - break; - case Opt_data_writeback: - args->ar_data = GFS2_DATA_WRITEBACK; - break; - case Opt_data_ordered: - args->ar_data = GFS2_DATA_ORDERED; - break; - case Opt_meta: - args->ar_meta = 1; - break; - case Opt_discard: - args->ar_discard = 1; - break; - case Opt_nodiscard: - args->ar_discard = 0; - break; - case Opt_commit: - rv = match_int(&tmp[0], &args->ar_commit); - if (rv || args->ar_commit <= 0) { - pr_warn("commit mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_statfs_quantum: - rv = match_int(&tmp[0], &args->ar_statfs_quantum); - if (rv || args->ar_statfs_quantum < 0) { - pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_quota_quantum: - rv = match_int(&tmp[0], &args->ar_quota_quantum); - if (rv || args->ar_quota_quantum <= 0) { - pr_warn("quota_quantum mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_statfs_percent: - rv = match_int(&tmp[0], &args->ar_statfs_percent); - if (rv || args->ar_statfs_percent < 0 || - args->ar_statfs_percent > 100) { - pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_err_withdraw: - args->ar_errors = GFS2_ERRORS_WITHDRAW; - break; - case Opt_err_panic: - if (args->ar_debug) { - pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); - return -EINVAL; - } - args->ar_errors = GFS2_ERRORS_PANIC; - break; - case Opt_barrier: - args->ar_nobarrier = 0; - break; - case Opt_nobarrier: - args->ar_nobarrier = 1; - break; - case Opt_rgrplvb: - args->ar_rgrplvb = 1; - break; - case Opt_norgrplvb: - args->ar_rgrplvb = 0; - break; - case Opt_loccookie: - args->ar_loccookie = 1; - break; - case Opt_noloccookie: - args->ar_loccookie = 0; - break; - case Opt_error: - default: - pr_warn("invalid mount option: %s\n", o); - return -EINVAL; - } - } - - return 0; -} - /** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock @@ -847,7 +595,7 @@ out: * Returns: errno */ -static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { struct gfs2_holder freeze_gh; int error; @@ -1226,84 +974,6 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/** - * gfs2_remount_fs - called when the FS is remounted - * @sb: the filesystem - * @flags: the remount flags - * @data: extra data passed in (not used right now) - * - * Returns: errno - */ - -static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_args args = sdp->sd_args; /* Default to current settings */ - struct gfs2_tune *gt = &sdp->sd_tune; - int error; - - sync_filesystem(sb); - - spin_lock(>->gt_spin); - args.ar_commit = gt->gt_logd_secs; - args.ar_quota_quantum = gt->gt_quota_quantum; - if (gt->gt_statfs_slow) - args.ar_statfs_quantum = 0; - else - args.ar_statfs_quantum = gt->gt_statfs_quantum; - spin_unlock(>->gt_spin); - error = gfs2_mount_args(&args, data); - if (error) - return error; - - /* Not allowed to change locking details */ - if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || - strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || - strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) - return -EINVAL; - - /* Some flags must not be changed */ - if (args_neq(&args, &sdp->sd_args, spectator) || - args_neq(&args, &sdp->sd_args, localflocks) || - args_neq(&args, &sdp->sd_args, meta)) - return -EINVAL; - - if (sdp->sd_args.ar_spectator) - *flags |= SB_RDONLY; - - if ((sb->s_flags ^ *flags) & SB_RDONLY) { - if (*flags & SB_RDONLY) - error = gfs2_make_fs_ro(sdp); - else - error = gfs2_make_fs_rw(sdp); - } - - sdp->sd_args = args; - if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= SB_POSIXACL; - else - sb->s_flags &= ~SB_POSIXACL; - if (sdp->sd_args.ar_nobarrier) - set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - else - clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); - spin_lock(>->gt_spin); - gt->gt_logd_secs = args.ar_commit; - gt->gt_quota_quantum = args.ar_quota_quantum; - if (args.ar_statfs_quantum) { - gt->gt_statfs_slow = 0; - gt->gt_statfs_quantum = args.ar_statfs_quantum; - } - else { - gt->gt_statfs_slow = 1; - gt->gt_statfs_quantum = 30; - } - spin_unlock(>->gt_spin); - - gfs2_online_uevent(sdp); - return error; -} - /** * gfs2_drop_inode - Drop an inode (test for remote unlink) * @inode: The inode to drop @@ -1748,7 +1418,6 @@ const struct super_operations gfs2_super_ops = { .freeze_super = gfs2_freeze, .thaw_super = gfs2_unfreeze, .statfs = gfs2_statfs, - .remount_fs = gfs2_remount_fs, .drop_inode = gfs2_drop_inode, .show_options = gfs2_show_options, }; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 9d49eaadb9d9..b8bf811a1305 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -24,8 +24,6 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) extern void gfs2_jindex_free(struct gfs2_sbd *sdp); -extern int gfs2_mount_args(struct gfs2_args *args, char *data); - extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); @@ -33,6 +31,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern int gfs2_make_fs_ro(struct gfs2_sbd *sdp); extern void gfs2_online_uevent(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, From 7cca9b8b7c5bcc56d627851550840586a25aaa1b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 23 Aug 2019 11:47:28 +0200 Subject: [PATCH 0609/2880] microblaze: Switch to standard restart handler The microblaze uses the legacy APIs to dig out a GPIO pin defined in the root of the device tree to issue a hard reset of the platform. Asserting a hard reset should be done using the standard DT-enabled and fully GPIO descriptor aware driver in drivers/power/reset/gpio-restart.c using the bindings from Documentation/devicetree/bindings/power/reset/gpio-restart.txt To achieve this, first make sure microblaze makes use of the standard kernel restart path utilizing do_kernel_restart() from . Put in some grace time and an emergency print if the restart does not properly assert. As this is basic platform functionality we patch the DTS file and defconfig in one go for a lockstep change. Cc: Arnd Bergmann Cc: Michal Simek Signed-off-by: Linus Walleij [ Michal: Move machine_restart back to reset.c ] Signed-off-by: Michal Simek --- arch/microblaze/boot/dts/system.dts | 16 ++++- arch/microblaze/configs/mmu_defconfig | 2 + arch/microblaze/configs/nommu_defconfig | 2 + arch/microblaze/kernel/reset.c | 87 ++++--------------------- 4 files changed, 30 insertions(+), 77 deletions(-) diff --git a/arch/microblaze/boot/dts/system.dts b/arch/microblaze/boot/dts/system.dts index 5a8a9d090c37..5b236527176e 100644 --- a/arch/microblaze/boot/dts/system.dts +++ b/arch/microblaze/boot/dts/system.dts @@ -18,7 +18,6 @@ #address-cells = <1>; #size-cells = <1>; compatible = "xlnx,microblaze"; - hard-reset-gpios = <&LEDs_8Bit 2 1>; model = "testing"; DDR2_SDRAM: memory@90000000 { device_type = "memory"; @@ -281,6 +280,21 @@ gpios = <&LEDs_8Bit 7 1>; }; } ; + + gpio-restart { + compatible = "gpio-restart"; + /* + * FIXME: is this active low or active high? + * the current flag (1) indicates active low. + * delay measures are templates, should be adjusted + * to datasheet or trial-and-error with real hardware. + */ + gpios = <&LEDs_8Bit 2 1>; + active-delay = <100>; + inactive-delay = <10>; + wait-delay = <100>; + }; + RS232_Uart_1: serial@84000000 { clock-frequency = <125000000>; compatible = "xlnx,xps-uartlite-1.00.a"; diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index b83aff1e86cd..654edfdc7867 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -60,6 +60,8 @@ CONFIG_SPI_XILINX=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y CONFIG_GPIO_XILINX=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_GPIO_RESTART=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_XILINX_WATCHDOG=y diff --git a/arch/microblaze/configs/nommu_defconfig b/arch/microblaze/configs/nommu_defconfig index dc102eb65fca..377de39ccb8c 100644 --- a/arch/microblaze/configs/nommu_defconfig +++ b/arch/microblaze/configs/nommu_defconfig @@ -62,6 +62,8 @@ CONFIG_SPI_XILINX=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y CONFIG_GPIO_XILINX=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_GPIO_RESTART=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_XILINX_WATCHDOG=y diff --git a/arch/microblaze/kernel/reset.c b/arch/microblaze/kernel/reset.c index fcbe1daf6316..5f4722908164 100644 --- a/arch/microblaze/kernel/reset.c +++ b/arch/microblaze/kernel/reset.c @@ -8,83 +8,9 @@ */ #include +#include #include - -/* Trigger specific functions */ -#ifdef CONFIG_GPIOLIB - -#include - -static int handle; /* reset pin handle */ -static unsigned int reset_val; - -static int of_platform_reset_gpio_probe(void) -{ - int ret; - handle = of_get_named_gpio(of_find_node_by_path("/"), - "hard-reset-gpios", 0); - - if (!gpio_is_valid(handle)) { - pr_info("Skipping unavailable RESET gpio %d (%s)\n", - handle, "reset"); - return -ENODEV; - } - - ret = gpio_request(handle, "reset"); - if (ret < 0) { - pr_info("GPIO pin is already allocated\n"); - return ret; - } - - /* get current setup value */ - reset_val = gpio_get_value(handle); - /* FIXME maybe worth to perform any action */ - pr_debug("Reset: Gpio output state: 0x%x\n", reset_val); - - /* Setup GPIO as output */ - ret = gpio_direction_output(handle, 0); - if (ret < 0) - goto err; - - /* Setup output direction */ - gpio_set_value(handle, 0); - - pr_info("RESET: Registered gpio device: %d, current val: %d\n", - handle, reset_val); - return 0; -err: - gpio_free(handle); - return ret; -} -device_initcall(of_platform_reset_gpio_probe); - - -static void gpio_system_reset(void) -{ - if (gpio_is_valid(handle)) - gpio_set_value(handle, 1 - reset_val); - else - pr_notice("Reset GPIO unavailable - halting!\n"); -} -#else -static void gpio_system_reset(void) -{ - pr_notice("No reset GPIO present - halting!\n"); -} - -void of_platform_reset_gpio_probe(void) -{ - return; -} -#endif - -void machine_restart(char *cmd) -{ - pr_notice("Machine restart...\n"); - gpio_system_reset(); - while (1) - ; -} +#include void machine_shutdown(void) { @@ -106,3 +32,12 @@ void machine_power_off(void) while (1) ; } + +void machine_restart(char *cmd) +{ + do_kernel_restart(cmd); + /* Give the restart hook 1 s to take us down */ + mdelay(1000); + pr_emerg("Reboot failed -- System halted\n"); + while (1); +} From f71fee2711a788b94ff0acb02fbd2bfe2de7e0a3 Mon Sep 17 00:00:00 2001 From: Ingo Franzki Date: Tue, 20 Aug 2019 14:57:20 +0200 Subject: [PATCH 0610/2880] s390/pkey: Add sysfs attributes to emit AES CIPHER key blobs Now that the pkey kernel module also supports CCA AES CIPHER keys: Add binary read-only sysfs attributes for the pkey module that can be used to read random CCA AES CIPHER secure keys from, similar to the already existing sysfs attributes for AES DATA and random protected keys. Keys are read from these attributes using a cat-like interface. A typical use case for those keys is to encrypt a swap device using the paes cipher. During processing of /etc/crypttab, the CCA random AES CIPHER secure key to encrypt the swap device is read from one of the attributes. The following attributes are added: ccacipher/ccacipher_aes_128 ccacipher/ccacipher_aes_192 ccacipher/ccacipher_aes_256 ccacipher/ccacipher_aes_128_xts ccacipher/ccacipher_aes_256_xts Each attribute emits a secure key blob for the corresponding key size and cipher mode. Signed-off-by: Ingo Franzki Reviewed-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/pkey_api.c | 113 +++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index f76a1d0f54c4..9de3d46b3253 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -1363,9 +1363,122 @@ static struct attribute_group ccadata_attr_group = { .bin_attrs = ccadata_attrs, }; +#define CCACIPHERTOKENSIZE (sizeof(struct cipherkeytoken) + 80) + +/* + * Sysfs attribute read function for all secure key ccacipher binary attributes. + * The implementation can not deal with partial reads, because a new random + * secure key blob is generated with each read. In case of partial reads + * (i.e. off != 0 or count < key blob size) -EINVAL is returned. + */ +static ssize_t pkey_ccacipher_aes_attr_read(enum pkey_key_size keybits, + bool is_xts, char *buf, loff_t off, + size_t count) +{ + size_t keysize; + int rc; + + if (off != 0 || count < CCACIPHERTOKENSIZE) + return -EINVAL; + if (is_xts) + if (count < 2 * CCACIPHERTOKENSIZE) + return -EINVAL; + + keysize = CCACIPHERTOKENSIZE; + rc = cca_gencipherkey(-1, -1, keybits, 0, buf, &keysize); + if (rc) + return rc; + memset(buf + keysize, 0, CCACIPHERTOKENSIZE - keysize); + + if (is_xts) { + keysize = CCACIPHERTOKENSIZE; + rc = cca_gencipherkey(-1, -1, keybits, 0, + buf + CCACIPHERTOKENSIZE, &keysize); + if (rc) + return rc; + memset(buf + CCACIPHERTOKENSIZE + keysize, 0, + CCACIPHERTOKENSIZE - keysize); + + return 2 * CCACIPHERTOKENSIZE; + } + + return CCACIPHERTOKENSIZE; +} + +static ssize_t ccacipher_aes_128_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_128, false, buf, + off, count); +} + +static ssize_t ccacipher_aes_192_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_192, false, buf, + off, count); +} + +static ssize_t ccacipher_aes_256_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_256, false, buf, + off, count); +} + +static ssize_t ccacipher_aes_128_xts_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_128, true, buf, + off, count); +} + +static ssize_t ccacipher_aes_256_xts_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_256, true, buf, + off, count); +} + +static BIN_ATTR_RO(ccacipher_aes_128, CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_192, CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_256, CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_128_xts, 2 * CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_256_xts, 2 * CCACIPHERTOKENSIZE); + +static struct bin_attribute *ccacipher_attrs[] = { + &bin_attr_ccacipher_aes_128, + &bin_attr_ccacipher_aes_192, + &bin_attr_ccacipher_aes_256, + &bin_attr_ccacipher_aes_128_xts, + &bin_attr_ccacipher_aes_256_xts, + NULL +}; + +static struct attribute_group ccacipher_attr_group = { + .name = "ccacipher", + .bin_attrs = ccacipher_attrs, +}; + static const struct attribute_group *pkey_attr_groups[] = { &protkey_attr_group, &ccadata_attr_group, + &ccacipher_attr_group, NULL, }; From b91d9e67e50bfc79850a1760b8e59dc1dc56057f Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 11 Sep 2019 18:41:03 +0200 Subject: [PATCH 0611/2880] s390/cio: fix intparm documentation The common I/O layer is maintaining an "intparm" inspired by the hardware intparm for driver usage. This "intparm" is not only applicaple for ssch, but also for hsch/csch. The kerneldoc states that it is only updated for hsch/csch if no prior request is pending; however, this is not what the code does (whether that would actually desireable is a different issue.) Let's at least fix the kerneldoc for now. Fixes: b2ffd8e9a76e ("[S390] cio: Add docbook comments.") Signed-off-by: Cornelia Huck Signed-off-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- drivers/s390/cio/device_ops.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index d722458c5928..65841af15748 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -124,9 +124,7 @@ EXPORT_SYMBOL(ccw_device_is_multipath); /** * ccw_device_clear() - terminate I/O request processing * @cdev: target ccw device - * @intparm: interruption parameter; value is only used if no I/O is - * outstanding, otherwise the intparm associated with the I/O request - * is returned + * @intparm: interruption parameter to be returned upon conclusion of csch * * ccw_device_clear() calls csch on @cdev's subchannel. * Returns: @@ -179,6 +177,9 @@ int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm) * completed during the time specified by @expires. If a timeout occurs, the * channel program is terminated via xsch, hsch or csch, and the device's * interrupt handler will be called with an irb containing ERR_PTR(-%ETIMEDOUT). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -256,6 +257,9 @@ int ccw_device_start_timeout_key(struct ccw_device *cdev, struct ccw1 *cpa, * Start a S/390 channel program. When the interrupt arrives, the * IRQ handler is called, either immediately, delayed (dev-end missing, * or sense required) or never (no IRQ handler registered). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -287,6 +291,9 @@ int ccw_device_start_key(struct ccw_device *cdev, struct ccw1 *cpa, * Start a S/390 channel program. When the interrupt arrives, the * IRQ handler is called, either immediately, delayed (dev-end missing, * or sense required) or never (no IRQ handler registered). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -322,6 +329,9 @@ int ccw_device_start(struct ccw_device *cdev, struct ccw1 *cpa, * completed during the time specified by @expires. If a timeout occurs, the * channel program is terminated via xsch, hsch or csch, and the device's * interrupt handler will be called with an irb containing ERR_PTR(-%ETIMEDOUT). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -343,11 +353,12 @@ int ccw_device_start_timeout(struct ccw_device *cdev, struct ccw1 *cpa, /** * ccw_device_halt() - halt I/O request processing * @cdev: target ccw device - * @intparm: interruption parameter; value is only used if no I/O is - * outstanding, otherwise the intparm associated with the I/O request - * is returned + * @intparm: interruption parameter to be returned upon conclusion of hsch * * ccw_device_halt() calls hsch on @cdev's subchannel. + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_clear(). * Returns: * %0 on success, * -%ENODEV on device not operational, From cf2957f3907e44ca40392ac19a0c22a14e3fdc18 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 16 Aug 2019 11:05:58 +0200 Subject: [PATCH 0612/2880] s390/zcrypt: CEX7S exploitation support This patch adds CEX7 exploitation support for the AP bus code, the zcrypt device driver zoo and the vfio device driver. Signed-off-by: Harald Freudenberger Reviewed-by: Ingo Franzki Signed-off-by: Vasily Gorbik --- arch/s390/include/uapi/asm/zcrypt.h | 4 +- drivers/s390/crypto/ap_bus.c | 12 ++--- drivers/s390/crypto/ap_bus.h | 3 +- drivers/s390/crypto/vfio_ap_drv.c | 2 + drivers/s390/crypto/zcrypt_api.h | 3 +- drivers/s390/crypto/zcrypt_cex4.c | 72 +++++++++++++++++++++-------- 6 files changed, 67 insertions(+), 29 deletions(-) diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index 8c5755f41dde..f9e5e1f0821d 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -4,7 +4,7 @@ * * zcrypt 2.2.1 (user-visible header) * - * Copyright IBM Corp. 2001, 2018 + * Copyright IBM Corp. 2001, 2019 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * @@ -286,7 +286,7 @@ struct zcrypt_device_matrix_ext { * 0x08: CEX3A * 0x0a: CEX4 * 0x0b: CEX5 - * 0x0c: CEX6 + * 0x0c: CEX6 and CEX7 * 0x0d: device is disabled * * ZCRYPT_QDEPTH_MASK diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index a76b8a8bcbbb..a1915061932e 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1322,24 +1322,24 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) /* < CEX2A is not supported */ if (rawtype < AP_DEVICE_TYPE_CEX2A) return 0; - /* up to CEX6 known and fully supported */ - if (rawtype <= AP_DEVICE_TYPE_CEX6) + /* up to CEX7 known and fully supported */ + if (rawtype <= AP_DEVICE_TYPE_CEX7) return rawtype; /* - * unknown new type > CEX6, check for compatibility + * unknown new type > CEX7, check for compatibility * to the highest known and supported type which is - * currently CEX6 with the help of the QACT function. + * currently CEX7 with the help of the QACT function. */ if (ap_qact_available()) { struct ap_queue_status status; union ap_qact_ap_info apinfo = {0}; apinfo.mode = (func >> 26) & 0x07; - apinfo.cat = AP_DEVICE_TYPE_CEX6; + apinfo.cat = AP_DEVICE_TYPE_CEX7; status = ap_qact(qid, 0, &apinfo); if (status.response_code == AP_RESPONSE_NORMAL && apinfo.cat >= AP_DEVICE_TYPE_CEX2A - && apinfo.cat <= AP_DEVICE_TYPE_CEX6) + && apinfo.cat <= AP_DEVICE_TYPE_CEX7) comp_type = apinfo.cat; } if (!comp_type) diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 6f3cf37776ca..433b7b64368d 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * Copyright IBM Corp. 2006, 2012 + * Copyright IBM Corp. 2006, 2019 * Author(s): Cornelia Huck * Martin Schwidefsky * Ralph Wuerthner @@ -63,6 +63,7 @@ static inline int ap_test_bit(unsigned int *ptr, unsigned int nr) #define AP_DEVICE_TYPE_CEX4 10 #define AP_DEVICE_TYPE_CEX5 11 #define AP_DEVICE_TYPE_CEX6 12 +#define AP_DEVICE_TYPE_CEX7 13 /* * Known function facilities diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index 003662aa8060..be2520cc010b 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -36,6 +36,8 @@ static struct ap_device_id ap_queue_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX6, .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX7, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { /* end of sibling */ }, }; diff --git a/drivers/s390/crypto/zcrypt_api.h b/drivers/s390/crypto/zcrypt_api.h index 2d3f2732344f..d464618cd84f 100644 --- a/drivers/s390/crypto/zcrypt_api.h +++ b/drivers/s390/crypto/zcrypt_api.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * Copyright IBM Corp. 2001, 2018 + * Copyright IBM Corp. 2001, 2019 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * Cornelia Huck @@ -29,6 +29,7 @@ #define ZCRYPT_CEX4 10 #define ZCRYPT_CEX5 11 #define ZCRYPT_CEX6 12 +#define ZCRYPT_CEX7 13 /** * Large random numbers are pulled in 4096 byte chunks from the crypto cards diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c index f58d8dec19dc..442e3d6162f7 100644 --- a/drivers/s390/crypto/zcrypt_cex4.c +++ b/drivers/s390/crypto/zcrypt_cex4.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright IBM Corp. 2012 + * Copyright IBM Corp. 2012, 2019 * Author(s): Holger Dengler */ @@ -38,8 +38,8 @@ #define CEX4_CLEANUP_TIME (900*HZ) MODULE_AUTHOR("IBM Corporation"); -MODULE_DESCRIPTION("CEX4/CEX5/CEX6 Cryptographic Card device driver, " \ - "Copyright IBM Corp. 2018"); +MODULE_DESCRIPTION("CEX4/CEX5/CEX6/CEX7 Cryptographic Card device driver, " \ + "Copyright IBM Corp. 2019"); MODULE_LICENSE("GPL"); static struct ap_device_id zcrypt_cex4_card_ids[] = { @@ -49,6 +49,8 @@ static struct ap_device_id zcrypt_cex4_card_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX6, .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX7, + .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, { /* end of list */ }, }; @@ -61,6 +63,8 @@ static struct ap_device_id zcrypt_cex4_queue_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX6, .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX7, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { /* end of list */ }, }; @@ -146,7 +150,7 @@ static const struct attribute_group cca_queue_attr_group = { }; /** - * Probe function for CEX4/CEX5/CEX6 card device. It always + * Probe function for CEX4/CEX5/CEX6/CEX7 card device. It always * accepts the AP device since the bus_match already checked * the hardware type. * @ap_dev: pointer to the AP device. @@ -158,25 +162,31 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) * MEX_1k, MEX_2k, MEX_4k, CRT_1k, CRT_2k, CRT_4k, RNG, SECKEY */ static const int CEX4A_SPEED_IDX[] = { - 14, 19, 249, 42, 228, 1458, 0, 0}; + 14, 19, 249, 42, 228, 1458, 0, 0}; static const int CEX5A_SPEED_IDX[] = { - 8, 9, 20, 18, 66, 458, 0, 0}; + 8, 9, 20, 18, 66, 458, 0, 0}; static const int CEX6A_SPEED_IDX[] = { - 6, 9, 20, 17, 65, 438, 0, 0}; + 6, 9, 20, 17, 65, 438, 0, 0}; + static const int CEX7A_SPEED_IDX[] = { + 6, 8, 17, 15, 54, 362, 0, 0}; static const int CEX4C_SPEED_IDX[] = { 59, 69, 308, 83, 278, 2204, 209, 40}; static const int CEX5C_SPEED_IDX[] = { - 24, 31, 50, 37, 90, 479, 27, 10}; + 24, 31, 50, 37, 90, 479, 27, 10}; static const int CEX6C_SPEED_IDX[] = { - 16, 20, 32, 27, 77, 455, 23, 9}; + 16, 20, 32, 27, 77, 455, 24, 9}; + static const int CEX7C_SPEED_IDX[] = { + 14, 16, 26, 23, 64, 376, 23, 8}; static const int CEX4P_SPEED_IDX[] = { - 224, 313, 3560, 359, 605, 2827, 0, 50}; + 0, 0, 0, 0, 0, 0, 0, 50}; static const int CEX5P_SPEED_IDX[] = { - 63, 84, 156, 83, 142, 533, 0, 10}; + 0, 0, 0, 0, 0, 0, 0, 10}; static const int CEX6P_SPEED_IDX[] = { - 55, 70, 121, 73, 129, 522, 0, 9}; + 0, 0, 0, 0, 0, 0, 0, 9}; + static const int CEX7P_SPEED_IDX[] = { + 0, 0, 0, 0, 0, 0, 0, 8}; struct ap_card *ac = to_ap_card(&ap_dev->device); struct zcrypt_card *zc; @@ -198,11 +208,19 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX5; memcpy(zc->speed_rating, CEX5A_SPEED_IDX, sizeof(CEX5A_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) { zc->type_string = "CEX6A"; zc->user_space_type = ZCRYPT_CEX6; memcpy(zc->speed_rating, CEX6A_SPEED_IDX, sizeof(CEX6A_SPEED_IDX)); + } else { + zc->type_string = "CEX7A"; + /* wrong user space type, just for compatibility + * with the ZCRYPT_STATUS_MASK ioctl. + */ + zc->user_space_type = ZCRYPT_CEX6; + memcpy(zc->speed_rating, CEX7A_SPEED_IDX, + sizeof(CEX7A_SPEED_IDX)); } zc->min_mod_size = CEX4A_MIN_MOD_SIZE; if (ap_test_bit(&ac->functions, AP_FUNC_MEX4K) && @@ -232,7 +250,7 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX3C; memcpy(zc->speed_rating, CEX5C_SPEED_IDX, sizeof(CEX5C_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) { zc->type_string = "CEX6C"; /* wrong user space type, must be CEX6 * just keep it for cca compatibility @@ -240,6 +258,14 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX3C; memcpy(zc->speed_rating, CEX6C_SPEED_IDX, sizeof(CEX6C_SPEED_IDX)); + } else { + zc->type_string = "CEX7C"; + /* wrong user space type, must be CEX7 + * just keep it for cca compatibility + */ + zc->user_space_type = ZCRYPT_CEX3C; + memcpy(zc->speed_rating, CEX7C_SPEED_IDX, + sizeof(CEX7C_SPEED_IDX)); } zc->min_mod_size = CEX4C_MIN_MOD_SIZE; zc->max_mod_size = CEX4C_MAX_MOD_SIZE; @@ -255,11 +281,19 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX5; memcpy(zc->speed_rating, CEX5P_SPEED_IDX, sizeof(CEX5P_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) { zc->type_string = "CEX6P"; zc->user_space_type = ZCRYPT_CEX6; memcpy(zc->speed_rating, CEX6P_SPEED_IDX, sizeof(CEX6P_SPEED_IDX)); + } else { + zc->type_string = "CEX7P"; + /* wrong user space type, just for compatibility + * with the ZCRYPT_STATUS_MASK ioctl. + */ + zc->user_space_type = ZCRYPT_CEX6; + memcpy(zc->speed_rating, CEX7P_SPEED_IDX, + sizeof(CEX7P_SPEED_IDX)); } zc->min_mod_size = CEX4C_MIN_MOD_SIZE; zc->max_mod_size = CEX4C_MAX_MOD_SIZE; @@ -289,8 +323,8 @@ out: } /** - * This is called to remove the CEX4/CEX5/CEX6 card driver information - * if an AP card device is removed. + * This is called to remove the CEX4/CEX5/CEX6/CEX7 card driver + * information if an AP card device is removed. */ static void zcrypt_cex4_card_remove(struct ap_device *ap_dev) { @@ -311,7 +345,7 @@ static struct ap_driver zcrypt_cex4_card_driver = { }; /** - * Probe function for CEX4/CEX5/CEX6 queue device. It always + * Probe function for CEX4/CEX5/CEX6/CEX7 queue device. It always * accepts the AP device since the bus_match already checked * the hardware type. * @ap_dev: pointer to the AP device. @@ -369,7 +403,7 @@ out: } /** - * This is called to remove the CEX4/CEX5/CEX6 queue driver + * This is called to remove the CEX4/CEX5/CEX6/CEX7 queue driver * information if an AP queue device is removed. */ static void zcrypt_cex4_queue_remove(struct ap_device *ap_dev) From 2cb549a821e9daf441b62d55ca8eb6a9c22acf95 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 16 Sep 2019 11:21:23 +0200 Subject: [PATCH 0613/2880] s390/cpum_sf: Support ioctl PERF_EVENT_IOC_PERIOD A perf_event can be set up to deliver overflow notifications via SIGIO signal. The setup of the event is: 1. create event with perf_event_open() 2. assign it a signal for I/O notification with fcntl() 3. Install signal handler and consume samples The initial setup of perf_event_open() determines the period/frequency time span needed to elapse before each signal is delivered to the user process. While the event is active, system call ioctl(.., PERF_EVENT_IOC_PERIOD, value) can be used the change the frequency/period time span of the active event. The remaining signal handler invocations honour the new value. This does not work on s390. In fact the time span does not change regardless of ioctl's third argument 'value'. The call succeeds but the time span does not change. Support this behavior and make it common with other platforms. This is achieved by changing the interval value of the sampling control block accordingly and feed this new value every time the event is enabled using pmu_event_enable(). Before this change the interval value was set only once at pmu_event_add() and never changed. Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/perf_event.h | 2 + arch/s390/kernel/perf_cpum_sf.c | 165 ++++++++++++++++++++++------- 2 files changed, 127 insertions(+), 40 deletions(-) diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index 560d8f766ddf..4652ffffe0b2 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -60,6 +60,7 @@ struct perf_sf_sde_regs { #define PERF_CPUM_SF_MODE_MASK (PERF_CPUM_SF_BASIC_MODE| \ PERF_CPUM_SF_DIAG_MODE) #define PERF_CPUM_SF_FULL_BLOCKS 0x0004 /* Process full SDBs only */ +#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ #define REG_NONE 0 #define REG_OVERFLOW 1 @@ -70,5 +71,6 @@ struct perf_sf_sde_regs { #define SAMPL_FLAGS(hwc) ((hwc)->config_base) #define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) #define SDB_FULL_BLOCKS(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS) +#define SAMPLE_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) #endif /* _ASM_S390_PERF_EVENT_H */ diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 292a452cd1f3..544a02e944c6 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -673,13 +673,89 @@ out: rcu_read_unlock(); } +static unsigned long getrate(bool freq, unsigned long sample, + struct hws_qsi_info_block *si) +{ + unsigned long rate; + + if (freq) { + rate = freq_to_sample_rate(si, sample); + rate = hw_limit_rate(si, rate); + } else { + /* The min/max sampling rates specifies the valid range + * of sample periods. If the specified sample period is + * out of range, limit the period to the range boundary. + */ + rate = hw_limit_rate(si, sample); + + /* The perf core maintains a maximum sample rate that is + * configurable through the sysctl interface. Ensure the + * sampling rate does not exceed this value. This also helps + * to avoid throttling when pushing samples with + * perf_event_overflow(). + */ + if (sample_rate_to_freq(si, rate) > + sysctl_perf_event_sample_rate) { + debug_sprintf_event(sfdbg, 1, + "Sampling rate exceeds maximum " + "perf sample rate\n"); + rate = 0; + } + } + return rate; +} + +/* The sampling information (si) contains information about the + * min/max sampling intervals and the CPU speed. So calculate the + * correct sampling interval and avoid the whole period adjust + * feedback loop. + * + * Since the CPU Measurement sampling facility can not handle frequency + * calculate the sampling interval when frequency is specified using + * this formula: + * interval := cpu_speed * 1000000 / sample_freq + * + * Returns errno on bad input and zero on success with parameter interval + * set to the correct sampling rate. + * + * Note: This function turns off freq bit to avoid calling function + * perf_adjust_period(). This causes frequency adjustment in the common + * code part which causes tremendous variations in the counter values. + */ +static int __hw_perf_event_init_rate(struct perf_event *event, + struct hws_qsi_info_block *si) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + unsigned long rate; + + if (attr->freq) { + if (!attr->sample_freq) + return -EINVAL; + rate = getrate(attr->freq, attr->sample_freq, si); + attr->freq = 0; /* Don't call perf_adjust_period() */ + SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FREQ_MODE; + } else { + rate = getrate(attr->freq, attr->sample_period, si); + if (!rate) + return -EINVAL; + } + attr->sample_period = rate; + SAMPL_RATE(hwc) = rate; + hw_init_period(hwc, SAMPL_RATE(hwc)); + debug_sprintf_event(sfdbg, 4, "__hw_perf_event_init_rate:" + "cpu:%d period:%llx freq:%d,%#lx\n", event->cpu, + event->attr.sample_period, event->attr.freq, + SAMPLE_FREQ_MODE(hwc)); + return 0; +} + static int __hw_perf_event_init(struct perf_event *event) { struct cpu_hw_sf *cpuhw; struct hws_qsi_info_block si; struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; - unsigned long rate; int cpu, err; /* Reserve CPU-measurement sampling facility */ @@ -745,43 +821,9 @@ static int __hw_perf_event_init(struct perf_event *event) if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS) SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS; - /* The sampling information (si) contains information about the - * min/max sampling intervals and the CPU speed. So calculate the - * correct sampling interval and avoid the whole period adjust - * feedback loop. - */ - rate = 0; - if (attr->freq) { - if (!attr->sample_freq) { - err = -EINVAL; - goto out; - } - rate = freq_to_sample_rate(&si, attr->sample_freq); - rate = hw_limit_rate(&si, rate); - attr->freq = 0; - attr->sample_period = rate; - } else { - /* The min/max sampling rates specifies the valid range - * of sample periods. If the specified sample period is - * out of range, limit the period to the range boundary. - */ - rate = hw_limit_rate(&si, hwc->sample_period); - - /* The perf core maintains a maximum sample rate that is - * configurable through the sysctl interface. Ensure the - * sampling rate does not exceed this value. This also helps - * to avoid throttling when pushing samples with - * perf_event_overflow(). - */ - if (sample_rate_to_freq(&si, rate) > - sysctl_perf_event_sample_rate) { - err = -EINVAL; - debug_sprintf_event(sfdbg, 1, "Sampling rate exceeds maximum perf sample rate\n"); - goto out; - } - } - SAMPL_RATE(hwc) = rate; - hw_init_period(hwc, SAMPL_RATE(hwc)); + err = __hw_perf_event_init_rate(event, &si); + if (err) + goto out; /* Initialize sample data overflow accounting */ hwc->extra_reg.reg = REG_OVERFLOW; @@ -904,6 +946,8 @@ static void cpumsf_pmu_enable(struct pmu *pmu) if (sfb_has_pending_allocs(&cpuhw->sfb, hwc)) extend_sampling_buffer(&cpuhw->sfb, hwc); } + /* Rate may be adjusted with ioctl() */ + cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw); } /* (Re)enable the PMU and sampling facility */ @@ -922,8 +966,9 @@ static void cpumsf_pmu_enable(struct pmu *pmu) lpp(&S390_lowcore.lpp); debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i " - "tear=%p dear=%p\n", cpuhw->lsctl.es, - cpuhw->lsctl.cs, cpuhw->lsctl.ed, cpuhw->lsctl.cd, + "interval:%lx tear=%p dear=%p\n", + cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, + cpuhw->lsctl.cd, cpuhw->lsctl.interval, (void *) cpuhw->lsctl.tear, (void *) cpuhw->lsctl.dear); } @@ -1717,6 +1762,44 @@ static void cpumsf_pmu_read(struct perf_event *event) /* Nothing to do ... updates are interrupt-driven */ } +/* Check if the new sampling period/freqeuncy is appropriate. + * + * Return non-zero on error and zero on passed checks. + */ +static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) +{ + struct hws_qsi_info_block si; + unsigned long rate; + bool do_freq; + + memset(&si, 0, sizeof(si)); + if (event->cpu == -1) { + if (qsi(&si)) + return -ENODEV; + } else { + /* Event is pinned to a particular CPU, retrieve the per-CPU + * sampling structure for accessing the CPU-specific QSI. + */ + struct cpu_hw_sf *cpuhw = &per_cpu(cpu_hw_sf, event->cpu); + + si = cpuhw->qsi; + } + + do_freq = !!SAMPLE_FREQ_MODE(&event->hw); + rate = getrate(do_freq, value, &si); + if (!rate) + return -EINVAL; + + event->attr.sample_period = rate; + SAMPL_RATE(&event->hw) = rate; + hw_init_period(&event->hw, SAMPL_RATE(&event->hw)); + debug_sprintf_event(sfdbg, 4, "cpumsf_pmu_check_period:" + "cpu:%d value:%llx period:%llx freq:%d\n", + event->cpu, value, + event->attr.sample_period, do_freq); + return 0; +} + /* Activate sampling control. * Next call of pmu_enable() starts sampling. */ @@ -1908,6 +1991,8 @@ static struct pmu cpumf_sampling = { .setup_aux = aux_buffer_setup, .free_aux = aux_buffer_free, + + .check_period = cpumsf_pmu_check_period, }; static void cpumf_measurement_alert(struct ext_code ext_code, From 93426cadc339939d723a71fa1e2fdbfb97773ef0 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 16 Sep 2019 11:31:43 +0200 Subject: [PATCH 0614/2880] s390/cpumf: Remove mixed white space Remove blanks in comment and replace them by tabs. Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/cpu_mf.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index ae3e3221d4b5..ceeb552d3472 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -70,7 +70,7 @@ struct hws_qsi_info_block { /* Bit(s) */ unsigned long tear; /* 24-31: TEAR contents */ unsigned long dear; /* 32-39: DEAR contents */ unsigned int rsvrd0; /* 40-43: reserved */ - unsigned int cpu_speed; /* 44-47: CPU speed */ + unsigned int cpu_speed; /* 44-47: CPU speed */ unsigned long long rsvrd1; /* 48-55: reserved */ unsigned long long rsvrd2; /* 56-63: reserved */ } __packed; @@ -89,10 +89,10 @@ struct hws_lsctl_request_block { unsigned long tear; /* 16-23: TEAR contents */ unsigned long dear; /* 24-31: DEAR contents */ /* 32-63: */ - unsigned long rsvrd1; /* reserved */ - unsigned long rsvrd2; /* reserved */ - unsigned long rsvrd3; /* reserved */ - unsigned long rsvrd4; /* reserved */ + unsigned long rsvrd1; /* reserved */ + unsigned long rsvrd2; /* reserved */ + unsigned long rsvrd3; /* reserved */ + unsigned long rsvrd4; /* reserved */ } __packed; struct hws_basic_entry { From 280ceaed79f18db930c0cc8bb21f6493490bf29c Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 19 Sep 2019 10:23:08 +0200 Subject: [PATCH 0615/2880] usbnet: sanity checking of packet sizes and device mtu After a reset packet sizes and device mtu can change and need to be reevaluated to calculate queue sizes. Malicious devices can set this to zero and we divide by it. Introduce sanity checking. Reported-and-tested-by: syzbot+6102c120be558c885f04@syzkaller.appspotmail.com Signed-off-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 58952a79b05f..e44849499b89 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -339,6 +339,8 @@ void usbnet_update_max_qlen(struct usbnet *dev) { enum usb_device_speed speed = dev->udev->speed; + if (!dev->rx_urb_size || !dev->hard_mtu) + goto insanity; switch (speed) { case USB_SPEED_HIGH: dev->rx_qlen = MAX_QUEUE_MEMORY / dev->rx_urb_size; @@ -355,6 +357,7 @@ void usbnet_update_max_qlen(struct usbnet *dev) dev->tx_qlen = 5 * MAX_QUEUE_MEMORY / dev->hard_mtu; break; default: +insanity: dev->rx_qlen = dev->tx_qlen = 4; } } From 44460efe44e05eae2f21e57d06d542bbbb792e65 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Sat, 14 Sep 2019 12:45:43 -0700 Subject: [PATCH 0616/2880] tools/power/x86/intel-speed-select: Fix high priority core mask over count If the CPU package has the less logical CPU than topo_max_cpus, but un-present CPU's punit_cpu_core will be initiated to 0 and they will be count to core 0 Like below, there are only 10 high priority cores (20 logical CPUs) in the CPU package, but it count to 27 logic CPUs. ./intel-speed-select base-freq info -l 0 | grep mask high-priority-cpu-mask:7f000179,f000179f With the fix patch: ./intel-speed-select base-freq info -l 0 high-priority-cpu-mask:00000179,f000179f Signed-off-by: Youquan Song Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- tools/power/x86/intel-speed-select/isst-config.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 59753b3917bb..83ac72902b36 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -402,6 +402,9 @@ void set_cpu_mask_from_punit_coremask(int cpu, unsigned long long core_mask, int j; for (j = 0; j < topo_max_cpus; ++j) { + if (!CPU_ISSET_S(j, present_cpumask_size, present_cpumask)) + continue; + if (cpu_map[j].pkg_id == pkg_id && cpu_map[j].die_id == die_id && cpu_map[j].punit_cpu_core == i) { From 3c64c81ad1f06823b603e9143193dcb4f3121a2f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 14 Sep 2019 12:45:44 -0700 Subject: [PATCH 0617/2880] tools/power/x86/intel-speed-select: Allow online/offline based on tdp Using enable core mask, do online offline CPUs. There is a new option --online|-o for set-config-level. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel-speed-select/isst-config.c | 60 +++++++++++++++++-- tools/power/x86/intel-speed-select/isst.h | 2 + 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 83ac72902b36..e505aaf2beef 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -38,6 +38,7 @@ static int fact_avx = 0xFF; static unsigned long long fact_trl; static int out_format_json; static int cmd_help; +static int force_online_offline; /* clos related */ static int current_clos = -1; @@ -138,14 +139,14 @@ int out_format_is_json(void) int get_physical_package_id(int cpu) { return parse_int_file( - 1, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", + 0, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu); } int get_physical_core_id(int cpu) { return parse_int_file( - 1, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu); + 0, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu); } int get_physical_die_id(int cpu) @@ -165,6 +166,26 @@ int get_topo_max_cpus(void) return topo_max_cpus; } +static void set_cpu_online_offline(int cpu, int state) +{ + char buffer[128]; + int fd; + + snprintf(buffer, sizeof(buffer), + "/sys/devices/system/cpu/cpu%d/online", cpu); + + fd = open(buffer, O_WRONLY); + if (fd < 0) + err(-1, "%s open failed", buffer); + + if (state) + write(fd, "1\n", 2); + else + write(fd, "0\n", 2); + + close(fd); +} + #define MAX_PACKAGE_COUNT 8 #define MAX_DIE_PER_PACKAGE 2 static void for_each_online_package_in_set(void (*callback)(int, void *, void *, @@ -736,9 +757,34 @@ static void set_tdp_level_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, ret = isst_set_tdp_level(cpu, tdp_level); if (ret) perror("set_tdp_level_for_cpu"); - else + else { isst_display_result(cpu, outf, "perf-profile", "set_tdp_level", ret); + if (force_online_offline) { + struct isst_pkg_ctdp_level_info ctdp_level; + int pkg_id = get_physical_package_id(cpu); + int die_id = get_physical_die_id(cpu); + + fprintf(stderr, "Option is set to online/offline\n"); + ctdp_level.core_cpumask_size = + alloc_cpu_set(&ctdp_level.core_cpumask); + isst_get_coremask_info(cpu, tdp_level, &ctdp_level); + if (ctdp_level.cpu_count) { + int i, max_cpus = get_topo_max_cpus(); + for (i = 0; i < max_cpus; ++i) { + if (pkg_id != get_physical_package_id(i) || die_id != get_physical_die_id(i)) + continue; + if (CPU_ISSET_S(i, ctdp_level.core_cpumask_size, ctdp_level.core_cpumask)) { + fprintf(stderr, "online cpu %d\n", i); + set_cpu_online_offline(i, 1); + } else { + fprintf(stderr, "offline cpu %d\n", i); + set_cpu_online_offline(i, 0); + } + } + } + } + } } static void set_tdp_level(void) @@ -747,6 +793,8 @@ static void set_tdp_level(void) fprintf(stderr, "Set Config TDP level\n"); fprintf(stderr, "\t Arguments: -l|--level : Specify tdp level\n"); + fprintf(stderr, + "\t Optional Arguments: -o | online : online/offline for the tdp level\n"); exit(0); } @@ -1319,6 +1367,7 @@ static void parse_cmd_args(int argc, int start, char **argv) static struct option long_options[] = { { "bucket", required_argument, 0, 'b' }, { "level", required_argument, 0, 'l' }, + { "online", required_argument, 0, 'o' }, { "trl-type", required_argument, 0, 'r' }, { "trl", required_argument, 0, 't' }, { "help", no_argument, 0, 'h' }, @@ -1335,7 +1384,7 @@ static void parse_cmd_args(int argc, int start, char **argv) option_index = start; optind = start + 1; - while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:h", + while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:ho", long_options, &option_index)) != -1) { switch (opt) { case 'b': @@ -1347,6 +1396,9 @@ static void parse_cmd_args(int argc, int start, char **argv) case 'l': tdp_level = atoi(optarg); break; + case 'o': + force_online_offline = 1; + break; case 't': sscanf(optarg, "0x%llx", &fact_trl); break; diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 2f7f62765eb6..668f914d077f 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -187,6 +187,8 @@ extern int isst_send_msr_command(unsigned int cpu, unsigned int command, int write, unsigned long long *req_resp); extern int isst_get_ctdp_levels(int cpu, struct isst_pkg_ctdp *pkg_dev); +extern int isst_get_coremask_info(int cpu, int config_index, + struct isst_pkg_ctdp_level_info *ctdp_level); extern int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev); extern void isst_get_process_ctdp_complete(int cpu, From e118fbe366d817175b2c47bfa338959bbff5bd37 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 14 Sep 2019 12:45:45 -0700 Subject: [PATCH 0618/2880] tools/power/x86/intel-speed-select: Format get-assoc information Format the get-assoc command output consistant with other commands. For example: Intel(R) Speed Select Technology Executing on CPU model:142[0x8e] package-0 die-0 cpu-0 get-assoc clos:0 Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel-speed-select/isst-config.c | 14 +++++++---- .../x86/intel-speed-select/isst-display.c | 23 +++++++++++++++++++ tools/power/x86/intel-speed-select/isst.h | 2 +- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index e505aaf2beef..c2892d86be36 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -1249,7 +1249,7 @@ static void get_clos_assoc_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, if (ret) perror("isst_clos_get_assoc_status"); else - isst_display_result(cpu, outf, "core-power", "get-assoc", clos); + isst_clos_display_assoc_information(cpu, outf, clos); } static void get_clos_assoc(void) @@ -1259,13 +1259,17 @@ static void get_clos_assoc(void) fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n"); exit(0); } - if (max_target_cpus) - for_each_online_target_cpu_in_set(get_clos_assoc_for_cpu, NULL, - NULL, NULL, NULL); - else { + + if (!max_target_cpus) { fprintf(stderr, "Invalid target cpu. Specify with [-c|--cpu]\n"); + exit(0); } + + isst_ctdp_display_information_start(outf); + for_each_online_target_cpu_in_set(get_clos_assoc_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); } static struct process_cmd_struct isst_cmds[] = { diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index df4aa99c4e92..bd7aaf27e4de 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -503,6 +503,29 @@ void isst_clos_display_information(int cpu, FILE *outf, int clos, format_and_print(outf, 1, NULL, NULL); } +void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(header, sizeof(header), "get-assoc"); + format_and_print(outf, 4, header, NULL); + + snprintf(header, sizeof(header), "clos"); + snprintf(value, sizeof(value), "%d", clos); + format_and_print(outf, 5, header, value); + + format_and_print(outf, 1, NULL, NULL); +} + void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, int result) { diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 668f914d077f..48655d0dee2d 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -225,7 +225,7 @@ extern int isst_clos_associate(int cpu, int clos); extern int isst_clos_get_assoc_status(int cpu, int *clos_id); extern void isst_clos_display_information(int cpu, FILE *outf, int clos, struct isst_clos_config *clos_config); - +extern void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos); extern int isst_read_reg(unsigned short reg, unsigned int *val); extern int isst_write_reg(int reg, unsigned int val); From d2d1f304dc965e6a06e7f105b09bffceb477fccc Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 14 Sep 2019 12:45:46 -0700 Subject: [PATCH 0619/2880] tools/power/x86/intel-speed-select: Fix some debug prints Fix wrong debug print for cpu, which is displayed as CLOS. Also avoid printing clos id, when user is specify clos as parameter. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- tools/power/x86/intel-speed-select/isst-config.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index c2892d86be36..889396d676cb 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -508,7 +508,7 @@ int isst_send_mbox_command(unsigned int cpu, unsigned char command, int write = 0; int clos_id, core_id, ret = 0; - debug_printf("CLOS %d\n", cpu); + debug_printf("CPU %d\n", cpu); if (parameter & BIT(MBOX_CMD_WRITE_BIT)) { value = req_data; @@ -1421,7 +1421,6 @@ static void parse_cmd_args(int argc, int start, char **argv) /* CLOS related */ case 'c': current_clos = atoi(optarg); - printf("clos %d\n", current_clos); break; case 'd': clos_desired = atoi(optarg); From 188afed9db7db3aefc8c9c33150be2f8f398da9a Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 14 Sep 2019 12:45:47 -0700 Subject: [PATCH 0620/2880] tools/power/x86/intel-speed-select: Extend core-power command set Add additional command to get the clos enable and priority type. The current info option is actually dumping per clos QOS config, so name the command appropriately to get-config. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel-speed-select/isst-config.c | 38 ++++++++++++++++++- .../power/x86/intel-speed-select/isst-core.c | 25 ++++++++++++ .../x86/intel-speed-select/isst-display.c | 28 ++++++++++++++ tools/power/x86/intel-speed-select/isst.h | 5 +++ 4 files changed, 95 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 889396d676cb..6a54e165672d 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -1133,6 +1133,40 @@ static void dump_clos_config(void) isst_ctdp_display_information_end(outf); } +static void get_clos_info_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int enable, ret, prio_type; + + ret = isst_clos_get_clos_information(cpu, &enable, &prio_type); + if (ret) + perror("isst_clos_get_info"); + else + isst_clos_display_clos_information(cpu, outf, enable, prio_type); +} + +static void dump_clos_info(void) +{ + if (cmd_help) { + fprintf(stderr, + "Print Intel Speed Select Technology core power information\n"); + fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n"); + exit(0); + } + + if (!max_target_cpus) { + fprintf(stderr, + "Invalid target cpu. Specify with [-c|--cpu]\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + for_each_online_target_cpu_in_set(get_clos_info_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); + +} + static void set_clos_config_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, void *arg4) { @@ -1286,10 +1320,11 @@ static struct process_cmd_struct isst_cmds[] = { { "turbo-freq", "info", dump_fact_config }, { "turbo-freq", "enable", set_fact_enable }, { "turbo-freq", "disable", set_fact_disable }, - { "core-power", "info", dump_clos_config }, + { "core-power", "info", dump_clos_info }, { "core-power", "enable", set_clos_enable }, { "core-power", "disable", set_clos_disable }, { "core-power", "config", set_clos_config }, + { "core-power", "get-config", dump_clos_config }, { "core-power", "assoc", set_clos_assoc }, { "core-power", "get-assoc", get_clos_assoc }, { NULL, NULL, NULL } @@ -1491,6 +1526,7 @@ static void core_power_help(void) printf("\tenable\n"); printf("\tdisable\n"); printf("\tconfig\n"); + printf("\tget-config\n"); printf("\tassoc\n"); printf("\tget-assoc\n"); } diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c index 0bf341ad9697..6dee5332c9d3 100644 --- a/tools/power/x86/intel-speed-select/isst-core.c +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -619,6 +619,31 @@ int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) return 0; } +int isst_clos_get_clos_information(int cpu, int *enable, int *type) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PM_QOS_CONFIG, 0, 0, + &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CLOS_PM_QOS_CONFIG resp:%x\n", cpu, resp); + + if (resp & BIT(1)) + *enable = 1; + else + *enable = 0; + + if (resp & BIT(2)) + *type = 1; + else + *type = 0; + + return 0; +} + int isst_pm_qos_config(int cpu, int enable_clos, int priority_type) { unsigned int req, resp; diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index bd7aaf27e4de..2e6e5fcdbd7c 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -503,6 +503,34 @@ void isst_clos_display_information(int cpu, FILE *outf, int clos, format_and_print(outf, 1, NULL, NULL); } +void isst_clos_display_clos_information(int cpu, FILE *outf, + int clos_enable, int type) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(header, sizeof(header), "core-power"); + format_and_print(outf, 4, header, NULL); + + snprintf(header, sizeof(header), "enable-status"); + snprintf(value, sizeof(value), "%d", clos_enable); + format_and_print(outf, 5, header, value); + + snprintf(header, sizeof(header), "priority-type"); + snprintf(value, sizeof(value), "%d", type); + format_and_print(outf, 5, header, value); + + format_and_print(outf, 1, NULL, NULL); +} + void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos) { char header[256]; diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 48655d0dee2d..09e16a41b57c 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -231,4 +231,9 @@ extern int isst_write_reg(int reg, unsigned int val); extern void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, int result); + +extern int isst_clos_get_clos_information(int cpu, int *enable, int *type); +extern void isst_clos_display_clos_information(int cpu, FILE *outf, + int clos_enable, int type); + #endif From b3abfd778bf1dbdd96f70bd7d00671d027f67c62 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 17 Sep 2019 17:52:37 -0700 Subject: [PATCH 0621/2880] tools/power/x86/intel-speed-select: Fix perf-profile command output commit "c016ae8f9fa04d361efc8629de49ad3af12b5262 "tools/power/x86/intel-speed-select: Output success/failed for command output" introduced a regression in perf-profile outputs. With this the result field is changed to string interpreting every non zero value as errors. But these commands display on zero (>0) result. For example before this commit the display was: package-1 die-0 cpu-14 get-config-levels:4 Here the get-config-levels is interpreted as error and displayed as error with the above commit: package-1 die-0 cpu-14 get-config-levels:failed(error 4) Fix this issue by not using isst_display_result() to display such results, but define a new function which formats this data and prints. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel-speed-select/isst-config.c | 4 ++-- .../x86/intel-speed-select/isst-display.c | 20 +++++++++++++++++++ tools/power/x86/intel-speed-select/isst.h | 3 ++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 6a54e165672d..2a9890c8395a 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -673,8 +673,8 @@ static void exec_on_get_ctdp_cpu(int cpu, void *arg1, void *arg2, void *arg3, if (ret) perror("get_tdp_*"); else - isst_display_result(cpu, outf, "perf-profile", (char *)arg3, - *(unsigned int *)arg4); + isst_ctdp_display_core_info(cpu, outf, arg3, + *(unsigned int *)arg4); } #define _get_tdp_level(desc, suffix, object, help) \ diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index 2e6e5fcdbd7c..40346d534f78 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -287,6 +287,26 @@ static void _isst_fact_display_information(int cpu, FILE *outf, int level, format_and_print(outf, base_level + 2, header, value); } +void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix, + unsigned int val) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(value, sizeof(value), "%u", val); + format_and_print(outf, 4, prefix, value); + + format_and_print(outf, 1, NULL, NULL); +} + void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev) { diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 09e16a41b57c..d280b27d600d 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -195,6 +195,8 @@ extern void isst_get_process_ctdp_complete(int cpu, struct isst_pkg_ctdp *pkg_dev); extern void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev); +extern void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix, + unsigned int val); extern void isst_ctdp_display_information_start(FILE *outf); extern void isst_ctdp_display_information_end(FILE *outf); extern void isst_pbf_display_information(int cpu, FILE *outf, int level, @@ -235,5 +237,4 @@ extern void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, extern int isst_clos_get_clos_information(int cpu, int *enable, int *type); extern void isst_clos_display_clos_information(int cpu, FILE *outf, int clos_enable, int type); - #endif From e0973a421c6e9d268db2157bcb8756e7ab4b4313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 16 Sep 2019 14:33:42 +0200 Subject: [PATCH 0622/2880] libbpf: Remove getsockopt() check for XDP_OPTIONS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xsk_socket__create() function fails and returns an error if it cannot get the XDP_OPTIONS through getsockopt(). However, support for XDP_OPTIONS was not added until kernel 5.3, so this means that creating XSK sockets always fails on older kernels. Since the option is just used to set the zero-copy flag in the xsk struct, and that flag is not really used for anything yet, just remove the getsockopt() call until a proper use for it is introduced. Suggested-by: Yonghong Song Signed-off-by: Toke Høiland-Jørgensen Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/lib/bpf/xsk.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 842c4fd55859..24fa313524fb 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -65,7 +65,6 @@ struct xsk_socket { int xsks_map_fd; __u32 queue_id; char ifname[IFNAMSIZ]; - bool zc; }; struct xsk_nl_info { @@ -491,7 +490,6 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, void *rx_map = NULL, *tx_map = NULL; struct sockaddr_xdp sxdp = {}; struct xdp_mmap_offsets off; - struct xdp_options opts; struct xsk_socket *xsk; socklen_t optlen; int err; @@ -611,15 +609,6 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, xsk->prog_fd = -1; - optlen = sizeof(opts); - err = getsockopt(xsk->fd, SOL_XDP, XDP_OPTIONS, &opts, &optlen); - if (err) { - err = -errno; - goto out_mmap_tx; - } - - xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY; - if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { err = xsk_setup_xdp_prog(xsk); if (err) From 9eea984979513d6ee137e545e26c5877d46039dd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 17 Sep 2019 10:45:37 -0700 Subject: [PATCH 0623/2880] bpf: fix BTF verification of enums vmlinux BTF has enums that are 8 byte and 1 byte in size. 2 byte enum is a valid construct as well. Fix BTF enum verification to accept those sizes. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index adb3adcebe3c..722d38e543e9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2377,9 +2377,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (t->size != sizeof(int)) { - btf_verifier_log_type(env, t, "Expected size:%zu", - sizeof(int)); + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } From a0791f0df7d212c245761538b17a9ea93607b667 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 17 Sep 2019 10:45:38 -0700 Subject: [PATCH 0624/2880] bpf: fix BTF limits vmlinux BTF has more than 64k types. Its string section is also at the offset larger than 64k. Adjust both limits to make in-kernel BTF verifier successfully parse in-kernel BTF. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 63ae4a39e58b..c02dec97e1ce 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -22,9 +22,9 @@ struct btf_header { }; /* Max # of type identifier */ -#define BTF_MAX_TYPE 0x0000ffff +#define BTF_MAX_TYPE 0x000fffff /* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x0000ffff +#define BTF_MAX_NAME_OFFSET 0x00ffffff /* Max # of struct/union/enum members or func args */ #define BTF_MAX_VLEN 0xffff From 733ef7f056a5e23b66e8e7bb3508ca882db388f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 18 Sep 2019 09:57:39 +0200 Subject: [PATCH 0625/2880] xsk: relax UMEM headroom alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch removes the 64B alignment of the UMEM headroom. There is really no reason for it, and having a headroom less than 64B should be valid. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 947b8ff0227e..cdaef54d48be 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -383,8 +383,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - headroom = ALIGN(headroom, 64); - size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM; if (size_chk < 0) return -EINVAL; From b45e0c30bc58fb6fcaa42f1d1d813cefb8ab4117 Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Wed, 21 Aug 2019 14:53:40 +0530 Subject: [PATCH 0626/2880] riscv: dts: Add DT support for SiFive FU540 PWM driver Add the PWM DT node in SiFive FU540 soc-specific DT file. Enable the PWM nodes in HiFive Unleashed board-specific DT file. Signed-off-by: Yash Shah Cc: Palmer Dabbelt [paul.walmsley@sifive.com: added chip-specific compatible string; dropped reg-names string from pwm1] Signed-off-by: Paul Walmsley --- arch/riscv/boot/dts/sifive/fu540-c000.dtsi | 18 ++++++++++++++++++ .../boot/dts/sifive/hifive-unleashed-a00.dts | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index 42b5ec223100..5a29211d396e 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -230,6 +230,24 @@ #size-cells = <0>; status = "disabled"; }; + pwm0: pwm@10020000 { + compatible = "sifive,fu540-c000-pwm", "sifive,pwm0"; + reg = <0x0 0x10020000 0x0 0x1000>; + interrupt-parent = <&plic0>; + interrupts = <42 43 44 45>; + clocks = <&prci PRCI_CLK_TLCLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + pwm1: pwm@10021000 { + compatible = "sifive,fu540-c000-pwm", "sifive,pwm0"; + reg = <0x0 0x10021000 0x0 0x1000>; + interrupt-parent = <&plic0>; + interrupts = <46 47 48 49>; + clocks = <&prci PRCI_CLK_TLCLK>; + #pwm-cells = <3>; + status = "disabled"; + }; }; }; diff --git a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts index 93d68cbd64fe..104d334511cd 100644 --- a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts +++ b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts @@ -85,3 +85,11 @@ reg = <0>; }; }; + +&pwm0 { + status = "okay"; +}; + +&pwm1 { + status = "okay"; +}; From b6f2b2e600a27b79801bae0bf73e3c34438dc0ae Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Wed, 18 Sep 2019 18:38:24 +0800 Subject: [PATCH 0627/2880] RISC-V: Fix building error when CONFIG_SPARSEMEM_MANUAL=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a build break by adjusting where VMALLOC_* and FIXADDR_* are defined. This fixes the definition of the MEMMAP_* macros. CC init/main.o In file included from ./include/linux/mm.h:99, from ./include/linux/ring_buffer.h:5, from ./include/linux/trace_events.h:6, from ./include/trace/syscall.h:7, from ./include/linux/syscalls.h:85, from init/main.c:21: ./arch/riscv/include/asm/pgtable.h: In function ‘pmd_page’: ./arch/riscv/include/asm/pgtable.h:95:24: error: ‘VMALLOC_START’ undeclared (first use in this function); did you mean ‘VMEMMAP_START’? #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) ^~~~~~~~~~~~~ Fixes: d95f1a542c3d ("RISC-V: Implement sparsemem") Signed-off-by: Greentime Hu [paul.walmsley@sifive.com: minor patch description fix] Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/pgtable.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 80905b27ee98..4f4162d90586 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -83,6 +83,18 @@ extern pgd_t swapper_pg_dir[]; #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) +#define VMALLOC_END (PAGE_OFFSET - 1) +#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) + +#define FIXADDR_TOP VMALLOC_START +#ifdef CONFIG_64BIT +#define FIXADDR_SIZE PMD_SIZE +#else +#define FIXADDR_SIZE PGDIR_SIZE +#endif +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + /* * Roughly size the vmemmap space to be large enough to fit enough * struct pages to map half the virtual address space. Then @@ -429,18 +441,6 @@ static inline void pgtable_cache_init(void) /* No page table caches to initialize */ } -#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) -#define VMALLOC_END (PAGE_OFFSET - 1) -#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) - -#define FIXADDR_TOP VMALLOC_START -#ifdef CONFIG_64BIT -#define FIXADDR_SIZE PMD_SIZE -#else -#define FIXADDR_SIZE PGDIR_SIZE -#endif -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - /* * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32. * Note that PGDIR_SIZE must evenly divide TASK_SIZE. From 1cec0ce2e97f3e48d9af52e69e21bc614b14dd16 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 4 Sep 2019 16:16:16 +0000 Subject: [PATCH 0628/2880] RISC-V: Enable VIRTIO drivers in RV64 and RV32 defconfig This patch enables more VIRTIO drivers (such as console, rpmsg, 9p, rng, etc.) which are usable on KVM RISC-V Guest and Xvisor RISC-V Guest. Signed-off-by: Anup Patel Acked-by: Paolo Bonzini Reviewed-by: Paolo Bonzini Reviewed-by: Alexander Graf Signed-off-by: Paul Walmsley --- arch/riscv/configs/defconfig | 11 +++++++++++ arch/riscv/configs/rv32_defconfig | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 3efff552a261..420a0dbef386 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -29,6 +29,8 @@ CONFIG_IP_PNP_DHCP=y CONFIG_IP_PNP_BOOTP=y CONFIG_IP_PNP_RARP=y CONFIG_NETLINK_DIAG=y +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y CONFIG_PCI=y CONFIG_PCIEPORTBUS=y CONFIG_PCI_HOST_GENERIC=y @@ -39,6 +41,7 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y +CONFIG_SCSI_VIRTIO=y CONFIG_ATA=y CONFIG_SATA_AHCI=y CONFIG_SATA_AHCI_PLATFORM=y @@ -54,6 +57,7 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_EARLYCON_RISCV_SBI=y CONFIG_HVC_RISCV_SBI=y +CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_VIRTIO=y CONFIG_SPI=y @@ -61,6 +65,7 @@ CONFIG_SPI_SIFIVE=y # CONFIG_PTP_1588_CLOCK is not set CONFIG_DRM=y CONFIG_DRM_RADEON=y +CONFIG_DRM_VIRTIO_GPU=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_USB=y CONFIG_USB_XHCI_HCD=y @@ -73,7 +78,12 @@ CONFIG_USB_STORAGE=y CONFIG_USB_UAS=y CONFIG_MMC=y CONFIG_MMC_SPI=y +CONFIG_VIRTIO_PCI=y +CONFIG_VIRTIO_BALLOON=y +CONFIG_VIRTIO_INPUT=y CONFIG_VIRTIO_MMIO=y +CONFIG_RPMSG_CHAR=y +CONFIG_RPMSG_VIRTIO=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_AUTOFS4_FS=y @@ -86,6 +96,7 @@ CONFIG_NFS_V4=y CONFIG_NFS_V4_1=y CONFIG_NFS_V4_2=y CONFIG_ROOT_NFS=y +CONFIG_9P_FS=y CONFIG_CRYPTO_USER_API_HASH=y CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_PRINTK_TIME=y diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig index 7da93e494445..87ee6e62b64b 100644 --- a/arch/riscv/configs/rv32_defconfig +++ b/arch/riscv/configs/rv32_defconfig @@ -29,6 +29,8 @@ CONFIG_IP_PNP_DHCP=y CONFIG_IP_PNP_BOOTP=y CONFIG_IP_PNP_RARP=y CONFIG_NETLINK_DIAG=y +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y CONFIG_PCI=y CONFIG_PCIEPORTBUS=y CONFIG_PCI_HOST_GENERIC=y @@ -39,6 +41,7 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y +CONFIG_SCSI_VIRTIO=y CONFIG_ATA=y CONFIG_SATA_AHCI=y CONFIG_SATA_AHCI_PLATFORM=y @@ -54,11 +57,13 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_EARLYCON_RISCV_SBI=y CONFIG_HVC_RISCV_SBI=y +CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_VIRTIO=y # CONFIG_PTP_1588_CLOCK is not set CONFIG_DRM=y CONFIG_DRM_RADEON=y +CONFIG_DRM_VIRTIO_GPU=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_USB=y CONFIG_USB_XHCI_HCD=y @@ -69,7 +74,12 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_HCD_PLATFORM=y CONFIG_USB_STORAGE=y CONFIG_USB_UAS=y +CONFIG_VIRTIO_PCI=y +CONFIG_VIRTIO_BALLOON=y +CONFIG_VIRTIO_INPUT=y CONFIG_VIRTIO_MMIO=y +CONFIG_RPMSG_CHAR=y +CONFIG_RPMSG_VIRTIO=y CONFIG_SIFIVE_PLIC=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y @@ -83,6 +93,7 @@ CONFIG_NFS_V4=y CONFIG_NFS_V4_1=y CONFIG_NFS_V4_2=y CONFIG_ROOT_NFS=y +CONFIG_9P_FS=y CONFIG_CRYPTO_USER_API_HASH=y CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_PRINTK_TIME=y From 2d2e0b90a08f1e3a47b3ee852b27c219295423ef Mon Sep 17 00:00:00 2001 From: Sean Paul Date: Wed, 18 Sep 2019 16:07:28 -0400 Subject: [PATCH 0629/2880] drm: Fix kerneldoc and remove unused struct member in self_refresh helper Artifacts of previous revisions. Reviewed-by: Daniel Vetter Signed-off-by: Sean Paul Link to v1: https://patchwork.freedesktop.org/patch/msgid/20190917200443.64481-1-sean@poorly.run Link: https://patchwork.freedesktop.org/patch/msgid/20190918200734.149876-1-sean@poorly.run Changes in v2: - None --- drivers/gpu/drm/drm_self_refresh_helper.c | 1 - include/drm/drm_crtc.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_self_refresh_helper.c b/drivers/gpu/drm/drm_self_refresh_helper.c index 4b9424a8f1f1..9095cebf2147 100644 --- a/drivers/gpu/drm/drm_self_refresh_helper.c +++ b/drivers/gpu/drm/drm_self_refresh_helper.c @@ -53,7 +53,6 @@ struct drm_self_refresh_data { struct drm_crtc *crtc; struct delayed_work entry_work; - struct drm_atomic_state *save_state; unsigned int entry_delay_ms; }; diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index c4528eb5d168..408b6f4e63c0 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -1108,7 +1108,7 @@ struct drm_crtc { /** * @self_refresh_data: Holds the state for the self refresh helpers * - * Initialized via drm_self_refresh_helper_register(). + * Initialized via drm_self_refresh_helper_init(). */ struct drm_self_refresh_data *self_refresh_data; }; From d4da4e33341c5e6159543acc03559cb24f520bc2 Mon Sep 17 00:00:00 2001 From: Sean Paul Date: Wed, 18 Sep 2019 16:07:29 -0400 Subject: [PATCH 0630/2880] drm: Measure Self Refresh Entry/Exit times to avoid thrashing Currently the self refresh idle timer is a const set by the crtc. This is fine if the self refresh entry/exit times are well-known for all panels used on that crtc. However panels and workloads can vary quite a bit, and a timeout which works well for one doesn't work well for another. In the extreme, if the timeout is too short we could get in a situation where the self refresh exits are taking so long we queue up a self refresh entry before the exit commit is even finished. This patch changes the idle timeout to a moving average of the entry times + a moving average of exit times + the crtc constant. This patch was tested on rockchip, with a kevin CrOS panel the idle delay averages out to about ~235ms (35 entry + 100 exit + 100 const). On the same board, the bob panel idle delay lands around ~340ms (90 entry + 150 exit + 100 const). WRT the dedicated mutex in self_refresh_data, it would be nice if we could rely on drm_crtc.mutex to protect the average times, but there are a few reasons why a separate lock is a better choice: - We can't rely on drm_crtc.mutex being held if we're doing a nonblocking commit - We can't grab drm_crtc.mutex since drm_modeset_lock() doesn't tell us whether the lock was already held in the acquire context (it eats -EALREADY), so we can't tell if we should drop it or not - We don't need such a heavy-handed lock for what we're trying to do, commit ordering doesn't matter, so a point-of-use lock will be less contentious Reviewed-by: Daniel Vetter Signed-off-by: Sean Paul Link to v1: https://patchwork.freedesktop.org/patch/msgid/20190917200443.64481-2-sean@poorly.run Link: https://patchwork.freedesktop.org/patch/msgid/20190918200734.149876-2-sean@poorly.run Changes in v2: - Migrate locking explanation from comment to commit msg (Daniel) - Turf constant entry delay and multiply the avg times by 2 (Daniel) --- drivers/gpu/drm/drm_atomic_helper.c | 20 ++++++ drivers/gpu/drm/drm_self_refresh_helper.c | 72 +++++++++++++++++++-- drivers/gpu/drm/rockchip/rockchip_drm_vop.c | 5 +- include/drm/drm_self_refresh_helper.h | 6 +- 4 files changed, 90 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 3a67f63c3146..3ef2ac52ce94 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -26,6 +26,7 @@ */ #include +#include #include #include @@ -1580,9 +1581,23 @@ static void commit_tail(struct drm_atomic_state *old_state) { struct drm_device *dev = old_state->dev; const struct drm_mode_config_helper_funcs *funcs; + ktime_t start; + s64 commit_time_ms; funcs = dev->mode_config.helper_private; + /* + * We're measuring the _entire_ commit, so the time will vary depending + * on how many fences and objects are involved. For the purposes of self + * refresh, this is desirable since it'll give us an idea of how + * congested things are. This will inform our decision on how often we + * should enter self refresh after idle. + * + * These times will be averaged out in the self refresh helpers to avoid + * overreacting over one outlier frame + */ + start = ktime_get(); + drm_atomic_helper_wait_for_fences(dev, old_state, false); drm_atomic_helper_wait_for_dependencies(old_state); @@ -1592,6 +1607,11 @@ static void commit_tail(struct drm_atomic_state *old_state) else drm_atomic_helper_commit_tail(old_state); + commit_time_ms = ktime_ms_delta(ktime_get(), start); + if (commit_time_ms > 0) + drm_self_refresh_helper_update_avg_times(old_state, + (unsigned long)commit_time_ms); + drm_atomic_helper_commit_cleanup_done(old_state); drm_atomic_state_put(old_state); diff --git a/drivers/gpu/drm/drm_self_refresh_helper.c b/drivers/gpu/drm/drm_self_refresh_helper.c index 9095cebf2147..68f4765a5896 100644 --- a/drivers/gpu/drm/drm_self_refresh_helper.c +++ b/drivers/gpu/drm/drm_self_refresh_helper.c @@ -5,6 +5,7 @@ * Authors: * Sean Paul */ +#include #include #include #include @@ -50,10 +51,17 @@ * atomic_check when &drm_crtc_state.self_refresh_active is true. */ +#define SELF_REFRESH_AVG_SEED_MS 200 + +DECLARE_EWMA(psr_time, 4, 4) + struct drm_self_refresh_data { struct drm_crtc *crtc; struct delayed_work entry_work; - unsigned int entry_delay_ms; + + struct mutex avg_mutex; + struct ewma_psr_time entry_avg_ms; + struct ewma_psr_time exit_avg_ms; }; static void drm_self_refresh_helper_entry_work(struct work_struct *work) @@ -121,6 +129,44 @@ out_drop_locks: drm_modeset_acquire_fini(&ctx); } +/** + * drm_self_refresh_helper_update_avg_times - Updates a crtc's SR time averages + * @state: the state which has just been applied to hardware + * @commit_time_ms: the amount of time in ms that this commit took to complete + * + * Called after &drm_mode_config_funcs.atomic_commit_tail, this function will + * update the average entry/exit self refresh times on self refresh transitions. + * These averages will be used when calculating how long to delay before + * entering self refresh mode after activity. + */ +void drm_self_refresh_helper_update_avg_times(struct drm_atomic_state *state, + unsigned int commit_time_ms) +{ + struct drm_crtc *crtc; + struct drm_crtc_state *old_crtc_state, *new_crtc_state; + int i; + + for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, + new_crtc_state, i) { + struct drm_self_refresh_data *sr_data = crtc->self_refresh_data; + struct ewma_psr_time *time; + + if (old_crtc_state->self_refresh_active == + new_crtc_state->self_refresh_active) + continue; + + if (new_crtc_state->self_refresh_active) + time = &sr_data->entry_avg_ms; + else + time = &sr_data->exit_avg_ms; + + mutex_lock(&sr_data->avg_mutex); + ewma_psr_time_add(time, commit_time_ms); + mutex_unlock(&sr_data->avg_mutex); + } +} +EXPORT_SYMBOL(drm_self_refresh_helper_update_avg_times); + /** * drm_self_refresh_helper_alter_state - Alters the atomic state for SR exit * @state: the state currently being checked @@ -152,6 +198,7 @@ void drm_self_refresh_helper_alter_state(struct drm_atomic_state *state) for_each_new_crtc_in_state(state, crtc, crtc_state, i) { struct drm_self_refresh_data *sr_data; + unsigned int delay; /* Don't trigger the entry timer when we're already in SR */ if (crtc_state->self_refresh_active) @@ -161,8 +208,13 @@ void drm_self_refresh_helper_alter_state(struct drm_atomic_state *state) if (!sr_data) continue; + mutex_lock(&sr_data->avg_mutex); + delay = (ewma_psr_time_read(&sr_data->entry_avg_ms) + + ewma_psr_time_read(&sr_data->exit_avg_ms)) * 2; + mutex_unlock(&sr_data->avg_mutex); + mod_delayed_work(system_wq, &sr_data->entry_work, - msecs_to_jiffies(sr_data->entry_delay_ms)); + msecs_to_jiffies(delay)); } } EXPORT_SYMBOL(drm_self_refresh_helper_alter_state); @@ -170,12 +222,10 @@ EXPORT_SYMBOL(drm_self_refresh_helper_alter_state); /** * drm_self_refresh_helper_init - Initializes self refresh helpers for a crtc * @crtc: the crtc which supports self refresh supported displays - * @entry_delay_ms: amount of inactivity to wait before entering self refresh * * Returns zero if successful or -errno on failure */ -int drm_self_refresh_helper_init(struct drm_crtc *crtc, - unsigned int entry_delay_ms) +int drm_self_refresh_helper_init(struct drm_crtc *crtc) { struct drm_self_refresh_data *sr_data = crtc->self_refresh_data; @@ -189,8 +239,18 @@ int drm_self_refresh_helper_init(struct drm_crtc *crtc, INIT_DELAYED_WORK(&sr_data->entry_work, drm_self_refresh_helper_entry_work); - sr_data->entry_delay_ms = entry_delay_ms; sr_data->crtc = crtc; + mutex_init(&sr_data->avg_mutex); + ewma_psr_time_init(&sr_data->entry_avg_ms); + ewma_psr_time_init(&sr_data->exit_avg_ms); + + /* + * Seed the averages so they're non-zero (and sufficiently large + * for even poorly performing panels). As time goes on, this will be + * averaged out and the values will trend to their true value. + */ + ewma_psr_time_add(&sr_data->entry_avg_ms, SELF_REFRESH_AVG_SEED_MS); + ewma_psr_time_add(&sr_data->exit_avg_ms, SELF_REFRESH_AVG_SEED_MS); crtc->self_refresh_data = sr_data; return 0; diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c index 2f821c58007c..613404f86668 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c @@ -39,8 +39,6 @@ #include "rockchip_drm_vop.h" #include "rockchip_rgb.h" -#define VOP_SELF_REFRESH_ENTRY_DELAY_MS 100 - #define VOP_WIN_SET(vop, win, name, v) \ vop_reg_set(vop, &win->phy->name, win->base, ~0, v, #name) #define VOP_SCL_SET(vop, win, name, v) \ @@ -1563,8 +1561,7 @@ static int vop_create_crtc(struct vop *vop) init_completion(&vop->line_flag_completion); crtc->port = port; - ret = drm_self_refresh_helper_init(crtc, - VOP_SELF_REFRESH_ENTRY_DELAY_MS); + ret = drm_self_refresh_helper_init(crtc); if (ret) DRM_DEV_DEBUG_KMS(vop->dev, "Failed to init %s with SR helpers %d, ignoring\n", diff --git a/include/drm/drm_self_refresh_helper.h b/include/drm/drm_self_refresh_helper.h index 397a583ccca7..5b79d253fb46 100644 --- a/include/drm/drm_self_refresh_helper.h +++ b/include/drm/drm_self_refresh_helper.h @@ -12,9 +12,9 @@ struct drm_atomic_state; struct drm_crtc; void drm_self_refresh_helper_alter_state(struct drm_atomic_state *state); +void drm_self_refresh_helper_update_avg_times(struct drm_atomic_state *state, + unsigned int commit_time_ms); -int drm_self_refresh_helper_init(struct drm_crtc *crtc, - unsigned int entry_delay_ms); - +int drm_self_refresh_helper_init(struct drm_crtc *crtc); void drm_self_refresh_helper_cleanup(struct drm_crtc *crtc); #endif From 7fd25e6fc035f4b04b75bca6d7e8daa069603a76 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 19 Sep 2019 14:12:34 +0200 Subject: [PATCH 0631/2880] ieee802154: atusb: fix use-after-free at disconnect The disconnect callback was accessing the hardware-descriptor private data after having having freed it. Fixes: 7490b008d123 ("ieee802154: add support for atusb transceiver") Cc: stable # 4.2 Cc: Alexander Aring Reported-by: syzbot+f4509a9138a1472e7e80@syzkaller.appspotmail.com Signed-off-by: Johan Hovold Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/atusb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/atusb.c b/drivers/net/ieee802154/atusb.c index ceddb424f887..0dd0ba915ab9 100644 --- a/drivers/net/ieee802154/atusb.c +++ b/drivers/net/ieee802154/atusb.c @@ -1137,10 +1137,11 @@ static void atusb_disconnect(struct usb_interface *interface) ieee802154_unregister_hw(atusb->hw); + usb_put_dev(atusb->usb_dev); + ieee802154_free_hw(atusb->hw); usb_set_intfdata(interface, NULL); - usb_put_dev(atusb->usb_dev); pr_debug("%s done\n", __func__); } From e430d802d6a3aaf61bd3ed03d9404888a29b9bf9 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 19 Sep 2019 20:04:47 +0800 Subject: [PATCH 0632/2880] timer: Read jiffies once when forwarding base clk The timer delayed for more than 3 seconds warning was triggered during testing. Workqueue: events_unbound sched_tick_remote RIP: 0010:sched_tick_remote+0xee/0x100 ... Call Trace: process_one_work+0x18c/0x3a0 worker_thread+0x30/0x380 kthread+0x113/0x130 ret_from_fork+0x22/0x40 The reason is that the code in collect_expired_timers() uses jiffies unprotected: if (next_event > jiffies) base->clk = jiffies; As the compiler is allowed to reload the value base->clk can advance between the check and the store and in the worst case advance farther than next event. That causes the timer expiry to be delayed until the wheel pointer wraps around. Convert the code to use READ_ONCE() Fixes: 236968383cf5 ("timers: Optimize collect_expired_timers() for NOHZ") Signed-off-by: Li RongQing Signed-off-by: Liang ZhiCheng Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1568894687-14499-1-git-send-email-lirongqing@baidu.com --- kernel/time/timer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0e315a2e77ae..4820823515e9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1678,24 +1678,26 @@ void timer_clear_idle(void) static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { + unsigned long now = READ_ONCE(jiffies); + /* * NOHZ optimization. After a long idle sleep we need to forward the * base to current jiffies. Avoid a loop by searching the bitfield for * the next expiring timer. */ - if ((long)(jiffies - base->clk) > 2) { + if ((long)(now - base->clk) > 2) { unsigned long next = __next_timer_interrupt(base); /* * If the next timer is ahead of time forward to current * jiffies, otherwise forward to the next expiry time: */ - if (time_after(next, jiffies)) { + if (time_after(next, now)) { /* * The call site will increment base->clk and then * terminate the expiry loop immediately. */ - base->clk = jiffies; + base->clk = now; return 0; } base->clk = next; From edfa07504c5bb188b262fb2f4fc487de966f89a2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 4 Sep 2019 13:30:32 +0100 Subject: [PATCH 0633/2880] drm/panfrost: Fix regulator_get_optional() misuse The panfrost driver requests a supply using regulator_get_optional() but both the name of the supply and the usage pattern suggest that it is being used for the main power for the device and is not at all optional for the device for function, there is no meaningful handling for absent supplies. Such regulators should use the vanilla regulator_get() interface, it will ensure that even if a supply is not described in the system integration one will be provided in software. Signed-off-by: Mark Brown Signed-off-by: Rob Herring Link: https://patchwork.freedesktop.org/patch/msgid/20190904123032.23263-1-broonie@kernel.org --- drivers/gpu/drm/panfrost/panfrost_device.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index 46b0b02e4289..238fb6d54df4 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -89,12 +89,9 @@ static int panfrost_regulator_init(struct panfrost_device *pfdev) { int ret; - pfdev->regulator = devm_regulator_get_optional(pfdev->dev, "mali"); + pfdev->regulator = devm_regulator_get(pfdev->dev, "mali"); if (IS_ERR(pfdev->regulator)) { ret = PTR_ERR(pfdev->regulator); - pfdev->regulator = NULL; - if (ret == -ENODEV) - return 0; dev_err(pfdev->dev, "failed to get regulator: %d\n", ret); return ret; } @@ -110,8 +107,7 @@ static int panfrost_regulator_init(struct panfrost_device *pfdev) static void panfrost_regulator_fini(struct panfrost_device *pfdev) { - if (pfdev->regulator) - regulator_disable(pfdev->regulator); + regulator_disable(pfdev->regulator); } int panfrost_device_init(struct panfrost_device *pfdev) From d18a96620411298f7925bf9a9e26e9e7add8ea2b Mon Sep 17 00:00:00 2001 From: Steven Price Date: Fri, 6 Sep 2019 15:20:53 +0100 Subject: [PATCH 0634/2880] drm/panfrost: Remove NULL checks for regulator devm_regulator_get() is now used to populate pfdev->regulator which ensures that this cannot be NULL (a dummy regulator will be returned if necessary). So remove the checks in panfrost_devfreq_target(). Signed-off-by: Steven Price Signed-off-by: Rob Herring Link: https://patchwork.freedesktop.org/patch/msgid/3e3a2c8a-b4fc-8af6-39e1-b26160db2c7c@arm.com --- drivers/gpu/drm/panfrost/panfrost_devfreq.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c index a1f5fa6a742a..12ff77dacc95 100644 --- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c +++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c @@ -39,7 +39,7 @@ static int panfrost_devfreq_target(struct device *dev, unsigned long *freq, * If frequency scaling from low to high, adjust voltage first. * If frequency scaling from high to low, adjust frequency first. */ - if (old_clk_rate < target_rate && pfdev->regulator) { + if (old_clk_rate < target_rate) { err = regulator_set_voltage(pfdev->regulator, target_volt, target_volt); if (err) { @@ -53,14 +53,12 @@ static int panfrost_devfreq_target(struct device *dev, unsigned long *freq, if (err) { dev_err(dev, "Cannot set frequency %lu (%d)\n", target_rate, err); - if (pfdev->regulator) - regulator_set_voltage(pfdev->regulator, - pfdev->devfreq.cur_volt, - pfdev->devfreq.cur_volt); + regulator_set_voltage(pfdev->regulator, pfdev->devfreq.cur_volt, + pfdev->devfreq.cur_volt); return err; } - if (old_clk_rate > target_rate && pfdev->regulator) { + if (old_clk_rate > target_rate) { err = regulator_set_voltage(pfdev->regulator, target_volt, target_volt); if (err) From 65e51e30d8625c82ddfe405da46124e9bbffaa71 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Fri, 13 Sep 2019 17:03:10 +0100 Subject: [PATCH 0635/2880] drm/panfrost: Prevent race when handling page fault When handling a GPU page fault addr_to_drm_mm_node() is used to translate the GPU address to a buffer object. However it is possible for the buffer object to be freed after the function has returned resulting in a use-after-free of the BO. Change addr_to_drm_mm_node to return the panfrost_gem_object with an extra reference on it, preventing the BO from being freed until after the page fault has been handled. Signed-off-by: Steven Price Signed-off-by: Rob Herring Link: https://patchwork.freedesktop.org/patch/msgid/20190913160310.50444-1-steven.price@arm.com --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 55 ++++++++++++++++--------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 387d830cb7cf..6e7891ded464 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -386,28 +386,40 @@ void panfrost_mmu_pgtable_free(struct panfrost_file_priv *priv) free_io_pgtable_ops(mmu->pgtbl_ops); } -static struct drm_mm_node *addr_to_drm_mm_node(struct panfrost_device *pfdev, int as, u64 addr) +static struct panfrost_gem_object * +addr_to_drm_mm_node(struct panfrost_device *pfdev, int as, u64 addr) { - struct drm_mm_node *node = NULL; + struct panfrost_gem_object *bo = NULL; + struct panfrost_file_priv *priv; + struct drm_mm_node *node; u64 offset = addr >> PAGE_SHIFT; struct panfrost_mmu *mmu; spin_lock(&pfdev->as_lock); list_for_each_entry(mmu, &pfdev->as_lru_list, list) { - struct panfrost_file_priv *priv; - if (as != mmu->as) - continue; + if (as == mmu->as) + break; + } + if (as != mmu->as) + goto out; - priv = container_of(mmu, struct panfrost_file_priv, mmu); - drm_mm_for_each_node(node, &priv->mm) { - if (offset >= node->start && offset < (node->start + node->size)) - goto out; + priv = container_of(mmu, struct panfrost_file_priv, mmu); + + spin_lock(&priv->mm_lock); + + drm_mm_for_each_node(node, &priv->mm) { + if (offset >= node->start && + offset < (node->start + node->size)) { + bo = drm_mm_node_to_panfrost_bo(node); + drm_gem_object_get(&bo->base.base); + break; } } + spin_unlock(&priv->mm_lock); out: spin_unlock(&pfdev->as_lock); - return node; + return bo; } #define NUM_FAULT_PAGES (SZ_2M / PAGE_SIZE) @@ -415,29 +427,28 @@ out: int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, u64 addr) { int ret, i; - struct drm_mm_node *node; struct panfrost_gem_object *bo; struct address_space *mapping; pgoff_t page_offset; struct sg_table *sgt; struct page **pages; - node = addr_to_drm_mm_node(pfdev, as, addr); - if (!node) + bo = addr_to_drm_mm_node(pfdev, as, addr); + if (!bo) return -ENOENT; - bo = drm_mm_node_to_panfrost_bo(node); if (!bo->is_heap) { dev_WARN(pfdev->dev, "matching BO is not heap type (GPU VA = %llx)", - node->start << PAGE_SHIFT); - return -EINVAL; + bo->node.start << PAGE_SHIFT); + ret = -EINVAL; + goto err_bo; } WARN_ON(bo->mmu->as != as); /* Assume 2MB alignment and size multiple */ addr &= ~((u64)SZ_2M - 1); page_offset = addr >> PAGE_SHIFT; - page_offset -= node->start; + page_offset -= bo->node.start; mutex_lock(&bo->base.pages_lock); @@ -446,7 +457,8 @@ int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, u64 addr) sizeof(struct sg_table), GFP_KERNEL | __GFP_ZERO); if (!bo->sgts) { mutex_unlock(&bo->base.pages_lock); - return -ENOMEM; + ret = -ENOMEM; + goto err_bo; } pages = kvmalloc_array(bo->base.base.size >> PAGE_SHIFT, @@ -455,7 +467,8 @@ int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, u64 addr) kfree(bo->sgts); bo->sgts = NULL; mutex_unlock(&bo->base.pages_lock); - return -ENOMEM; + ret = -ENOMEM; + goto err_bo; } bo->base.pages = pages; bo->base.pages_use_count = 1; @@ -493,12 +506,16 @@ int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, u64 addr) dev_dbg(pfdev->dev, "mapped page fault @ AS%d %llx", as, addr); + drm_gem_object_put_unlocked(&bo->base.base); + return 0; err_map: sg_free_table(sgt); err_pages: drm_gem_shmem_put_pages(&bo->base); +err_bo: + drm_gem_object_put_unlocked(&bo->base.base); return ret; } From d7f76f36a8b4dc8eff0c22819e4a5d55b0dee62a Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Thu, 15 Aug 2019 11:30:14 +0530 Subject: [PATCH 0636/2880] ata: libahci_platform: Add of_node_put() before loop exit Each iteration of for_each_child_of_node puts the previous node, but in the case of a goto from the middle of the loop, there is no put, thus causing a memory leak. Add an of_node_put before three such goto statements. Issue found with Coccinelle. Reviewed-by: Hans de Goede Signed-off-by: Nishka Dasgupta Signed-off-by: Jens Axboe --- drivers/ata/libahci_platform.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 9e9583a6bba9..e742780950de 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -497,6 +497,7 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, if (of_property_read_u32(child, "reg", &port)) { rc = -EINVAL; + of_node_put(child); goto err_out; } @@ -514,14 +515,18 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, if (port_dev) { rc = ahci_platform_get_regulator(hpriv, port, &port_dev->dev); - if (rc == -EPROBE_DEFER) + if (rc == -EPROBE_DEFER) { + of_node_put(child); goto err_out; + } } #endif rc = ahci_platform_get_phy(hpriv, port, dev, child); - if (rc) + if (rc) { + of_node_put(child); goto err_out; + } enabled_ports++; } From 2d88b2cf2f002417cd7436f0fd34716e8c288fb1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 18 Sep 2019 16:49:03 +0300 Subject: [PATCH 0637/2880] iwlwifi: mvm: fix build w/o CONFIG_THERMAL Without CONFIG_THERMAL, the driver fails to link as it calls iwl_mvm_send_temp_report_ths_cmd() unconditionally. Fix this by making that function available, but do almost nothing but send the empty firmware command to enable CT-kill reporting. While at it, also fix that function itself to not error out when the thermal zone hasn't been initialized, but instead just send the empty firmware command in this case as well. Fixes: 242d9c8b9a93 ("iwlwifi: mvm: use FW thermal monitoring regardless of CONFIG_THERMAL") Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/tt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c index 32a708301cfc..f0c539b37ea7 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c @@ -555,16 +555,19 @@ static int compare_temps(const void *a, const void *b) return ((s16)le16_to_cpu(*(__le16 *)a) - (s16)le16_to_cpu(*(__le16 *)b)); } +#endif int iwl_mvm_send_temp_report_ths_cmd(struct iwl_mvm *mvm) { struct temp_report_ths_cmd cmd = {0}; - int ret, i, j, idx = 0; + int ret; +#ifdef CONFIG_THERMAL + int i, j, idx = 0; lockdep_assert_held(&mvm->mutex); if (!mvm->tz_device.tzone) - return -EINVAL; + goto send; /* The driver holds array of temperature trips that are unsorted * and uncompressed, the FW should get it compressed and sorted @@ -597,6 +600,7 @@ int iwl_mvm_send_temp_report_ths_cmd(struct iwl_mvm *mvm) } send: +#endif ret = iwl_mvm_send_cmd_pdu(mvm, WIDE_ID(PHY_OPS_GROUP, TEMP_REPORTING_THRESHOLDS_CMD), 0, sizeof(cmd), &cmd); @@ -607,6 +611,7 @@ send: return ret; } +#ifdef CONFIG_THERMAL static int iwl_mvm_tzone_get_temp(struct thermal_zone_device *device, int *temperature) { From 6fe7b9901400152238e1b76198747f6716c78aad Mon Sep 17 00:00:00 2001 From: Matthew Bobrowski Date: Thu, 19 Sep 2019 15:32:44 -0700 Subject: [PATCH 0638/2880] iomap: split size and error for iomap_dio_rw ->end_io Modify the calling convention for the iomap_dio_rw ->end_io() callback. Rather than passing either dio->error or dio->size as the 'size' argument, instead pass both the dio->error and the dio->size value separately. In the instance that an error occurred during a write, we currently cannot determine whether any blocks have been allocated beyond the current EOF and data has subsequently been written to these blocks within the ->end_io() callback. As a result, we cannot judge whether we should take the truncate failed write path. Having both dio->error and dio->size will allow us to perform such checks within this callback. Signed-off-by: Matthew Bobrowski [hch: minor cleanups] Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Matthew Wilcox (Oracle) --- fs/iomap/direct-io.c | 9 +++------ fs/xfs/xfs_file.c | 8 +++++--- include/linux/iomap.h | 4 ++-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 10517cea9682..2ccf1c6460d4 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -77,13 +77,10 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) loff_t offset = iocb->ki_pos; ssize_t ret; - if (dio->end_io) { - ret = dio->end_io(iocb, - dio->error ? dio->error : dio->size, - dio->flags); - } else { + if (dio->end_io) + ret = dio->end_io(iocb, dio->size, dio->error, dio->flags); + else ret = dio->error; - } if (likely(!ret)) { ret = dio->size; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 28101bbc0b78..74411296f6b5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -369,21 +369,23 @@ static int xfs_dio_write_end_io( struct kiocb *iocb, ssize_t size, + int error, unsigned flags) { struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; unsigned int nofs_flag; - int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (size <= 0) - return size; + if (error) + return error; + if (!size) + return 0; /* * Capture amount written on completion as we can't reliably account diff --git a/include/linux/iomap.h b/include/linux/iomap.h index bc499ceae392..50bb746d2216 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -188,8 +188,8 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, */ #define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ #define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ -typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, - unsigned flags); +typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, iomap_dio_end_io_t end_io); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); From 838c4f3d7515efe9d0e32c846fb5d102b6d8a29d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Sep 2019 15:32:45 -0700 Subject: [PATCH 0639/2880] iomap: move the iomap_dio_rw ->end_io callback into a structure Add a new iomap_dio_ops structure that for now just contains the end_io handler. This avoid storing the function pointer in a mutable structure, which is a possible exploit vector for kernel code execution, and prepares for adding a submit_io handler that btrfs needs. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/direct-io.c | 21 ++++++++++----------- fs/xfs/xfs_file.c | 6 +++++- include/linux/iomap.h | 10 +++++++--- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 2ccf1c6460d4..1fc28c2da279 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -24,7 +24,7 @@ struct iomap_dio { struct kiocb *iocb; - iomap_dio_end_io_t *end_io; + const struct iomap_dio_ops *dops; loff_t i_size; loff_t size; atomic_t ref; @@ -72,15 +72,14 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, static ssize_t iomap_dio_complete(struct iomap_dio *dio) { + const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; - ssize_t ret; + ssize_t ret = dio->error; - if (dio->end_io) - ret = dio->end_io(iocb, dio->size, dio->error, dio->flags); - else - ret = dio->error; + if (dops && dops->end_io) + ret = dops->end_io(iocb, dio->size, ret, dio->flags); if (likely(!ret)) { ret = dio->size; @@ -98,9 +97,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) * one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... * - * And this page cache invalidation has to be after dio->end_io(), as - * some filesystems convert unwritten extents to real allocations in - * end_io() when necessary, otherwise a racing buffer read would cache + * And this page cache invalidation has to be after ->end_io(), as some + * filesystems convert unwritten extents to real allocations in + * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ if (!dio->error && @@ -393,7 +392,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, iomap_dio_end_io_t end_io) + const struct iomap_ops *ops, const struct iomap_dio_ops *dops) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -418,7 +417,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, atomic_set(&dio->ref, 1); dio->size = 0; dio->i_size = i_size_read(inode); - dio->end_io = end_io; + dio->dops = dops; dio->error = 0; dio->flags = 0; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 74411296f6b5..21bd3d575aaa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -442,6 +442,10 @@ out: return error; } +static const struct iomap_dio_ops xfs_dio_write_ops = { + .end_io = xfs_dio_write_end_io, +}; + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -542,7 +546,7 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops); /* * If unaligned, this is the only IO in-flight. If it has not yet diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 50bb746d2216..7aa5d6117936 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -188,10 +188,14 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, */ #define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ #define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ -typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t size, int error, - unsigned int flags); + +struct iomap_dio_ops { + int (*end_io)(struct kiocb *iocb, ssize_t size, int error, + unsigned flags); +}; + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, iomap_dio_end_io_t end_io); + const struct iomap_ops *ops, const struct iomap_dio_ops *dops); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); #ifdef CONFIG_SWAP From b47bea2b5c3bf17270defe8529774dad114bcee5 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 17 Sep 2019 16:26:16 -0700 Subject: [PATCH 0640/2880] ionic: Remove unnecessary ternary operator in ionic_debugfs_add_ident clang warns: ../drivers/net/ethernet/pensando/ionic/ionic_debugfs.c:60:37: warning: expression result unused [-Wunused-value] ionic, &identity_fops) ? 0 : -EOPNOTSUPP; ^~~~~~~~~~~ 1 warning generated. The return value of debugfs_create_file does not need to be checked [1] and the function returns void so get rid of the ternary operator, it is unnecessary. [1]: https://lore.kernel.org/linux-mm/20150815160730.GB25186@kroah.com/ Fixes: fbfb8031533c ("ionic: Add hardware init and device commands") Link: https://github.com/ClangBuiltLinux/linux/issues/658 Signed-off-by: Nathan Chancellor Acked-by: Shannon Nelson Reviewed-by: Greg Kroah-Hartman --- drivers/net/ethernet/pensando/ionic/ionic_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c b/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c index 7afc4a365b75..bc03cecf80cc 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c @@ -57,7 +57,7 @@ DEFINE_SHOW_ATTRIBUTE(identity); void ionic_debugfs_add_ident(struct ionic *ionic) { debugfs_create_file("identity", 0400, ionic->dentry, - ionic, &identity_fops) ? 0 : -EOPNOTSUPP; + ionic, &identity_fops); } void ionic_debugfs_add_sizes(struct ionic *ionic) From dd0f9d896d167ab37732dd83986adc3a017a13b4 Mon Sep 17 00:00:00 2001 From: Murilo Fossa Vicentini Date: Mon, 16 Sep 2019 11:50:37 -0300 Subject: [PATCH 0641/2880] ibmvnic: Warn unknown speed message only when carrier is present With commit 0655f9943df2 ("net/ibmvnic: Update carrier state after link state change") we are now able to detect when the carrier is properly present in the device, so only report an unexpected unknown speed when it is properly detected. Unknown speed is expected to be seen by the device in case the backing device has no link detected. Reported-by: Abdul Haleem Tested-by: Abdul Haleem Signed-off-by: Murilo Fossa Vicentini Reviewed-by: Thomas Falcon --- drivers/net/ethernet/ibm/ibmvnic.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 2e5172f61564..3816fff75bb5 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4312,13 +4312,14 @@ static int handle_query_phys_parms_rsp(union ibmvnic_crq *crq, { struct net_device *netdev = adapter->netdev; int rc; + __be32 rspeed = cpu_to_be32(crq->query_phys_parms_rsp.speed); rc = crq->query_phys_parms_rsp.rc.code; if (rc) { netdev_err(netdev, "Error %d in QUERY_PHYS_PARMS\n", rc); return rc; } - switch (cpu_to_be32(crq->query_phys_parms_rsp.speed)) { + switch (rspeed) { case IBMVNIC_10MBPS: adapter->speed = SPEED_10; break; @@ -4344,8 +4345,8 @@ static int handle_query_phys_parms_rsp(union ibmvnic_crq *crq, adapter->speed = SPEED_100000; break; default: - netdev_warn(netdev, "Unknown speed 0x%08x\n", - cpu_to_be32(crq->query_phys_parms_rsp.speed)); + if (netif_carrier_ok(netdev)) + netdev_warn(netdev, "Unknown speed 0x%08x\n", rspeed); adapter->speed = SPEED_UNKNOWN; } if (crq->query_phys_parms_rsp.flags1 & IBMVNIC_FULL_DUPLEX) From cf0eba334268563152e4a8bc9ab865d0037a7948 Mon Sep 17 00:00:00 2001 From: Vijay Khemka Date: Thu, 12 Sep 2019 12:04:50 -0700 Subject: [PATCH 0642/2880] net/ncsi: Disable global multicast filter Disabling multicast filtering from NCSI if it is supported. As it should not filter any multicast packets. In current code, multicast filter is enabled and with an exception of optional field supported by device are disabled filtering. Mainly I see if goal is to disable filtering for IPV6 packets then let it disabled for every other types as well. As we are seeing issues with LLDP not working with this enabled filtering. And there are other issues with IPV6. By Disabling this multicast completely, it is working for both IPV6 as well as LLDP. Signed-off-by: Vijay Khemka Acked-by: Samuel Mendoza-Jonas Signed-off-by: Jakub Kicinski --- net/ncsi/internal.h | 7 +-- net/ncsi/ncsi-manage.c | 104 ++++++----------------------------------- 2 files changed, 15 insertions(+), 96 deletions(-) diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 0b3f0673e1a2..ad3fd7f1da75 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -264,9 +264,7 @@ enum { ncsi_dev_state_config_ev, ncsi_dev_state_config_sma, ncsi_dev_state_config_ebf, -#if IS_ENABLED(CONFIG_IPV6) - ncsi_dev_state_config_egmf, -#endif + ncsi_dev_state_config_dgmf, ncsi_dev_state_config_ecnt, ncsi_dev_state_config_ec, ncsi_dev_state_config_ae, @@ -295,9 +293,6 @@ struct ncsi_dev_priv { #define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ spinlock_t lock; /* Protect the NCSI device */ -#if IS_ENABLED(CONFIG_IPV6) - unsigned int inet6_addr_num; /* Number of IPv6 addresses */ -#endif unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 755aab66dcab..70fe02697544 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "internal.h" @@ -978,9 +977,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) case ncsi_dev_state_config_ev: case ncsi_dev_state_config_sma: case ncsi_dev_state_config_ebf: -#if IS_ENABLED(CONFIG_IPV6) - case ncsi_dev_state_config_egmf: -#endif + case ncsi_dev_state_config_dgmf: case ncsi_dev_state_config_ecnt: case ncsi_dev_state_config_ec: case ncsi_dev_state_config_ae: @@ -1033,23 +1030,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else if (nd->state == ncsi_dev_state_config_ebf) { nca.type = NCSI_PKT_CMD_EBF; nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; + /* if multicast global filtering is supported then + * disable it so that all multicast packet will be + * forwarded to management controller + */ + if (nc->caps[NCSI_CAP_GENERIC].cap & + NCSI_CAP_GENERIC_MC) + nd->state = ncsi_dev_state_config_dgmf; + else if (ncsi_channel_is_tx(ndp, nc)) + nd->state = ncsi_dev_state_config_ecnt; + else + nd->state = ncsi_dev_state_config_ec; + } else if (nd->state == ncsi_dev_state_config_dgmf) { + nca.type = NCSI_PKT_CMD_DGMF; if (ncsi_channel_is_tx(ndp, nc)) nd->state = ncsi_dev_state_config_ecnt; else nd->state = ncsi_dev_state_config_ec; -#if IS_ENABLED(CONFIG_IPV6) - if (ndp->inet6_addr_num > 0 && - (nc->caps[NCSI_CAP_GENERIC].cap & - NCSI_CAP_GENERIC_MC)) - nd->state = ncsi_dev_state_config_egmf; - } else if (nd->state == ncsi_dev_state_config_egmf) { - nca.type = NCSI_PKT_CMD_EGMF; - nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; - if (ncsi_channel_is_tx(ndp, nc)) - nd->state = ncsi_dev_state_config_ecnt; - else - nd->state = ncsi_dev_state_config_ec; -#endif /* CONFIG_IPV6 */ } else if (nd->state == ncsi_dev_state_config_ecnt) { if (np->preferred_channel && nc != np->preferred_channel) @@ -1483,70 +1480,6 @@ out: return -ENODEV; } -#if IS_ENABLED(CONFIG_IPV6) -static int ncsi_inet6addr_event(struct notifier_block *this, - unsigned long event, void *data) -{ - struct inet6_ifaddr *ifa = data; - struct net_device *dev = ifa->idev->dev; - struct ncsi_dev *nd = ncsi_find_dev(dev); - struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; - struct ncsi_package *np; - struct ncsi_channel *nc; - struct ncsi_cmd_arg nca; - bool action; - int ret; - - if (!ndp || (ipv6_addr_type(&ifa->addr) & - (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK))) - return NOTIFY_OK; - - switch (event) { - case NETDEV_UP: - action = (++ndp->inet6_addr_num) == 1; - nca.type = NCSI_PKT_CMD_EGMF; - break; - case NETDEV_DOWN: - action = (--ndp->inet6_addr_num == 0); - nca.type = NCSI_PKT_CMD_DGMF; - break; - default: - return NOTIFY_OK; - } - - /* We might not have active channel or packages. The IPv6 - * required multicast will be enabled when active channel - * or packages are chosen. - */ - np = ndp->active_package; - nc = ndp->active_channel; - if (!action || !np || !nc) - return NOTIFY_OK; - - /* We needn't enable or disable it if the function isn't supported */ - if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) - return NOTIFY_OK; - - nca.ndp = ndp; - nca.req_flags = 0; - nca.package = np->id; - nca.channel = nc->id; - nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; - ret = ncsi_xmit_cmd(&nca); - if (ret) { - netdev_warn(dev, "Fail to %s global multicast filter (%d)\n", - (event == NETDEV_UP) ? "enable" : "disable", ret); - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} - -static struct notifier_block ncsi_inet6addr_notifier = { - .notifier_call = ncsi_inet6addr_event, -}; -#endif /* CONFIG_IPV6 */ - static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; @@ -1725,11 +1658,6 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, } spin_lock_irqsave(&ncsi_dev_lock, flags); -#if IS_ENABLED(CONFIG_IPV6) - ndp->inet6_addr_num = 0; - if (list_empty(&ncsi_dev_list)) - register_inet6addr_notifier(&ncsi_inet6addr_notifier); -#endif list_add_tail_rcu(&ndp->node, &ncsi_dev_list); spin_unlock_irqrestore(&ncsi_dev_lock, flags); @@ -1896,10 +1824,6 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) spin_lock_irqsave(&ncsi_dev_lock, flags); list_del_rcu(&ndp->node); -#if IS_ENABLED(CONFIG_IPV6) - if (list_empty(&ncsi_dev_list)) - unregister_inet6addr_notifier(&ncsi_inet6addr_notifier); -#endif spin_unlock_irqrestore(&ncsi_dev_lock, flags); ncsi_unregister_netlink(nd->dev); From 20b3f7d700130326db555d724e7507ce4606219a Mon Sep 17 00:00:00 2001 From: James Byrne Date: Fri, 13 Sep 2019 16:46:35 +0000 Subject: [PATCH 0643/2880] dt-bindings: net: Correct the documentation of KSZ9021 skew values The documentation of skew values for the KSZ9021 PHY was misleading because the driver implementation followed the erroneous information given in the original KSZ9021 datasheet before it was corrected in revision 1.2 (Feb 2014). It is probably too late to correct the driver now because of the many existing device trees, so instead this just corrects the documentation to explain that what you actually get is not what you might think when looking at the device tree. Signed-off-by: James Byrne Signed-off-by: Jakub Kicinski --- .../bindings/net/micrel-ksz90x1.txt | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt index 5100358177c9..b921731cd970 100644 --- a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt +++ b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt @@ -12,8 +12,36 @@ and therefore may overwrite them. KSZ9021: All skew control options are specified in picoseconds. The minimum - value is 0, the maximum value is 3000, and it is incremented by 200ps - steps. + value is 0, the maximum value is 3000, and it can be specified in 200ps + steps, *but* these values are in not fact what you get because this chip's + skew values actually increase in 120ps steps, starting from -840ps. The + incorrect values came from an error in the original KSZ9021 datasheet + before it was corrected in revision 1.2 (Feb 2014), but it is too late to + change the driver now because of the many existing device trees that have + been created using values that go up in increments of 200. + + The following table shows the actual skew delay you will get for each of the + possible devicetree values, and the number that will be programmed into the + corresponding pad skew register: + + Device Tree Value Delay Pad Skew Register Value + ----------------------------------------------------- + 0 -840ps 0000 + 200 -720ps 0001 + 400 -600ps 0010 + 600 -480ps 0011 + 800 -360ps 0100 + 1000 -240ps 0101 + 1200 -120ps 0110 + 1400 0ps 0111 + 1600 120ps 1000 + 1800 240ps 1001 + 2000 360ps 1010 + 2200 480ps 1011 + 2400 600ps 1100 + 2600 720ps 1101 + 2800 840ps 1110 + 3000 960ps 1111 Optional properties: From 864668bfc374dfbf4851ec828b9049e08f9057b1 Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Mon, 16 Sep 2019 08:26:50 -0400 Subject: [PATCH 0644/2880] selftests: Add test cases for `ip nexthop flush proto XX` Add some test cases to allow the fib_nexthops.sh test code to test the flushing of nexthops based upon the proto passed in upon creation of the nexthop group. Signed-off-by: Donald Sharp Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthops.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index f9ebeac1e6f2..796670ebc65b 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -940,6 +940,20 @@ basic() run_cmd "$IP nexthop add id 104 group 1 dev veth1" log_test $? 2 "Nexthop group and device" + # Tests to ensure that flushing works as expected. + run_cmd "$IP nexthop add id 105 blackhole proto 99" + run_cmd "$IP nexthop add id 106 blackhole proto 100" + run_cmd "$IP nexthop add id 107 blackhole proto 99" + run_cmd "$IP nexthop flush proto 99" + check_nexthop "id 105" "" + check_nexthop "id 106" "id 106 blackhole proto 100" + check_nexthop "id 107" "" + run_cmd "$IP nexthop flush proto 100" + check_nexthop "id 106" "" + + run_cmd "$IP nexthop flush proto 100" + log_test $? 0 "Test proto flush" + run_cmd "$IP nexthop add id 104 group 1 blackhole" log_test $? 2 "Nexthop group and blackhole" From 91d99a724e9c60e14332c26ab2284bf696b94c8e Mon Sep 17 00:00:00 2001 From: Wang Xiayang Date: Fri, 20 Sep 2019 14:55:47 +0800 Subject: [PATCH 0645/2880] nios2: force the string buffer NULL-terminated strncpy() does not ensure NULL-termination when the input string size equals to the destination buffer size COMMAND_LINE_SIZE. Besides, grep under arch/ with 'boot_command_line' shows no other arch-specific code uses strncpy() when copying boot_command_line. Use strlcpy() instead. This issue is identified by a Coccinelle script. Signed-off-by: Wang Xiayang Signed-off-by: Ley Foon Tan --- arch/nios2/kernel/setup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index 6bbd4ae2beb0..4cf35b09c0ec 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -123,7 +123,7 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6, dtb_passed = r6; if (r7) - strncpy(cmdline_passed, (char *)r7, COMMAND_LINE_SIZE); + strlcpy(cmdline_passed, (char *)r7, COMMAND_LINE_SIZE); } #endif @@ -131,10 +131,10 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6, #ifndef CONFIG_CMDLINE_FORCE if (cmdline_passed[0]) - strncpy(boot_command_line, cmdline_passed, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, cmdline_passed, COMMAND_LINE_SIZE); #ifdef CONFIG_NIOS2_CMDLINE_IGNORE_DTB else - strncpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); #endif #endif From d2c63b7dfd06788a466d5ec8a850491f084c5fc2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 20 Sep 2019 09:30:40 +0200 Subject: [PATCH 0646/2880] ALSA: hda - Apply AMD controller workaround for Raven platform It's reported that the garbled sound on HP Envy x360 13z-ag000 (Ryzen Laptop) is fixed by the same workaround applied to other AMD chips. Update the driver_data entry for Raven (1022:15e3) to use the newly introduced preset, AZX_DCAPS_PRESET_AMD_SB. Since it already contains AZX_DCAPS_PM_RUNTIME, we can drop that bit, too. Reported-and-tested-by: Dennis Padiernos Cc: Link: https://lore.kernel.org/r/20190920073040.31764-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 91e71be42fa4..240f4ca76391 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2485,8 +2485,7 @@ static const struct pci_device_id azx_ids[] = { AZX_DCAPS_PM_RUNTIME }, /* AMD Raven */ { PCI_DEVICE(0x1022, 0x15e3), - .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_ATI_SB | - AZX_DCAPS_PM_RUNTIME }, + .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_AMD_SB }, /* ATI HDMI */ { PCI_DEVICE(0x1002, 0x0002), .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, From b74d957f6317e60924d0d3c0c01750814c24611b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 29 Aug 2019 15:43:49 +0200 Subject: [PATCH 0647/2880] ARM: aspeed: ast2500 is ARMv6K Linux supports both the original ARMv6 level (early ARM1136) and ARMv6K (later ARM1136, ARM1176 and ARM11mpcore). ast2500 falls into the second categoy, being based on arm1176jzf-s. This is enabled by default when using ARCH_MULTI_V6, so we should not 'select CPU_V6'. Removing this will lead to more efficient use of atomic instructions. Fixes: 8c2ed9bcfbeb ("arm: Add Aspeed machine") Signed-off-by: Arnd Bergmann Reviewed-by: Joel Stanley --- arch/arm/mach-aspeed/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/mach-aspeed/Kconfig b/arch/arm/mach-aspeed/Kconfig index a15c3a291386..44ee6bac255e 100644 --- a/arch/arm/mach-aspeed/Kconfig +++ b/arch/arm/mach-aspeed/Kconfig @@ -26,7 +26,6 @@ config MACH_ASPEED_G4 config MACH_ASPEED_G5 bool "Aspeed SoC 5th Generation" depends on ARCH_MULTI_V6 - select CPU_V6 select PINCTRL_ASPEED_G5 help Say yes if you intend to run on an Aspeed ast2500 or similar From bd9c10bc663dd2eaac8fe39dad0f18cd21527446 Mon Sep 17 00:00:00 2001 From: Jan-Marek Glogowski Date: Sun, 15 Sep 2019 16:57:28 +0200 Subject: [PATCH 0648/2880] ALSA: hda/realtek - PCI quirk for Medion E4254 The laptop has a combined jack to attach headsets on the right. The BIOS encodes them as two different colored jacks at the front, but otherwise it seems to be configured ok. But any adaption of the pins config on its own doesn't fix the jack detection to work in Linux. Still Windows works correct. This is somehow fixed by chaining ALC256_FIXUP_ASUS_HEADSET_MODE, which seems to register the microphone jack as a headset part and also results in fixing jack sensing, visible in dmesg as: -snd_hda_codec_realtek hdaudioC0D0: Mic=0x19 +snd_hda_codec_realtek hdaudioC0D0: Headset Mic=0x19 [ Actually the essential change is the location of the jack; the driver created "Front Mic Jack" without the matching volume / mute control element due to its jack location, which confused PA. -- tiwai ] Signed-off-by: Jan-Marek Glogowski Cc: Link: https://lore.kernel.org/r/8f4f9b20-0aeb-f8f1-c02f-fd53c09679f1@fbihome.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e27ef434b60d..b000b36ac3c6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5872,6 +5872,7 @@ enum { ALC256_FIXUP_ASUS_MIC_NO_PRESENCE, ALC299_FIXUP_PREDATOR_SPK, ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC, + ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, }; static const struct hda_fixup alc269_fixups[] = { @@ -6937,6 +6938,16 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x04a11040 }, + { 0x21, 0x04211020 }, + { } + }, + .chained = true, + .chain_id = ALC256_FIXUP_ASUS_HEADSET_MODE + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -7200,6 +7211,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD), SND_PCI_QUIRK(0x19e5, 0x3204, "Huawei MACH-WX9", ALC256_FIXUP_HUAWEI_MACH_WX9_PINS), SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ + SND_PCI_QUIRK(0x10ec, 0x118c, "Medion EE4254 MD62100", ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE), #if 0 /* Below is a quirk table taken from the old code. @@ -7368,6 +7380,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC295_FIXUP_CHROME_BOOK, .name = "alc-chrome-book"}, {.id = ALC299_FIXUP_PREDATOR_SPK, .name = "predator-spk"}, {.id = ALC298_FIXUP_HUAWEI_MBX_STEREO, .name = "huawei-mbx-stereo"}, + {.id = ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, .name = "alc256-medion-headset"}, {} }; #define ALC225_STANDARD_PINS \ From ad652f3811d8644d547506154ec9a9c22c8771cd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 16 Sep 2019 18:33:08 +0200 Subject: [PATCH 0649/2880] netfilter: nf_tables: add NFT_CHAIN_POLICY_UNSET and use it Default policy is defined as a unsigned 8-bit field, do not use a negative value to leave it unset, use this new NFT_CHAIN_POLICY_UNSET instead. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2655e03dbe1b..a26d64056fc8 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -889,6 +889,8 @@ enum nft_chain_flags { NFT_CHAIN_HW_OFFLOAD = 0x2, }; +#define NFT_CHAIN_POLICY_UNSET U8_MAX + /** * struct nft_chain - nf_tables chain * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e4a68dc42694..4a5d6ef2b706 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1715,7 +1715,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err2; } - nft_trans_chain_policy(trans) = -1; + nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; From ff175d0b0eab99f512b9afdb571f0ed18b63f533 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 16 Sep 2019 18:33:09 +0200 Subject: [PATCH 0650/2880] netfilter: nf_tables_offload: fix always true policy is unset check New smatch warnings: net/netfilter/nf_tables_offload.c:316 nft_flow_offload_chain() warn: always true condition '(policy != -1) => (0-255 != (-1))' Reported-by: kbuild test robot Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 21bb772cb4b7..e546f759b7a7 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -313,7 +313,7 @@ static int nft_flow_offload_chain(struct nft_chain *chain, policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ - if (cmd == FLOW_BLOCK_BIND && policy != -1 && policy != NF_ACCEPT) + if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; if (dev->netdev_ops->ndo_setup_tc) From acab713177377d9e0889c46bac7ff0cfb9a90c4d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 19 Sep 2019 16:56:44 +0200 Subject: [PATCH 0651/2880] netfilter: nf_tables: allow lookups in dynamic sets This un-breaks lookups in sets that have the 'dynamic' flag set. Given this active example configuration: table filter { set set1 { type ipv4_addr size 64 flags dynamic,timeout timeout 1m } chain input { type filter hook input priority 0; policy accept; } } ... this works: nft add rule ip filter input add @set1 { ip saddr } -> whenever rule is triggered, the source ip address is inserted into the set (if it did not exist). This won't work: nft add rule ip filter input ip saddr @set1 counter Error: Could not process rule: Operation not supported In other words, we can add entries to the set, but then can't make matching decision based on that set. That is just wrong -- all set backends support lookups (else they would not be very useful). The failure comes from an explicit rejection in nft_lookup.c. Looking at the history, it seems like NFT_SET_EVAL used to mean 'set contains expressions' (aka. "is a meter"), for instance something like nft add rule ip filter input meter example { ip saddr limit rate 10/second } or nft add rule ip filter input meter example { ip saddr counter } The actual meaning of NFT_SET_EVAL however, is 'set can be updated from the packet path'. 'meters' and packet-path insertions into sets, such as 'add @set { ip saddr }' use exactly the same kernel code (nft_dynset.c) and thus require a set backend that provides the ->update() function. The only set that provides this also is the only one that has the NFT_SET_EVAL feature flag. Removing the wrong check makes the above example work. While at it, also fix the flag check during set instantiation to allow supported combinations only. Fixes: 8aeff920dcc9b3f ("netfilter: nf_tables: add stateful object reference to set elements") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++++-- net/netfilter/nft_lookup.c | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4a5d6ef2b706..6dc46f9b5f7b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3562,8 +3562,11 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, NFT_SET_OBJECT)) return -EINVAL; /* Only one of these operations is supported */ - if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) == - (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) + if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == + (NFT_SET_MAP | NFT_SET_OBJECT)) + return -EOPNOTSUPP; + if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == + (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index c0560bf3c31b..660bad688e2b 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -73,9 +73,6 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_EVAL) - return -EOPNOTSUPP; - priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); err = nft_validate_register_load(priv->sreg, set->klen); if (err < 0) From 7a5d9815cc010b055c2a99ccf418c4629365fa43 Mon Sep 17 00:00:00 2001 From: Bard liao Date: Wed, 18 Sep 2019 21:31:31 +0800 Subject: [PATCH 0652/2880] ASoC: core: use list_del_init and move it back to soc_cleanup_component commit a0a4bf57a977 ("ASoC: core: delete component->card_list in soc_remove_component only") was trying to fix a kernel oops when list_del was called twice without re-init the list. Use list_del_init() can solve it, too. Besides, it will be more readable if we cleanup all component related resource at soc_cleanup_component(). Suggested-by: Kuninori Morimoto Signed-off-by: Bard liao Link: https://lore.kernel.org/r/20190918133131.15045-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index aff4b4bf4d07..88978a3036c4 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -978,6 +978,7 @@ static void soc_cleanup_component(struct snd_soc_component *component) /* For framework level robustness */ snd_soc_component_set_jack(component, NULL, NULL); + list_del_init(&component->card_list); snd_soc_dapm_free(snd_soc_component_get_dapm(component)); soc_cleanup_component_debugfs(component); component->card = NULL; @@ -990,7 +991,7 @@ static void soc_remove_component(struct snd_soc_component *component) return; snd_soc_component_remove(component); - list_del(&component->card_list); + soc_cleanup_component(component); } From 7b2db65b59c30d58c129d3c8b2101feca686155a Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 19 Sep 2019 10:16:52 +0300 Subject: [PATCH 0653/2880] ASoC: pcm3168a: The codec does not support S32_LE 24 bits is supported in all modes and 16 bit only when the codec is slave and the DAI is set to RIGHT_J. Remove the unsupported sample format. Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20190919071652.31724-1-peter.ujfalusi@ti.com Signed-off-by: Mark Brown --- sound/soc/codecs/pcm3168a.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/codecs/pcm3168a.c b/sound/soc/codecs/pcm3168a.c index 50ed86d45c26..88b75695fbf7 100644 --- a/sound/soc/codecs/pcm3168a.c +++ b/sound/soc/codecs/pcm3168a.c @@ -21,8 +21,7 @@ #define PCM3168A_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | \ SNDRV_PCM_FMTBIT_S24_3LE | \ - SNDRV_PCM_FMTBIT_S24_LE | \ - SNDRV_PCM_FMTBIT_S32_LE) + SNDRV_PCM_FMTBIT_S24_LE) #define PCM3168A_FMT_I2S 0x0 #define PCM3168A_FMT_LEFT_J 0x1 From 147162f575152db80000fb3de26358264768ee9f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Sep 2019 09:50:18 +0200 Subject: [PATCH 0654/2880] ASoC: ti: fix SND_SOC_DM365_VOICE_CODEC dependencies SND_SOC_DM365_VOICE_CODEC is a 'bool' option in a choice statement, meaning it cannot be set to =m, but it selects two other drivers that we may want to be loadable modules after all: WARNING: unmet direct dependencies detected for SND_SOC_CQ0093VC Depends on [m]: SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] Selected by [y]: - SND_SOC_DM365_VOICE_CODEC [=y] && Selected by [m]: - SND_SOC_ALL_CODECS [=m] && SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && COMPILE_TEST [=y] Add an intermediate symbol that sets SND_SOC_CQ0093VC and MFD_DAVINCI_VOICECODEC to =m if SND_SOC=m. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20190920075046.3210393-1-arnd@arndb.de Signed-off-by: Mark Brown --- sound/soc/ti/Kconfig | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sound/soc/ti/Kconfig b/sound/soc/ti/Kconfig index 87a9b9dd4e98..29f61053ab62 100644 --- a/sound/soc/ti/Kconfig +++ b/sound/soc/ti/Kconfig @@ -200,11 +200,18 @@ config SND_SOC_DM365_AIC3X_CODEC config SND_SOC_DM365_VOICE_CODEC bool "Voice Codec - CQ93VC" - select MFD_DAVINCI_VOICECODEC - select SND_SOC_CQ0093VC help Say Y if you want to add support for SoC On-chip voice codec endchoice +config SND_SOC_DM365_VOICE_CODEC_MODULE + def_tristate y + depends on SND_SOC_DM365_VOICE_CODEC && SND_SOC + select MFD_DAVINCI_VOICECODEC + select SND_SOC_CQ0093VC + help + The is an internal symbol needed to ensure that the codec + and MFD driver can be built as loadable modules if necessary. + endmenu From 79743bc927f695e5a24f26c31cfe2d69f6ee00f7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 2 Sep 2019 15:37:21 -0300 Subject: [PATCH 0655/2880] perf jvmti: Link against tools/lib/string.o to have weak strlcpy() That is needed in systems such some S/390 distros. $ readelf -s /tmp/build/perf/jvmti/jvmti-in.o | grep strlcpy 452: 0000000000002990 125 FUNC WEAK DEFAULT 119 strlcpy $ Thanks to Jiri Olsa for fixing up my initial stab at this, I forgot how Makefiles are picky about spaces versus tabs. Reported-by: Thomas Richter Cc: Adrian Hunter Cc: Andreas Krebbel Cc: Jiri Olsa Cc: Namhyung Kim Cc: Sergey Melnikov Link: https://lkml.kernel.org/n/tip-x8vg9sffgb2t1tzqmhkrulh7@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/jvmti/Build | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/perf/jvmti/Build b/tools/perf/jvmti/Build index eaeb8cb5379b..1e148bbdf820 100644 --- a/tools/perf/jvmti/Build +++ b/tools/perf/jvmti/Build @@ -1,8 +1,17 @@ jvmti-y += libjvmti.o jvmti-y += jvmti_agent.o +# For strlcpy +jvmti-y += libstring.o + CFLAGS_jvmti = -fPIC -DPIC -I$(JDIR)/include -I$(JDIR)/include/linux CFLAGS_REMOVE_jvmti = -Wmissing-declarations CFLAGS_REMOVE_jvmti += -Wstrict-prototypes CFLAGS_REMOVE_jvmti += -Wextra CFLAGS_REMOVE_jvmti += -Wwrite-strings + +CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" + +$(OUTPUT)jvmti/libstring.o: ../lib/string.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) From 9e282b739466dd319667788417426145da62e381 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 2 Sep 2019 16:08:19 +0000 Subject: [PATCH 0656/2880] perf tools: Add PMU event JSON files for ARM Cortex-A76 and, Neoverse N1. The source of the event codes and description text was the Neoverse N1 technical reference manual at: http://infocenter.arm.com/help/topic/com.arm.doc.100616_0301_01_en/neoverse_n1_trm_100616_0301_01_en.pdf The Cortex-A76 shares the same event IDs as the Neoverse N1 and they can be viewed at: https://static.docs.arm.com/100798/0400/cortex_a76_trm_100798_0400_00_en.pdf Signed-off-by: James Clark Cc: "linux-perf-users@vger.kernel.org" Cc: Alexander Shishkin Cc: Jeremy Linton Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: james clark Cc: nd Link: http://lore.kernel.org/lkml/20190902160713.1425-2-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/arm64/arm/cortex-a76-n1/branch.json | 14 ++ .../arch/arm64/arm/cortex-a76-n1/bus.json | 24 ++ .../arch/arm64/arm/cortex-a76-n1/cache.json | 207 ++++++++++++++++++ .../arm64/arm/cortex-a76-n1/exception.json | 52 +++++ .../arm64/arm/cortex-a76-n1/instruction.json | 108 +++++++++ .../arch/arm64/arm/cortex-a76-n1/memory.json | 23 ++ .../arch/arm64/arm/cortex-a76-n1/other.json | 7 + .../arm64/arm/cortex-a76-n1/pipeline.json | 14 ++ tools/perf/pmu-events/arch/arm64/mapfile.csv | 2 + 9 files changed, 451 insertions(+) create mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/branch.json create mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/bus.json create mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/cache.json create mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/exception.json create mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/instruction.json create mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json create mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json create mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/pipeline.json diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/branch.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/branch.json new file mode 100644 index 000000000000..b5e5d055c70d --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/branch.json @@ -0,0 +1,14 @@ +[ + { + "PublicDescription": "Mispredicted or not predicted branch speculatively executed. This event counts any predictable branch instruction which is mispredicted either due to dynamic misprediction or because the MMU is off and the branches are statically predicted not taken.", + "EventCode": "0x10", + "EventName": "BR_MIS_PRED", + "BriefDescription": "Mispredicted or not predicted branch speculatively executed." + }, + { + "PublicDescription": "Predictable branch speculatively executed. This event counts all predictable branches.", + "EventCode": "0x12", + "EventName": "BR_PRED", + "BriefDescription": "Predictable branch speculatively executed." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/bus.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/bus.json new file mode 100644 index 000000000000..fce7309ae624 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/bus.json @@ -0,0 +1,24 @@ +[ + { + "EventCode": "0x11", + "EventName": "CPU_CYCLES", + "BriefDescription": "The number of core clock cycles." + }, + { + "PublicDescription": "Bus access. This event counts for every beat of data transferred over the data channels between the core and the SCU. If both read and write data beats are transferred on a given cycle, this event is counted twice on that cycle. This event counts the sum of BUS_ACCESS_RD and BUS_ACCESS_WR.", + "EventCode": "0x19", + "EventName": "BUS_ACCESS", + "BriefDescription": "Bus access." + }, + { + "EventCode": "0x1D", + "EventName": "BUS_CYCLES", + "BriefDescription": "Bus cycles. This event duplicates CPU_CYCLES." + }, + { + "ArchStdEvent": "BUS_ACCESS_RD" + }, + { + "ArchStdEvent": "BUS_ACCESS_WR" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/cache.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/cache.json new file mode 100644 index 000000000000..24594081c199 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/cache.json @@ -0,0 +1,207 @@ +[ + { + "PublicDescription": "L1 instruction cache refill. This event counts any instruction fetch which misses in the cache.", + "EventCode": "0x01", + "EventName": "L1I_CACHE_REFILL", + "BriefDescription": "L1 instruction cache refill" + }, + { + "PublicDescription": "L1 instruction TLB refill. This event counts any refill of the instruction L1 TLB from the L2 TLB. This includes refills that result in a translation fault.", + "EventCode": "0x02", + "EventName": "L1I_TLB_REFILL", + "BriefDescription": "L1 instruction TLB refill" + }, + { + "PublicDescription": "L1 data cache refill. This event counts any load or store operation or page table walk access which causes data to be read from outside the L1, including accesses which do not allocate into L1.", + "EventCode": "0x03", + "EventName": "L1D_CACHE_REFILL", + "BriefDescription": "L1 data cache refill" + }, + { + "PublicDescription": "L1 data cache access. This event counts any load or store operation or page table walk access which looks up in the L1 data cache. In particular, any access which could count the L1D_CACHE_REFILL event causes this event to count.", + "EventCode": "0x04", + "EventName": "L1D_CACHE", + "BriefDescription": "L1 data cache access" + }, + { + "PublicDescription": "L1 data TLB refill. This event counts any refill of the data L1 TLB from the L2 TLB. This includes refills that result in a translation fault.", + "EventCode": "0x05", + "EventName": "L1D_TLB_REFILL", + "BriefDescription": "L1 data TLB refill" + }, + { + "PublicDescription": "Level 1 instruction cache access or Level 0 Macro-op cache access. This event counts any instruction fetch which accesses the L1 instruction cache or L0 Macro-op cache.", + "EventCode": "0x14", + "EventName": "L1I_CACHE", + "BriefDescription": "L1 instruction cache access" + }, + { + "PublicDescription": "L1 data cache Write-Back. This event counts any write-back of data from the L1 data cache to L2 or L3. This counts both victim line evictions and snoops, including cache maintenance operations.", + "EventCode": "0x15", + "EventName": "L1D_CACHE_WB", + "BriefDescription": "L1 data cache Write-Back" + }, + { + "PublicDescription": "L2 data cache access. This event counts any transaction from L1 which looks up in the L2 cache, and any write-back from the L1 to the L2. Snoops from outside the core and cache maintenance operations are not counted.", + "EventCode": "0x16", + "EventName": "L2D_CACHE", + "BriefDescription": "L2 data cache access" + }, + { + "PublicDescription": "L2 data cache refill. This event counts any cacheable transaction from L1 which causes data to be read from outside the core. L2 refills caused by stashes into L2 should not be counted", + "EventCode": "0x17", + "EventName": "L2D_CACHE_REFILL", + "BriefDescription": "L2 data cache refill" + }, + { + "PublicDescription": "L2 data cache write-back. This event counts any write-back of data from the L2 cache to outside the core. This includes snoops to the L2 which return data, regardless of whether they cause an invalidation. Invalidations from the L2 which do not write data outside of the core and snoops which return data from the L1 are not counted", + "EventCode": "0x18", + "EventName": "L2D_CACHE_WB", + "BriefDescription": "L2 data cache write-back" + }, + { + "PublicDescription": "L2 data cache allocation without refill. This event counts any full cache line write into the L2 cache which does not cause a linefill, including write-backs from L1 to L2 and full-line writes which do not allocate into L1.", + "EventCode": "0x20", + "EventName": "L2D_CACHE_ALLOCATE", + "BriefDescription": "L2 data cache allocation without refill" + }, + { + "PublicDescription": "Level 1 data TLB access. This event counts any load or store operation which accesses the data L1 TLB. If both a load and a store are executed on a cycle, this event counts twice. This event counts regardless of whether the MMU is enabled.", + "EventCode": "0x25", + "EventName": "L1D_TLB", + "BriefDescription": "Level 1 data TLB access." + }, + { + "PublicDescription": "Level 1 instruction TLB access. This event counts any instruction fetch which accesses the instruction L1 TLB.This event counts regardless of whether the MMU is enabled.", + "EventCode": "0x26", + "EventName": "L1I_TLB", + "BriefDescription": "Level 1 instruction TLB access" + }, + { + "PublicDescription": "This event counts any full cache line write into the L3 cache which does not cause a linefill, including write-backs from L2 to L3 and full-line writes which do not allocate into L2", + "EventCode": "0x29", + "EventName": "L3D_CACHE_ALLOCATE", + "BriefDescription": "Allocation without refill" + }, + { + "PublicDescription": "Attributable Level 3 unified cache refill. This event counts for any cacheable read transaction returning datafrom the SCU for which the data source was outside the cluster. Transactions such as ReadUnique are counted here as 'read' transactions, even though they can be generated by store instructions.", + "EventCode": "0x2A", + "EventName": "L3D_CACHE_REFILL", + "BriefDescription": "Attributable Level 3 unified cache refill." + }, + { + "PublicDescription": "Attributable Level 3 unified cache access. This event counts for any cacheable read transaction returning datafrom the SCU, or for any cacheable write to the SCU.", + "EventCode": "0x2B", + "EventName": "L3D_CACHE", + "BriefDescription": "Attributable Level 3 unified cache access." + }, + { + "PublicDescription": "Attributable L2 data or unified TLB refill. This event counts on anyrefill of the L2 TLB, caused by either an instruction or data access.This event does not count if the MMU is disabled.", + "EventCode": "0x2D", + "EventName": "L2D_TLB_REFILL", + "BriefDescription": "Attributable L2 data or unified TLB refill" + }, + { + "PublicDescription": "Attributable L2 data or unified TLB access. This event counts on any access to the L2 TLB (caused by a refill of any of the L1 TLBs). This event does not count if the MMU is disabled.", + "EventCode": "0x2F", + "EventName": "L2D_TLB", + "BriefDescription": "Attributable L2 data or unified TLB access" + }, + { + "PublicDescription": "Access to data TLB that caused a page table walk. This event counts on any data access which causes L2D_TLB_REFILL to count.", + "EventCode": "0x34", + "EventName": "DTLB_WALK", + "BriefDescription": "Access to data TLB that caused a page table walk." + }, + { + "PublicDescription": "Access to instruction TLB that caused a page table walk. This event counts on any instruction access which causes L2D_TLB_REFILL to count.", + "EventCode": "0x35", + "EventName": "ITLB_WALK", + "BriefDescription": "Access to instruction TLB that caused a page table walk." + }, + { + "EventCode": "0x36", + "EventName": "LL_CACHE_RD", + "BriefDescription": "Last level cache access, read" + }, + { + "EventCode": "0x37", + "EventName": "LL_CACHE_MISS_RD", + "BriefDescription": "Last level cache miss, read" + }, + { + "ArchStdEvent": "L1D_CACHE_INVAL" + }, + { + "ArchStdEvent": "L1D_CACHE_RD" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_INNER" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_OUTER" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_RD" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_WR" + }, + { + "ArchStdEvent": "L1D_CACHE_WB_CLEAN" + }, + { + "ArchStdEvent": "L1D_CACHE_WB_VICTIM" + }, + { + "ArchStdEvent": "L1D_CACHE_WR" + }, + { + "ArchStdEvent": "L1D_TLB_RD" + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_RD" + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_WR" + }, + { + "ArchStdEvent": "L1D_TLB_WR" + }, + { + "ArchStdEvent": "L2D_CACHE_INVAL" + }, + { + "ArchStdEvent": "L2D_CACHE_RD" + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_RD" + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_WR" + }, + { + "ArchStdEvent": "L2D_CACHE_WB_CLEAN" + }, + { + "ArchStdEvent": "L2D_CACHE_WB_VICTIM" + }, + { + "ArchStdEvent": "L2D_CACHE_WR" + }, + { + "ArchStdEvent": "L2D_TLB_RD" + }, + { + "ArchStdEvent": "L2D_TLB_REFILL_RD" + }, + { + "ArchStdEvent": "L2D_TLB_REFILL_WR" + }, + { + "ArchStdEvent": "L2D_TLB_WR" + }, + { + "ArchStdEvent": "L3D_CACHE_RD" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/exception.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/exception.json new file mode 100644 index 000000000000..98d29c862320 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/exception.json @@ -0,0 +1,52 @@ +[ + { + "EventCode": "0x09", + "EventName": "EXC_TAKEN", + "BriefDescription": "Exception taken." + }, + { + "PublicDescription": "Local memory error. This event counts any correctable or uncorrectable memory error (ECC or parity) in the protected core RAMs", + "EventCode": "0x1A", + "EventName": "MEMORY_ERROR", + "BriefDescription": "Local memory error." + }, + { + "ArchStdEvent": "EXC_DABORT" + }, + { + "ArchStdEvent": "EXC_FIQ" + }, + { + "ArchStdEvent": "EXC_HVC" + }, + { + "ArchStdEvent": "EXC_IRQ" + }, + { + "ArchStdEvent": "EXC_PABORT" + }, + { + "ArchStdEvent": "EXC_SMC" + }, + { + "ArchStdEvent": "EXC_SVC" + }, + { + "ArchStdEvent": "EXC_TRAP_DABORT" + }, + { + "ArchStdEvent": "EXC_TRAP_FIQ" + }, + { + "ArchStdEvent": "EXC_TRAP_IRQ" + }, + { + "ArchStdEvent": "EXC_TRAP_OTHER" + }, + { + "ArchStdEvent": "EXC_TRAP_PABORT" + }, + { + "ArchStdEvent": "EXC_UNDEF" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/instruction.json new file mode 100644 index 000000000000..c153ac706d8d --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/instruction.json @@ -0,0 +1,108 @@ +[ + { + "PublicDescription": "Software increment. Instruction architecturally executed (condition code check pass).", + "EventCode": "0x00", + "EventName": "SW_INCR", + "BriefDescription": "Software increment." + }, + { + "PublicDescription": "Instruction architecturally executed. This event counts all retired instructions, including those that fail their condition check.", + "EventCode": "0x08", + "EventName": "INST_RETIRED", + "BriefDescription": "Instruction architecturally executed." + }, + { + "EventCode": "0x0A", + "EventName": "EXC_RETURN", + "BriefDescription": "Instruction architecturally executed, condition code check pass, exception return." + }, + { + "PublicDescription": "Instruction architecturally executed, condition code check pass, write to CONTEXTIDR. This event only counts writes to CONTEXTIDR in AArch32 state, and via the CONTEXTIDR_EL1 mnemonic in AArch64 state.", + "EventCode": "0x0B", + "EventName": "CID_WRITE_RETIRED", + "BriefDescription": "Instruction architecturally executed, condition code check pass, write to CONTEXTIDR." + }, + { + "EventCode": "0x1B", + "EventName": "INST_SPEC", + "BriefDescription": "Operation speculatively executed" + }, + { + "PublicDescription": "Instruction architecturally executed, condition code check pass, write to TTBR. This event only counts writes to TTBR0/TTBR1 in AArch32 state and TTBR0_EL1/TTBR1_EL1 in AArch64 state.", + "EventCode": "0x1C", + "EventName": "TTBR_WRITE_RETIRED", + "BriefDescription": "Instruction architecturally executed, condition code check pass, write to TTBR" + }, + { + "PublicDescription": "Instruction architecturally executed, branch. This event counts all branches, taken or not. This excludes exception entries, debug entries and CCFAIL branches.", + "EventCode": "0x21", + "EventName": "BR_RETIRED", + "BriefDescription": "Instruction architecturally executed, branch." + }, + { + "PublicDescription": "Instruction architecturally executed, mispredicted branch. This event counts any branch counted by BR_RETIRED which is not correctly predicted and causes a pipeline flush.", + "EventCode": "0x22", + "EventName": "BR_MIS_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, mispredicted branch." + }, + { + "ArchStdEvent": "ASE_SPEC" + }, + { + "ArchStdEvent": "BR_IMMED_SPEC" + }, + { + "ArchStdEvent": "BR_INDIRECT_SPEC" + }, + { + "ArchStdEvent": "BR_RETURN_SPEC" + }, + { + "ArchStdEvent": "CRYPTO_SPEC" + }, + { + "ArchStdEvent": "DMB_SPEC" + }, + { + "ArchStdEvent": "DP_SPEC" + }, + { + "ArchStdEvent": "DSB_SPEC" + }, + { + "ArchStdEvent": "ISB_SPEC" + }, + { + "ArchStdEvent": "LDREX_SPEC" + }, + { + "ArchStdEvent": "LDST_SPEC" + }, + { + "ArchStdEvent": "LD_SPEC" + }, + { + "ArchStdEvent": "PC_WRITE_SPEC" + }, + { + "ArchStdEvent": "RC_LD_SPEC" + }, + { + "ArchStdEvent": "RC_ST_SPEC" + }, + { + "ArchStdEvent": "STREX_FAIL_SPEC" + }, + { + "ArchStdEvent": "STREX_PASS_SPEC" + }, + { + "ArchStdEvent": "STREX_SPEC" + }, + { + "ArchStdEvent": "ST_SPEC" + }, + { + "ArchStdEvent": "VFP_SPEC" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json new file mode 100644 index 000000000000..b86643253f19 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json @@ -0,0 +1,23 @@ +[ + { + "PublicDescription": "Data memory access. This event counts memory accesses due to load or store instructions. This event counts the sum of MEM_ACCESS_RD and MEM_ACCESS_WR.", + "EventCode": "0x13", + "EventName": "MEM_ACCESS", + "BriefDescription": "Data memory access" + }, + { + "ArchStdEvent": "MEM_ACCESS_RD" + }, + { + "ArchStdEvent": "MEM_ACCESS_WR" + }, + { + "ArchStdEvent": "UNALIGNED_LD_SPEC" + }, + { + "ArchStdEvent": "UNALIGNED_ST_SPEC" + }, + { + "ArchStdEvent": "UNALIGNED_LDST_SPEC" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json new file mode 100644 index 000000000000..8bde029a62d5 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json @@ -0,0 +1,7 @@ +[ + { + "EventCode": "0x31", + "EventName": "REMOTE_ACCESS", + "BriefDescription": "Access to another socket in a multi-socket system" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/pipeline.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/pipeline.json new file mode 100644 index 000000000000..010a647f9d02 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/pipeline.json @@ -0,0 +1,14 @@ +[ + { + "PublicDescription": "No operation issued because of the frontend. The counter counts on any cycle when there are no fetched instructions available to dispatch.", + "EventCode": "0x23", + "EventName": "STALL_FRONTEND", + "BriefDescription": "No operation issued because of the frontend." + }, + { + "PublicDescription": "No operation issued because of the backend. The counter counts on any cycle fetched instructions are not dispatched due to resource constraints.", + "EventCode": "0x24", + "EventName": "STALL_BACKEND", + "BriefDescription": "No operation issued because of the backend." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv index 927fcddcb4aa..0d609149b82a 100644 --- a/tools/perf/pmu-events/arch/arm64/mapfile.csv +++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv @@ -16,6 +16,8 @@ 0x00000000420f1000,v1,arm/cortex-a53,core 0x00000000410fd070,v1,arm/cortex-a57-a72,core 0x00000000410fd080,v1,arm/cortex-a57-a72,core +0x00000000410fd0b0,v1,arm/cortex-a76-n1,core +0x00000000410fd0c0,v1,arm/cortex-a76-n1,core 0x00000000420f5160,v1,cavium/thunderx2,core 0x00000000430f0af0,v1,cavium/thunderx2,core 0x00000000480fd010,v1,hisilicon/hip08,core From 8fcbeae44fde9756036147664d1e74fab7c9902c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Sep 2019 09:47:53 -0300 Subject: [PATCH 0657/2880] perf tools: Remove needless builtin.h include directives Now that builtin.h isn't included by any other header, we can check where it is really needed, i.e. we can remove it and be sure that it isn't being obtained indirectly. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-mn7jheex85iw9qo6tlv26hb2@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/numa.c | 1 - tools/perf/bench/sched-messaging.c | 1 - tools/perf/bench/sched-pipe.c | 1 - tools/perf/util/jitdump.c | 1 - 4 files changed, 4 deletions(-) diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 62b8ef4bcb1f..5797253b9700 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -9,7 +9,6 @@ /* For the CLR_() macros */ #include -#include "../builtin.h" #include #include "../util/cloexec.h" diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c index c63eb9a46346..6e499b32bf00 100644 --- a/tools/perf/bench/sched-messaging.c +++ b/tools/perf/bench/sched-messaging.c @@ -12,7 +12,6 @@ #include "../util/util.h" #include -#include "../builtin.h" #include "bench.h" /* Test groups of 20 processes spraying to 20 receivers */ diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c index 35b07f197d48..edd40aafa318 100644 --- a/tools/perf/bench/sched-pipe.c +++ b/tools/perf/bench/sched-pipe.c @@ -11,7 +11,6 @@ */ #include "../util/util.h" #include -#include "../builtin.h" #include "bench.h" #include diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index b80f29bfc7bb..00db9957fdb0 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -27,7 +27,6 @@ #include "jit.h" #include "jitdump.h" #include "genelf.h" -#include "../builtin.h" #include #include From b22bb139dcb370cd3a7f3c5ae29d55bb69235ace Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Sep 2019 09:57:50 -0300 Subject: [PATCH 0658/2880] perf debug: No need to include ui/util.h Nothing from that file is used in util/debug.h, it is only needed in some places that get it indirectly via including util/debug.h, remove that entanglement. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-hn9v4jdova2nt018fqsjyzun@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/scripts.c | 1 + tools/perf/ui/gtk/setup.c | 3 ++- tools/perf/util/debug.h | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index 586a21acc13d..3b81075b8da8 100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -2,6 +2,7 @@ #include "../../builtin.h" #include "../../perf.h" #include "../../util/util.h" +#include "../util.h" #include "../../util/hist.h" #include "../../util/debug.h" #include "../../util/symbol.h" diff --git a/tools/perf/ui/gtk/setup.c b/tools/perf/ui/gtk/setup.c index 1a2616b97b5c..f5eee4d66873 100644 --- a/tools/perf/ui/gtk/setup.c +++ b/tools/perf/ui/gtk/setup.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "gtk.h" -#include "../../util/debug.h" +#include +#include "../util.h" extern struct perf_error_ops perf_gtk_eops; diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index b2deee987ffa..d25ae1c4cee9 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -3,9 +3,9 @@ #ifndef __PERF_DEBUG_H #define __PERF_DEBUG_H +#include #include #include -#include "../ui/util.h" extern int verbose; extern bool quiet, dump_trace; From 4a903c2e151423be9af19c7eb35d4667be21c4c1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Sep 2019 10:11:53 -0300 Subject: [PATCH 0659/2880] perf tools: Remove debug.h from places where it is not needed Pruning a bit more the includes dependency tree. Building this thing on lots of containers takes time, we better reduce the time per build, each container is doing 6 builds when clang and clang-devel are available, and the plan is to do a 'make -C tools/perf build-test' that have many more. Also helps when doing normal development, as touching some random file will have a much reduced chance of triggering lots of rebuilds. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-r889ur2cxe16m91m2a4pl15p@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/unwind-libunwind.c | 2 +- tools/perf/arch/powerpc/util/sym-handling.c | 1 - tools/perf/tests/clang.c | 1 - tools/perf/ui/browsers/header.c | 1 - tools/perf/ui/gtk/helpline.c | 1 - tools/perf/ui/gtk/util.c | 1 - tools/perf/ui/helpline.c | 1 - tools/perf/ui/tui/helpline.c | 1 - tools/perf/ui/tui/util.c | 1 - tools/perf/util/branch.c | 1 - tools/perf/util/demangle-java.c | 1 - tools/perf/util/libunwind/arm64.c | 1 - tools/perf/util/libunwind/x86_32.c | 1 - tools/perf/util/target.c | 1 - tools/perf/util/usage.c | 1 - tools/perf/util/zlib.c | 2 -- 16 files changed, 1 insertion(+), 17 deletions(-) diff --git a/tools/perf/arch/arm64/util/unwind-libunwind.c b/tools/perf/arch/arm64/util/unwind-libunwind.c index 002520d4036b..1495a9523a23 100644 --- a/tools/perf/arch/arm64/util/unwind-libunwind.c +++ b/tools/perf/arch/arm64/util/unwind-libunwind.c @@ -5,8 +5,8 @@ #include #include "perf_regs.h" #include "../../util/unwind.h" -#include "../../util/debug.h" #endif +#include "../../util/debug.h" int LIBUNWIND__ARCH_REG_ID(int regnum) { diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c index 8a4b717e0a53..abb7a12d8f93 100644 --- a/tools/perf/arch/powerpc/util/sym-handling.c +++ b/tools/perf/arch/powerpc/util/sym-handling.c @@ -4,7 +4,6 @@ * Copyright (C) 2015 Naveen N. Rao, IBM Corporation */ -#include "debug.h" #include "dso.h" #include "symbol.h" #include "map.h" diff --git a/tools/perf/tests/clang.c b/tools/perf/tests/clang.c index f45fe11dcf50..ff2711a40940 100644 --- a/tools/perf/tests/clang.c +++ b/tools/perf/tests/clang.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include "tests.h" -#include "debug.h" #include "util.h" #include "c++/clang-c.h" #include diff --git a/tools/perf/ui/browsers/header.c b/tools/perf/ui/browsers/header.c index 0f59a7001479..57e6e4332f74 100644 --- a/tools/perf/ui/browsers/header.c +++ b/tools/perf/ui/browsers/header.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include "util/debug.h" #include "ui/browser.h" #include "ui/keysyms.h" #include "ui/ui.h" diff --git a/tools/perf/ui/gtk/helpline.c b/tools/perf/ui/gtk/helpline.c index e166da9ec767..e40a006aead8 100644 --- a/tools/perf/ui/gtk/helpline.c +++ b/tools/perf/ui/gtk/helpline.c @@ -6,7 +6,6 @@ #include "gtk.h" #include "../ui.h" #include "../helpline.h" -#include "../../util/debug.h" static void gtk_helpline_pop(void) { diff --git a/tools/perf/ui/gtk/util.c b/tools/perf/ui/gtk/util.c index c2c558958b9c..c47f5c387838 100644 --- a/tools/perf/ui/gtk/util.c +++ b/tools/perf/ui/gtk/util.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include "../util.h" -#include "../../util/debug.h" #include "gtk.h" #include diff --git a/tools/perf/ui/helpline.c b/tools/perf/ui/helpline.c index 54bcd08df87e..e00901450f91 100644 --- a/tools/perf/ui/helpline.c +++ b/tools/perf/ui/helpline.c @@ -3,7 +3,6 @@ #include #include -#include "../util/debug.h" #include "helpline.h" #include "ui.h" #include "../util/util.h" diff --git a/tools/perf/ui/tui/helpline.c b/tools/perf/ui/tui/helpline.c index 5f188f678c55..298d6af82fdd 100644 --- a/tools/perf/ui/tui/helpline.c +++ b/tools/perf/ui/tui/helpline.c @@ -6,7 +6,6 @@ #include #include -#include "../../util/debug.h" #include "../helpline.h" #include "../ui.h" #include "../libslang.h" diff --git a/tools/perf/ui/tui/util.c b/tools/perf/ui/tui/util.c index 087d9ab054c8..b98dd0e31dc1 100644 --- a/tools/perf/ui/tui/util.c +++ b/tools/perf/ui/tui/util.c @@ -5,7 +5,6 @@ #include #include -#include "../../util/debug.h" #include "../browser.h" #include "../keysyms.h" #include "../helpline.h" diff --git a/tools/perf/util/branch.c b/tools/perf/util/branch.c index 9d1e090084a2..52261e5f718a 100644 --- a/tools/perf/util/branch.c +++ b/tools/perf/util/branch.c @@ -1,5 +1,4 @@ #include "util/util.h" -#include "util/debug.h" #include "util/map_symbol.h" #include "util/branch.h" #include diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c index 763328c151e9..6fb7f34c0814 100644 --- a/tools/perf/util/demangle-java.c +++ b/tools/perf/util/demangle-java.c @@ -3,7 +3,6 @@ #include #include #include -#include "debug.h" #include "symbol.h" #include "demangle-java.h" diff --git a/tools/perf/util/libunwind/arm64.c b/tools/perf/util/libunwind/arm64.c index 66756e6be111..6b4e5a0892f8 100644 --- a/tools/perf/util/libunwind/arm64.c +++ b/tools/perf/util/libunwind/arm64.c @@ -22,7 +22,6 @@ #define LIBUNWIND__ARCH_REG_SP PERF_REG_ARM64_SP #include "unwind.h" -#include "debug.h" #include "libunwind-aarch64.h" #include <../../../../arch/arm64/include/uapi/asm/perf_regs.h> #include "../../arch/arm64/util/unwind-libunwind.c" diff --git a/tools/perf/util/libunwind/x86_32.c b/tools/perf/util/libunwind/x86_32.c index c5e568188e19..21c216c40a3b 100644 --- a/tools/perf/util/libunwind/x86_32.c +++ b/tools/perf/util/libunwind/x86_32.c @@ -22,7 +22,6 @@ #define LIBUNWIND__ARCH_REG_SP PERF_REG_X86_SP #include "unwind.h" -#include "debug.h" #include "libunwind-x86.h" #include <../../../../arch/x86/include/uapi/asm/perf_regs.h> diff --git a/tools/perf/util/target.c b/tools/perf/util/target.c index 565f7aef7e6c..e152d2bbe3a3 100644 --- a/tools/perf/util/target.c +++ b/tools/perf/util/target.c @@ -7,7 +7,6 @@ #include "target.h" #include "util.h" -#include "debug.h" #include #include diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c index 3949a60b00ae..196438ee4c9d 100644 --- a/tools/perf/util/usage.c +++ b/tools/perf/util/usage.c @@ -8,7 +8,6 @@ * Copyright (C) Linus Torvalds, 2005 */ #include "util.h" -#include "debug.h" #include #include #include diff --git a/tools/perf/util/zlib.c b/tools/perf/util/zlib.c index 59d456f716e9..deb6e69adb5a 100644 --- a/tools/perf/util/zlib.c +++ b/tools/perf/util/zlib.c @@ -10,8 +10,6 @@ #include "util/compress.h" #include "util/util.h" -#include "util/debug.h" - #define CHUNK_SIZE 16384 From fb71c86cc804b8f490fce1b9140014043ec41858 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Sep 2019 10:56:06 -0300 Subject: [PATCH 0660/2880] perf tools: Remove util.h from where it is not needed Check that it is not needed and remove, fixing up some fallout for places where it was only serving to get something else. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-9h6dg6lsqe2usyqjh5rrues4@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 2 +- tools/perf/arch/arm64/util/arm-spe.c | 2 +- tools/perf/arch/arm64/util/dwarf-regs.c | 1 - tools/perf/arch/powerpc/util/dwarf-regs.c | 1 - tools/perf/arch/powerpc/util/header.c | 1 - tools/perf/arch/s390/util/machine.c | 2 +- tools/perf/arch/x86/tests/intel-cqm.c | 1 - tools/perf/arch/x86/tests/rdpmc.c | 2 +- tools/perf/arch/x86/util/intel-bts.c | 2 +- tools/perf/arch/x86/util/intel-pt.c | 2 +- tools/perf/arch/x86/util/machine.c | 2 +- tools/perf/bench/sched-messaging.c | 1 - tools/perf/bench/sched-pipe.c | 1 - tools/perf/builtin-config.c | 1 - tools/perf/builtin-evlist.c | 2 -- tools/perf/builtin-report.c | 2 +- tools/perf/perf.c | 2 +- tools/perf/tests/clang.c | 1 - tools/perf/tests/dso-data.c | 1 - tools/perf/tests/event-times.c | 1 - tools/perf/tests/llvm.c | 1 - tools/perf/tests/mmap-thread-lookup.c | 2 +- tools/perf/tests/parse-events.c | 1 - tools/perf/tests/parse-no-sample-id-all.c | 2 -- tools/perf/tests/perf-hooks.c | 1 - tools/perf/tests/pmu.c | 1 - tools/perf/tests/sample-parsing.c | 1 - tools/perf/tests/topology.c | 1 - tools/perf/tests/vmlinux-kallsyms.c | 2 +- tools/perf/ui/browser.c | 1 - tools/perf/ui/browsers/annotate.c | 1 - tools/perf/ui/browsers/map.c | 1 - tools/perf/ui/browsers/res_sample.c | 2 +- tools/perf/ui/browsers/scripts.c | 2 +- tools/perf/ui/gtk/progress.c | 1 - tools/perf/ui/helpline.c | 1 - tools/perf/ui/hist.c | 1 - tools/perf/ui/setup.c | 2 +- tools/perf/ui/tui/setup.c | 2 +- tools/perf/util/annotate.c | 2 +- tools/perf/util/auxtrace.c | 4 +++- tools/perf/util/branch.c | 1 - tools/perf/util/branch.h | 9 ++++++++- tools/perf/util/build-id.c | 2 +- tools/perf/util/cloexec.c | 2 +- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 1 - tools/perf/util/cs-etm.c | 1 - tools/perf/util/data.c | 3 ++- tools/perf/util/debug.c | 1 - tools/perf/util/demangle-rust.c | 1 - tools/perf/util/dwarf-regs.c | 1 - tools/perf/util/evlist.c | 2 +- tools/perf/util/evsel.c | 1 + tools/perf/util/header.c | 3 ++- tools/perf/util/jitdump.c | 1 - tools/perf/util/llvm-utils.c | 1 + tools/perf/util/lzma.c | 2 +- tools/perf/util/perf-hooks.c | 1 - tools/perf/util/record.c | 1 - tools/perf/util/rwsem.c | 1 + tools/perf/util/s390-sample-raw.c | 1 - tools/perf/util/scripting-engines/trace-event-python.c | 1 - tools/perf/util/session.c | 1 + tools/perf/util/srccode.c | 2 +- tools/perf/util/symbol-elf.c | 2 ++ tools/perf/util/symbol-minimal.c | 3 +-- tools/perf/util/symbol.c | 2 +- tools/perf/util/target.c | 1 - tools/perf/util/trace-event-info.c | 2 +- tools/perf/util/trace-event-read.c | 1 - tools/perf/util/trace-event.c | 1 - tools/perf/util/unwind-libdw.c | 1 - tools/perf/util/unwind-libunwind-local.c | 1 - tools/perf/util/vdso.c | 2 +- tools/perf/util/zlib.c | 2 +- 75 files changed, 47 insertions(+), 73 deletions(-) diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index c32db09baf0d..5d120b1e35ed 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -25,7 +25,7 @@ #include "../../util/evsel.h" #include "../../util/pmu.h" #include "../../util/cs-etm.h" -#include "../../util/util.h" +#include "../../util/util.h" // page_size #include "../../util/session.h" #include diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 4b364692da67..eebbf31f995c 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -16,7 +16,7 @@ #include "../../util/evsel.h" #include "../../util/evlist.h" #include "../../util/session.h" -#include "../../util/util.h" +#include "../../util/util.h" // page_size #include "../../util/pmu.h" #include "../../util/debug.h" #include "../../util/auxtrace.h" diff --git a/tools/perf/arch/arm64/util/dwarf-regs.c b/tools/perf/arch/arm64/util/dwarf-regs.c index b047b882c5b1..917b97d7c5d3 100644 --- a/tools/perf/arch/arm64/util/dwarf-regs.c +++ b/tools/perf/arch/arm64/util/dwarf-regs.c @@ -11,7 +11,6 @@ #include #include /* for struct user_pt_regs */ #include -#include "util.h" struct pt_regs_dwarfnum { const char *name; diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c index 4952890b9428..0c4f4caf53ac 100644 --- a/tools/perf/arch/powerpc/util/dwarf-regs.c +++ b/tools/perf/arch/powerpc/util/dwarf-regs.c @@ -12,7 +12,6 @@ #include #include #include -#include "util.h" struct pt_regs_dwarfnum { const char *name; diff --git a/tools/perf/arch/powerpc/util/header.c b/tools/perf/arch/powerpc/util/header.c index 0b242664f5ea..b6b7bc7e31a1 100644 --- a/tools/perf/arch/powerpc/util/header.c +++ b/tools/perf/arch/powerpc/util/header.c @@ -6,7 +6,6 @@ #include #include #include "header.h" -#include "util.h" #define mfspr(rn) ({unsigned long rval; \ asm volatile("mfspr %0," __stringify(rn) \ diff --git a/tools/perf/arch/s390/util/machine.c b/tools/perf/arch/s390/util/machine.c index c8c86a0c9b79..df099d6cc3f5 100644 --- a/tools/perf/arch/s390/util/machine.c +++ b/tools/perf/arch/s390/util/machine.c @@ -2,7 +2,7 @@ #include #include #include -#include "util.h" +#include "util.h" // page_size #include "machine.h" #include "api/fs/fs.h" #include "debug.h" diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c index 3b5cc3373821..111c0ab2e7b5 100644 --- a/tools/perf/arch/x86/tests/intel-cqm.c +++ b/tools/perf/arch/x86/tests/intel-cqm.c @@ -5,7 +5,6 @@ #include "evlist.h" #include "evsel.h" #include "arch-tests.h" -#include "util.h" #include #include diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c index 6e67cee792b1..e7640fb047de 100644 --- a/tools/perf/arch/x86/tests/rdpmc.c +++ b/tools/perf/arch/x86/tests/rdpmc.c @@ -13,7 +13,7 @@ #include "tests/tests.h" #include "cloexec.h" #include "event.h" -#include "util.h" +#include "util.h" // page_size #include "arch-tests.h" static u64 rdpmc(unsigned int counter) diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index d263430c045f..090d90e093df 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -22,7 +22,7 @@ #include "../../util/tsc.h" #include "../../util/auxtrace.h" #include "../../util/intel-bts.h" -#include "../../util/util.h" +#include "../../util/util.h" // page_size #define KiB(x) ((x) * 1024) #define MiB(x) ((x) * 1024 * 1024) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index cb7cf16af79c..3d041b89f018 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -26,7 +26,7 @@ #include "../../util/record.h" #include "../../util/target.h" #include "../../util/tsc.h" -#include "../../util/util.h" +#include "../../util/util.h" // page_size #include "../../util/intel-pt.h" #define KiB(x) ((x) * 1024) diff --git a/tools/perf/arch/x86/util/machine.c b/tools/perf/arch/x86/util/machine.c index 1e9ec783b9a1..42418040bc07 100644 --- a/tools/perf/arch/x86/util/machine.c +++ b/tools/perf/arch/x86/util/machine.c @@ -3,7 +3,7 @@ #include #include -#include "../../util/util.h" +#include "../../util/util.h" // page_size #include "../../util/machine.h" #include "../../util/map.h" #include "../../util/symbol.h" diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c index 6e499b32bf00..97e4a4fb3362 100644 --- a/tools/perf/bench/sched-messaging.c +++ b/tools/perf/bench/sched-messaging.c @@ -10,7 +10,6 @@ * */ -#include "../util/util.h" #include #include "bench.h" diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c index edd40aafa318..3c88d1f201f1 100644 --- a/tools/perf/bench/sched-pipe.c +++ b/tools/perf/bench/sched-pipe.c @@ -9,7 +9,6 @@ * http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c * Ported to perf by Hitoshi Mitake */ -#include "../util/util.h" #include #include "bench.h" diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index 42d8157e047a..2603015f98be 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -9,7 +9,6 @@ #include "util/cache.h" #include -#include "util/util.h" #include "util/debug.h" #include "util/config.h" #include diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 238fa3876805..294494e60e48 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -5,8 +5,6 @@ */ #include "builtin.h" -#include "util/util.h" - #include #include "perf.h" diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index b18fab94d38d..3047e5169d9d 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -48,7 +48,7 @@ #include "util/auxtrace.h" #include "util/units.h" #include "util/branch.h" -#include "util/util.h" +#include "util/util.h" // perf_tip() #include "ui/ui.h" #include "ui/progress.h" diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 1193b923e801..bb40a108b8f7 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -20,7 +20,7 @@ #include "util/bpf-loader.h" #include "util/debug.h" #include "util/event.h" -#include "util/util.h" +#include "util/util.h" // page_size, usage() #include "ui/ui.h" #include "perf-sys.h" #include diff --git a/tools/perf/tests/clang.c b/tools/perf/tests/clang.c index ff2711a40940..2577d3ed1531 100644 --- a/tools/perf/tests/clang.c +++ b/tools/perf/tests/clang.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include "tests.h" -#include "util.h" #include "c++/clang-c.h" #include diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c index a4874d4ce7ef..627c1aaf1c9e 100644 --- a/tools/perf/tests/dso-data.c +++ b/tools/perf/tests/dso-data.c @@ -10,7 +10,6 @@ #include #include #include "dso.h" -#include "util.h" #include "machine.h" #include "symbol.h" #include "tests.h" diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c index d824a726906c..0228ba435a2a 100644 --- a/tools/perf/tests/event-times.c +++ b/tools/perf/tests/event-times.c @@ -9,7 +9,6 @@ #include "tests.h" #include "evlist.h" #include "evsel.h" -#include "util.h" #include "debug.h" #include "parse-events.h" #include "thread_map.h" diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index 022e4c9cf092..ae6cda81c209 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -7,7 +7,6 @@ #include "llvm.h" #include "tests.h" #include "debug.h" -#include "util.h" #ifdef HAVE_LIBBPF_SUPPORT static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz) diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c index 360d70deb855..f72889c13538 100644 --- a/tools/perf/tests/mmap-thread-lookup.c +++ b/tools/perf/tests/mmap-thread-lookup.c @@ -14,7 +14,7 @@ #include "map.h" #include "symbol.h" #include "thread.h" -#include "util.h" +#include "util.h" // page_size #define THREADS 4 diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 02ba696fb87f..c25c8e7b41e5 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -6,7 +6,6 @@ #include "tests.h" #include "debug.h" #include "pmu.h" -#include "util.h" #include #include #include diff --git a/tools/perf/tests/parse-no-sample-id-all.c b/tools/perf/tests/parse-no-sample-id-all.c index 8284752a60c8..adf3c9c4a416 100644 --- a/tools/perf/tests/parse-no-sample-id-all.c +++ b/tools/perf/tests/parse-no-sample-id-all.c @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: GPL-2.0 #include #include #include @@ -8,7 +7,6 @@ #include "event.h" #include "evlist.h" #include "header.h" -#include "util.h" #include "debug.h" static int process_event(struct evlist **pevlist, union perf_event *event) diff --git a/tools/perf/tests/perf-hooks.c b/tools/perf/tests/perf-hooks.c index a693bcf017ea..dbc27199c65e 100644 --- a/tools/perf/tests/perf-hooks.c +++ b/tools/perf/tests/perf-hooks.c @@ -4,7 +4,6 @@ #include "tests.h" #include "debug.h" -#include "util.h" #include "perf-hooks.h" static void sigsegv_handler(int sig __maybe_unused) diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 14a78898d79e..74379ff1f7fa 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "parse-events.h" #include "pmu.h" -#include "util.h" #include "tests.h" #include #include diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 5fcc06817076..e3b965e7a233 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -9,7 +9,6 @@ #include "map_symbol.h" #include "branch.h" -#include "util.h" #include "event.h" #include "evsel.h" #include "debug.h" diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index a4f9f5182b47..c963f301d15e 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -4,7 +4,6 @@ #include #include #include "tests.h" -#include "util.h" #include "session.h" #include "evlist.h" #include "debug.h" diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 01f434c067c6..7f28775875c2 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -7,7 +7,7 @@ #include "dso.h" #include "map.h" #include "symbol.h" -#include "util.h" +#include "util.h" // page_size #include "tests.h" #include "debug.h" #include "machine.h" diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index f93d40b1c203..781afe42e90e 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include "../util/util.h" #include "../util/string2.h" #include "../util/config.h" #include "libslang.h" diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ac74ed2c23a0..82207db8f97c 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -2,7 +2,6 @@ #include "../browser.h" #include "../helpline.h" #include "../ui.h" -#include "../util.h" #include "../../util/annotate.h" #include "../../util/debug.h" #include "../../util/dso.h" diff --git a/tools/perf/ui/browsers/map.c b/tools/perf/ui/browsers/map.c index 893b065971f6..3d49b916c9e4 100644 --- a/tools/perf/ui/browsers/map.c +++ b/tools/perf/ui/browsers/map.c @@ -5,7 +5,6 @@ #include #include #include -#include "../../util/util.h" #include "../../util/debug.h" #include "../../util/map.h" #include "../../util/dso.h" diff --git a/tools/perf/ui/browsers/res_sample.c b/tools/perf/ui/browsers/res_sample.c index f16a38fea45e..76d356a18790 100644 --- a/tools/perf/ui/browsers/res_sample.c +++ b/tools/perf/ui/browsers/res_sample.c @@ -7,7 +7,7 @@ #include "config.h" #include "time-utils.h" #include "../util.h" -#include "../../util/util.h" +#include "../../util/util.h" // perf_exe() #include "../../perf.h" #include #include diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index 3b81075b8da8..fc733a6354d4 100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "../../builtin.h" #include "../../perf.h" -#include "../../util/util.h" +#include "../../util/util.h" // perf_exe() #include "../util.h" #include "../../util/hist.h" #include "../../util/debug.h" diff --git a/tools/perf/ui/gtk/progress.c b/tools/perf/ui/gtk/progress.c index b6ad8857da78..eea6fcde518a 100644 --- a/tools/perf/ui/gtk/progress.c +++ b/tools/perf/ui/gtk/progress.c @@ -3,7 +3,6 @@ #include "gtk.h" #include "../progress.h" -#include "util.h" static GtkWidget *dialog; static GtkWidget *progress; diff --git a/tools/perf/ui/helpline.c b/tools/perf/ui/helpline.c index e00901450f91..911182b3f5e6 100644 --- a/tools/perf/ui/helpline.c +++ b/tools/perf/ui/helpline.c @@ -5,7 +5,6 @@ #include "helpline.h" #include "ui.h" -#include "../util/util.h" char ui_helpline__current[512]; diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 3e533de7d852..f73675500061 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -8,7 +8,6 @@ #include "../util/callchain.h" #include "../util/debug.h" #include "../util/hist.h" -#include "../util/util.h" #include "../util/sort.h" #include "../util/evsel.h" #include "../util/evlist.h" diff --git a/tools/perf/ui/setup.c b/tools/perf/ui/setup.c index c7a86b4be9f5..700335cde618 100644 --- a/tools/perf/ui/setup.c +++ b/tools/perf/ui/setup.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include "../util/debug.h" #include "../util/hist.h" -#include "../util/util.h" #include "ui.h" pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER; diff --git a/tools/perf/ui/tui/setup.c b/tools/perf/ui/tui/setup.c index 56651a4f5aa0..e9bfe856a5de 100644 --- a/tools/perf/ui/tui/setup.c +++ b/tools/perf/ui/tui/setup.c @@ -2,13 +2,13 @@ #include #include #include +#include #include #ifdef HAVE_BACKTRACE_SUPPORT #include #endif #include "../../util/debug.h" -#include "../../util/util.h" #include "../../perf.h" #include "../browser.h" #include "../helpline.h" diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 1748f528b6e9..d441cca6a517 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -14,7 +14,7 @@ #include #include #include -#include "util.h" +#include "util.h" // hex_width() #include "ui/ui.h" #include "sort.h" #include "build-id.h" diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 6f25224a3def..7ec0a6caa6cd 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -50,10 +50,12 @@ #include "intel-bts.h" #include "arm-spe.h" #include "s390-cpumsf.h" -#include "util.h" +#include "util.h" // page_size #include +#include #include "symbol/kallsyms.h" +#include static bool auxtrace__dont_decode(struct perf_session *session) { diff --git a/tools/perf/util/branch.c b/tools/perf/util/branch.c index 52261e5f718a..2285b1eb3128 100644 --- a/tools/perf/util/branch.c +++ b/tools/perf/util/branch.c @@ -1,4 +1,3 @@ -#include "util/util.h" #include "util/map_symbol.h" #include "util/branch.h" #include diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 06f66dad0b79..88e00d268f6f 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -1,8 +1,15 @@ #ifndef _PERF_BRANCH_H #define _PERF_BRANCH_H 1 - +/* + * The linux/stddef.h isn't need here, but is needed for __always_inline used + * in files included from uapi/linux/perf_event.h such as + * /usr/include/linux/swab.h and /usr/include/linux/byteorder/little_endian.h, + * detected in at least musl libc, used in Alpine Linux. -acme + */ #include #include +#include +#include #include #include diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index e5fb77755d9e..7928c398a063 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -7,7 +7,7 @@ * Copyright (C) 2009, 2010 Red Hat Inc. * Copyright (C) 2009, 2010 Arnaldo Carvalho de Melo */ -#include "util.h" +#include "util.h" // copyfile_ns(), lsdir(), mkdir_p(), rm_rf() #include #include #include diff --git a/tools/perf/util/cloexec.c b/tools/perf/util/cloexec.c index 4e904fcb2783..a12872f2856a 100644 --- a/tools/perf/util/cloexec.c +++ b/tools/perf/util/cloexec.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include "util.h" +#include "util.h" // for sched_getcpu() #include "../perf-sys.h" #include "cloexec.h" #include "event.h" diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 37d7c492b155..cd92a99eb89d 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -17,7 +17,6 @@ #include "cs-etm.h" #include "cs-etm-decoder.h" #include "intlist.h" -#include "util.h" /* use raw logging */ #ifdef CS_DEBUG_RAW diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 707afdbd9529..f87b9c1c9f9a 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -35,7 +35,6 @@ #include "thread.h" #include "thread-stack.h" #include -#include "util.h" #define MAX_TIMESTAMP (~0ULL) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index e75c3a279fe8..88fba2ba549f 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -13,9 +13,10 @@ #include #include "data.h" -#include "util.h" +#include "util.h" // rm_rf_perf_data() #include "debug.h" #include "header.h" +#include static void close_dir(struct perf_data_file *files, int nr) { diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index a1b59bd35519..e55114f0336f 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -17,7 +17,6 @@ #include "event.h" #include "debug.h" #include "print_binary.h" -#include "util.h" #include "target.h" #include "ui/helpline.h" #include "ui/ui.h" diff --git a/tools/perf/util/demangle-rust.c b/tools/perf/util/demangle-rust.c index 423afbbd386b..a659fc69f73a 100644 --- a/tools/perf/util/demangle-rust.c +++ b/tools/perf/util/demangle-rust.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include "util.h" #include "debug.h" #include "demangle-rust.h" diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index db55eddce8cd..1b49ecee5aff 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -5,7 +5,6 @@ * Written by: Masami Hiramatsu */ -#include #include #include #include diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 095924aa186b..ea9517d506d8 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -16,7 +16,7 @@ #include "evsel.h" #include "debug.h" #include "units.h" -#include "util.h" +#include "util.h" // page_size #include "../perf.h" #include "asm/bug.h" #include "bpf-event.h" diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 85825384f9e8..c194ec787f96 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -45,6 +45,7 @@ #include "../perf-sys.h" #include "util/parse-branch-options.h" #include +#include #include diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index b0c34dda30a0..d85827de1b60 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -42,11 +42,12 @@ #include "tool.h" #include "time-utils.h" #include "units.h" -#include "util.h" +#include "util.h" // page_size, perf_exe() #include "cputopo.h" #include "bpf-event.h" #include +#include /* * magic2 = "PERFILE2" diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 00db9957fdb0..9f9c6c6a2fd3 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -15,7 +15,6 @@ #include #include "build-id.h" -#include "util.h" #include "event.h" #include "debug.h" #include "evlist.h" diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index 55fb4b3b1157..8d04e3d070b1 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/perf/util/lzma.c b/tools/perf/util/lzma.c index 397447066033..39062df02629 100644 --- a/tools/perf/util/lzma.c +++ b/tools/perf/util/lzma.c @@ -7,10 +7,10 @@ #include #include #include "compress.h" -#include "util.h" #include "debug.h" #include #include +#include #define BUFSIZE 8192 diff --git a/tools/perf/util/perf-hooks.c b/tools/perf/util/perf-hooks.c index e635c594f773..7a0ab3507bd5 100644 --- a/tools/perf/util/perf-hooks.c +++ b/tools/perf/util/perf-hooks.c @@ -12,7 +12,6 @@ #include #include #include -#include "util/util.h" #include "util/debug.h" #include "util/perf-hooks.h" diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 286fe816c0f3..860c48895ab6 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -10,7 +10,6 @@ #include #include #include -#include "util.h" #include "cloexec.h" #include "record.h" #include "../perf-sys.h" diff --git a/tools/perf/util/rwsem.c b/tools/perf/util/rwsem.c index 5e52e7baa7b6..f3d29d8ddc99 100644 --- a/tools/perf/util/rwsem.c +++ b/tools/perf/util/rwsem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "util.h" #include "rwsem.h" diff --git a/tools/perf/util/s390-sample-raw.c b/tools/perf/util/s390-sample-raw.c index 4d9593e331ea..05b43ab4eeef 100644 --- a/tools/perf/util/s390-sample-raw.c +++ b/tools/perf/util/s390-sample-raw.c @@ -22,7 +22,6 @@ #include #include "debug.h" -#include "util.h" #include "session.h" #include "evlist.h" #include "color.h" diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 666a56e88d8e..bb6828532741 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -37,7 +37,6 @@ #include "../dso.h" #include "../callchain.h" #include "../evsel.h" -#include "../util.h" #include "../event.h" #include "../thread.h" #include "../comm.h" diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index e9e4a04f15db..2b133bc22caa 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -34,6 +34,7 @@ #include "ui/progress.h" #include "../perf.h" #include "arch/common.h" +#include #ifdef HAVE_ZSTD_SUPPORT static int perf_session__process_compressed_event(struct perf_session *session, diff --git a/tools/perf/util/srccode.c b/tools/perf/util/srccode.c index adfcf1ff464c..b402f9ca89ab 100644 --- a/tools/perf/util/srccode.c +++ b/tools/perf/util/srccode.c @@ -15,7 +15,7 @@ #include #include "srccode.h" #include "debug.h" -#include "util.h" +#include "util.h" // page_size #define MAXSRCCACHE (32*1024*1024) #define MAXSRCFILES 64 diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 9428639872a6..8b9199c343dd 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -18,8 +18,10 @@ #include "debug.h" #include "util.h" #include +#include #include #include +#include #ifndef EM_AARCH64 #define EM_AARCH64 183 /* ARM 64 bit */ diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index 7e2813ec9498..d6e99af263ec 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -1,8 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0 #include "dso.h" #include "symbol.h" #include "symsrc.h" -#include "util.h" #include #include @@ -13,6 +11,7 @@ #include #include #include +#include static bool check_need_swap(int file_endian) { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 765c75df2904..a8f80e427674 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -19,7 +19,7 @@ #include "build-id.h" #include "cap.h" #include "dso.h" -#include "util.h" +#include "util.h" // lsdir() #include "debug.h" #include "event.h" #include "machine.h" diff --git a/tools/perf/util/target.c b/tools/perf/util/target.c index e152d2bbe3a3..a3db13dea937 100644 --- a/tools/perf/util/target.c +++ b/tools/perf/util/target.c @@ -6,7 +6,6 @@ */ #include "target.h" -#include "util.h" #include #include diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index d63d542b2cde..fe07e7550930 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -2,7 +2,7 @@ /* * Copyright (C) 2008,2009, Steven Rostedt */ -#include "util.h" +#include "util.h" // page_size #include #include #include diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index b6c0db068be0..8593d3c200c6 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -15,7 +15,6 @@ #include #include -#include "util.h" #include "trace-event.h" #include "debug.h" diff --git a/tools/perf/util/trace-event.c b/tools/perf/util/trace-event.c index 01b9d89bf5bf..b3ee651e3d91 100644 --- a/tools/perf/util/trace-event.c +++ b/tools/perf/util/trace-event.c @@ -14,7 +14,6 @@ #include #include "trace-event.h" #include "machine.h" -#include "util.h" /* * global trace_event object used by trace_event__tp_format diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 9ece188ae48a..15f6e46d7124 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -17,7 +17,6 @@ #include "event.h" #include "perf_regs.h" #include "callchain.h" -#include "util.h" static char *debuginfo_path; diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index ebdbb056510c..1800887b2255 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -37,7 +37,6 @@ #include "unwind.h" #include "map.h" #include "symbol.h" -#include "util.h" #include "debug.h" #include "asm/bug.h" #include "dso.h" diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index e5e6599603f4..ba4b4395f35d 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -11,7 +11,7 @@ #include "vdso.h" #include "dso.h" -#include "util.h" +#include #include "map.h" #include "symbol.h" #include "machine.h" diff --git a/tools/perf/util/zlib.c b/tools/perf/util/zlib.c index deb6e69adb5a..78d2297c1b67 100644 --- a/tools/perf/util/zlib.c +++ b/tools/perf/util/zlib.c @@ -7,9 +7,9 @@ #include #include #include +#include #include "util/compress.h" -#include "util/util.h" #define CHUNK_SIZE 16384 From 36f3f450a8dc02d9ba0d31bc31d5b329759ed855 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Sep 2019 16:16:27 +0100 Subject: [PATCH 0661/2880] perf probe: Add missing build-id.h header. It uses things defined in that header and was getting it only indirectly, thru dso.h, fix it. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-7u3sf4j5huhi3mqa1q77524b@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index d13db55a2feb..b659466ea498 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -16,6 +16,7 @@ #include "strlist.h" #include "strfilter.h" #include "debug.h" +#include "build-id.h" #include "dso.h" #include "color.h" #include "symbol.h" From 09aa3b002c8cfcea65b2925ec4ff5a6ccdc44d87 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Sep 2019 16:17:19 +0100 Subject: [PATCH 0662/2880] perf symbols: Add missing dso.h header This was being obtained only indirectly, by luck. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-xeolxwr3iftwfw9kmw26shfe@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-elf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 8b9199c343dd..6fbfdf8bf61f 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -7,6 +7,7 @@ #include #include +#include "dso.h" #include "map.h" #include "map_groups.h" #include "symbol.h" From 87ffb6c6407023419ae6b2770142b0754d9cbaa1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Sep 2019 16:29:02 +0100 Subject: [PATCH 0663/2880] perf env: Remove needless cpumap.h header Only a 'struct perf_cmp_map' forward allocation is necessary, fix the places that need the header but were getting it indirectly, by luck, from env.h. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-3sj3n534zghxhk7ygzeaqlx9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/header.c | 4 +++- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 1 - tools/perf/bench/epoll-ctl.c | 2 +- tools/perf/bench/epoll-wait.c | 2 +- tools/perf/bench/futex-hash.c | 2 +- tools/perf/bench/futex-lock-pi.c | 2 +- tools/perf/bench/futex-requeue.c | 2 +- tools/perf/bench/futex-wake-parallel.c | 3 ++- tools/perf/bench/futex-wake.c | 2 +- tools/perf/builtin-c2c.c | 1 + tools/perf/builtin-sched.c | 2 ++ tools/perf/tests/bitmap.c | 2 +- tools/perf/tests/code-reading.c | 1 - tools/perf/tests/event_update.c | 1 + tools/perf/tests/keep-tracking.c | 3 +-- tools/perf/tests/mem2node.c | 2 +- tools/perf/tests/mmap-basic.c | 3 +-- tools/perf/tests/openat-syscall-all-cpus.c | 5 +++-- tools/perf/tests/switch-tracking.c | 1 - tools/perf/tests/task-exit.c | 2 +- tools/perf/tests/topology.c | 1 + tools/perf/util/arm-spe.c | 1 - tools/perf/util/auxtrace.c | 1 - tools/perf/util/env.h | 3 ++- tools/perf/util/event.c | 2 ++ tools/perf/util/evsel.c | 2 +- tools/perf/util/intel-bts.c | 1 - tools/perf/util/parse-events.c | 1 - tools/perf/util/pmu.c | 1 - tools/perf/util/python.c | 1 - tools/perf/util/record.c | 1 - tools/perf/util/s390-cpumsf.c | 1 - tools/perf/util/scripting-engines/trace-event-python.c | 1 - tools/perf/util/session.c | 1 - tools/perf/util/stat.c | 1 + tools/perf/util/svghelper.c | 2 +- tools/perf/util/top.c | 1 - 37 files changed, 31 insertions(+), 34 deletions(-) diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c index e41defaaa2e6..a32e4b72a98f 100644 --- a/tools/perf/arch/arm64/util/header.c +++ b/tools/perf/arch/arm64/util/header.c @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include "debug.h" #include "header.h" @@ -29,7 +31,7 @@ char *get_cpuid_str(struct perf_pmu *pmu) /* read midr from list of cpus mapped to this pmu */ cpus = perf_cpu_map__get(pmu->cpus); - for (cpu = 0; cpu < cpus->nr; cpu++) { + for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) { scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d"MIDR, sysfs, cpus->map[cpu]); diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index eb3635941c2b..0a4570b340fa 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -15,7 +15,6 @@ #include "evlist.h" #include "evsel.h" #include "thread_map.h" -#include "cpumap.h" #include "record.h" #include "tsc.h" #include "tests/tests.h" diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index d1caa4a0a12a..bb617e568841 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -21,12 +21,12 @@ #include #include #include +#include #include #include "../util/stat.h" #include #include "bench.h" -#include "cpumap.h" #include diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index f6b4472847d2..7af694437f4e 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -76,12 +76,12 @@ #include #include #include +#include #include #include "../util/stat.h" #include #include "bench.h" -#include "cpumap.h" #include diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index 80e138904c66..8ba0c3330a9a 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -20,13 +20,13 @@ #include #include #include +#include #include #include "../util/stat.h" #include #include "bench.h" #include "futex.h" -#include "cpumap.h" #include diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index c5d6d0abbaa9..d0cae8125423 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -14,10 +14,10 @@ #include #include #include +#include #include #include "bench.h" #include "futex.h" -#include "cpumap.h" #include #include diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index 75d3418c1a88..a00a6891447a 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -20,10 +20,10 @@ #include #include #include +#include #include #include "bench.h" #include "futex.h" -#include "cpumap.h" #include #include diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index 163fe16c275a..a053cf2b7039 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -29,7 +29,8 @@ int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe #include #include #include "futex.h" -#include "cpumap.h" +#include +#include #include #include diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index 77dcdc13618a..df810096abfe 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -20,10 +20,10 @@ #include #include #include +#include #include #include "bench.h" #include "futex.h" -#include "cpumap.h" #include #include diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index b09b12e0976b..61aaacc2aedd 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -20,6 +20,7 @@ #include #include "debug.h" #include "builtin.h" +#include #include #include #include "map_symbol.h" diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index ec96d64aec69..511e19a7aafa 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3,6 +3,7 @@ #include "perf.h" #include "perf-sys.h" +#include "util/cpumap.h" #include "util/evlist.h" #include "util/evsel.h" #include "util/symbol.h" @@ -36,6 +37,7 @@ #include #include #include +#include #include #include diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c index db2aadff3708..96c137360918 100644 --- a/tools/perf/tests/bitmap.c +++ b/tools/perf/tests/bitmap.c @@ -2,8 +2,8 @@ #include #include #include +#include #include "tests.h" -#include "cpumap.h" #include "debug.h" #define NBITS 100 diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index c1c29e08e7fb..8d9020c46ca9 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -19,7 +19,6 @@ #include "evlist.h" #include "evsel.h" #include "thread_map.h" -#include "cpumap.h" #include "machine.h" #include "map.h" #include "symbol.h" diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index cac4290e233a..317eb8c5ccd4 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -2,6 +2,7 @@ #include #include #include +#include "cpumap.h" #include "evlist.h" #include "evsel.h" #include "header.h" diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index 9f0762d987fa..df0fd5a44e04 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -12,7 +12,6 @@ #include "evsel.h" #include "record.h" #include "thread_map.h" -#include "cpumap.h" #include "tests.h" #define CHECK__(x) { \ @@ -143,7 +142,7 @@ int test__keep_tracking(struct test *test __maybe_unused, int subtest __maybe_un found = find_comm(evlist, comm); if (found != 1) { - pr_debug("Seconf time, failed to find tracking event.\n"); + pr_debug("Second time, failed to find tracking event.\n"); goto out_err; } diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c index 7672ade70f20..a258bd51f1a4 100644 --- a/tools/perf/tests/mem2node.c +++ b/tools/perf/tests/mem2node.c @@ -4,7 +4,7 @@ #include #include #include -#include "cpumap.h" +#include #include "debug.h" #include "env.h" #include "mem2node.h" diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 85e1d7337dc0..042757629e90 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -10,7 +10,6 @@ #include "evlist.h" #include "evsel.h" #include "thread_map.h" -#include "cpumap.h" #include "tests.h" #include #include @@ -53,7 +52,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse cpus = perf_cpu_map__new(NULL); if (cpus == NULL) { - pr_debug("cpu_map__new\n"); + pr_debug("perf_cpu_map__new\n"); goto out_free_threads; } diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c index 9171f77cd9cd..93c176523e38 100644 --- a/tools/perf/tests/openat-syscall-all-cpus.c +++ b/tools/perf/tests/openat-syscall-all-cpus.c @@ -14,7 +14,8 @@ #include "evsel.h" #include "tests.h" #include "thread_map.h" -#include "cpumap.h" +#include +#include #include "debug.h" #include "stat.h" #include "util/counts.h" @@ -37,7 +38,7 @@ int test__openat_syscall_event_on_all_cpus(struct test *test __maybe_unused, int cpus = perf_cpu_map__new(NULL); if (cpus == NULL) { - pr_debug("cpu_map__new\n"); + pr_debug("perf_cpu_map__new\n"); goto out_thread_map_delete; } diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 1a60fa1219f5..3fb1ff7b8a2f 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -14,7 +14,6 @@ #include "evlist.h" #include "evsel.h" #include "thread_map.h" -#include "cpumap.h" #include "record.h" #include "tests.h" diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index f610e8c0a083..088c7708b03a 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -4,12 +4,12 @@ #include "evsel.h" #include "target.h" #include "thread_map.h" -#include "cpumap.h" #include "tests.h" #include #include #include +#include #include static int exited; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index c963f301d15e..7d845d913d7d 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -3,6 +3,7 @@ #include #include #include +#include "cpumap.h" #include "tests.h" #include "session.h" #include "evlist.h" diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 8a7340f6a2a2..53be12b23ff4 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -16,7 +16,6 @@ #include #include -#include "cpumap.h" #include "color.h" #include "evsel.h" #include "machine.h" diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 7ec0a6caa6cd..1c0ff5acff83 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -31,7 +31,6 @@ #include "map.h" #include "pmu.h" #include "evsel.h" -#include "cpumap.h" #include "symbol.h" #include "thread_map.h" #include "asm/bug.h" diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index d8e083d42610..db40906e2937 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -4,9 +4,10 @@ #include #include -#include "cpumap.h" #include "rwsem.h" +struct perf_cpu_map; + struct cpu_topology_map { int socket_id; int die_id; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index f4afbb858ebb..d65ae7cf9316 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include "cpumap.h" #include "dso.h" #include "event.h" #include "debug.h" diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index c194ec787f96..5b40b840624c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -31,7 +31,7 @@ #include "event.h" #include "evsel.h" #include "evlist.h" -#include "cpumap.h" +#include #include "thread_map.h" #include "target.h" #include "perf_regs.h" diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index aacffa2b0362..15f87a09f4fe 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -14,7 +14,6 @@ #include #include -#include "cpumap.h" #include "color.h" #include "evsel.h" #include "evlist.h" diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 5ec21d21113c..a33a1d5059a2 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -30,7 +30,6 @@ #include "parse-events-flex.h" #include "pmu.h" #include "thread_map.h" -#include "cpumap.h" #include "probe-file.h" #include "asm/bug.h" #include "util/parse-branch-options.h" diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index fb597fa94234..5608da82ad23 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -20,7 +20,6 @@ #include "debug.h" #include "pmu.h" #include "parse-events.h" -#include "cpumap.h" #include "header.h" #include "pmu-events/pmu-events.h" #include "string2.h" diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 07ca4535e6f7..96dd3333c911 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -11,7 +11,6 @@ #include "callchain.h" #include "evsel.h" #include "event.h" -#include "cpumap.h" #include "print_binary.h" #include "thread_map.h" #include "trace-event.h" diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 860c48895ab6..8a015fc0aba0 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -2,7 +2,6 @@ #include "debug.h" #include "evlist.h" #include "evsel.h" -#include "cpumap.h" #include "parse-events.h" #include #include diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index 24a99909d8b3..6785cd87aa4d 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -151,7 +151,6 @@ #include #include -#include "cpumap.h" #include "color.h" #include "evsel.h" #include "evlist.h" diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index bb6828532741..5d341efc3237 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -48,7 +48,6 @@ #include "map.h" #include "symbol.h" #include "thread_map.h" -#include "cpumap.h" #include "print_binary.h" #include "stat.h" #include "mem-events.h" diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 2b133bc22caa..2b583e6adb49 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -22,7 +22,6 @@ #include "symbol.h" #include "session.h" #include "tool.h" -#include "cpumap.h" #include "perf_regs.h" #include "asm/bug.h" #include "auxtrace.h" diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 8f1ea27f976f..d309c1cc13db 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -4,6 +4,7 @@ #include #include #include "counts.h" +#include "cpumap.h" #include "debug.h" #include "header.h" #include "stat.h" diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index 582f4a69cd48..96f941e01681 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -17,11 +17,11 @@ #include #include #include +#include #include #include "env.h" #include "svghelper.h" -#include "cpumap.h" static u64 first_time, last_time; static u64 turbo_frequency, max_freq; diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index 51fb574998bb..ef96e3dd6902 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -5,7 +5,6 @@ * Refactored from builtin-top.c, see that files for further copyright notes. */ -#include "cpumap.h" #include "event.h" #include "evlist.h" #include "evsel.h" From 278306163882a3557fb2c69fd2cc632a2f9ef601 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Sep 2019 16:42:52 +0100 Subject: [PATCH 0664/2880] perf event: Move perf_event__synthesize* to event.h Where is the perf_event__handler_t typedef they need, which was the only reason for header.h to be including event.h, untangle that. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-outjyzh1o29ndcv9lsqyzt87@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.h | 36 ++++++++++++++++++++++++++++++++++ tools/perf/util/header.h | 42 +++------------------------------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 47ad81d47b1a..4e6d33c76d57 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -279,6 +279,9 @@ enum { void perf_event__print_totals(void); +struct evlist; +struct evsel; +struct perf_session; struct perf_tool; struct perf_thread_map; struct perf_cpu_map; @@ -290,6 +293,39 @@ typedef int (*perf_event__handler_t)(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine); +int perf_event__synthesize_attr(struct perf_tool *tool, + struct perf_event_attr *attr, u32 ids, u64 *id, + perf_event__handler_t process); +int perf_event__synthesize_attrs(struct perf_tool *tool, + struct evlist *evlist, + perf_event__handler_t process); +int perf_event__synthesize_build_id(struct perf_tool *tool, + struct dso *pos, u16 misc, + perf_event__handler_t process, + struct machine *machine); +int perf_event__synthesize_extra_attr(struct perf_tool *tool, + struct evlist *evsel_list, + perf_event__handler_t process, + bool is_pipe); +int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, + struct evsel *evsel, + perf_event__handler_t process); +int perf_event__synthesize_event_update_name(struct perf_tool *tool, + struct evsel *evsel, + perf_event__handler_t process); +int perf_event__synthesize_event_update_scale(struct perf_tool *tool, + struct evsel *evsel, + perf_event__handler_t process); +int perf_event__synthesize_event_update_unit(struct perf_tool *tool, + struct evsel *evsel, + perf_event__handler_t process); +int perf_event__synthesize_features(struct perf_tool *tool, + struct perf_session *session, + struct evlist *evlist, + perf_event__handler_t process); +int perf_event__synthesize_tracing_data(struct perf_tool *tool, + int fd, struct evlist *evlist, + perf_event__handler_t process); int perf_event__synthesize_thread_map(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 3e48ae3c49b1..999dac41871e 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -5,10 +5,10 @@ #include #include #include +#include // FILE #include #include #include -#include "event.h" #include "env.h" #include "pmu.h" @@ -94,6 +94,8 @@ struct perf_header { struct evlist; struct perf_session; +struct perf_tool; +union perf_event; int perf_session__read_header(struct perf_session *session); int perf_session__write_header(struct perf_session *session, @@ -115,54 +117,16 @@ int perf_header__process_sections(struct perf_header *header, int fd, int perf_header__fprintf_info(struct perf_session *s, FILE *fp, bool full); -int perf_event__synthesize_features(struct perf_tool *tool, - struct perf_session *session, - struct evlist *evlist, - perf_event__handler_t process); - -int perf_event__synthesize_extra_attr(struct perf_tool *tool, - struct evlist *evsel_list, - perf_event__handler_t process, - bool is_pipe); - int perf_event__process_feature(struct perf_session *session, union perf_event *event); - -int perf_event__synthesize_attr(struct perf_tool *tool, - struct perf_event_attr *attr, u32 ids, u64 *id, - perf_event__handler_t process); -int perf_event__synthesize_attrs(struct perf_tool *tool, - struct evlist *evlist, - perf_event__handler_t process); -int perf_event__synthesize_event_update_unit(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process); -int perf_event__synthesize_event_update_scale(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process); -int perf_event__synthesize_event_update_name(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process); -int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process); int perf_event__process_attr(struct perf_tool *tool, union perf_event *event, struct evlist **pevlist); int perf_event__process_event_update(struct perf_tool *tool, union perf_event *event, struct evlist **pevlist); size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp); - -int perf_event__synthesize_tracing_data(struct perf_tool *tool, - int fd, struct evlist *evlist, - perf_event__handler_t process); int perf_event__process_tracing_data(struct perf_session *session, union perf_event *event); - -int perf_event__synthesize_build_id(struct perf_tool *tool, - struct dso *pos, u16 misc, - perf_event__handler_t process, - struct machine *machine); int perf_event__process_build_id(struct perf_session *session, union perf_event *event); bool is_perf_magic(u64 magic); From b251892d6ceafa3c8f8e6835a664e248766b1b3e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Sep 2019 17:17:33 +0100 Subject: [PATCH 0665/2880] perf stat: Move perf_stat_synthesize_config() to event.h Together with the other synthsizers, and rename it to perf_event__synthesize_stat_events(). This allows us to stop including event.h in util/stat.h. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-q5ebhrp44txboobs86htu5r9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++-- tools/perf/util/event.h | 5 +++++ tools/perf/util/stat.c | 10 +++++----- tools/perf/util/stat.h | 8 ++------ 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5bc0c570b7b6..b55e8060810b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -540,8 +540,8 @@ try_again: if (err < 0) return err; - err = perf_stat_synthesize_config(&stat_config, NULL, evsel_list, - process_synthesized_event, is_pipe); + err = perf_event__synthesize_stat_events(&stat_config, NULL, evsel_list, + process_synthesized_event, is_pipe); if (err < 0) return err; } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 4e6d33c76d57..89a2404170a0 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -293,6 +293,11 @@ typedef int (*perf_event__handler_t)(struct perf_tool *tool, struct perf_sample *sample, struct machine *machine); +int perf_event__synthesize_stat_events(struct perf_stat_config *config, + struct perf_tool *tool, + struct evlist *evlist, + perf_event__handler_t process, + bool attrs); int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process); diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index d309c1cc13db..2e318d95c528 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -495,11 +495,11 @@ int create_perf_stat_counter(struct evsel *evsel, return perf_evsel__open_per_thread(evsel, evsel->core.threads); } -int perf_stat_synthesize_config(struct perf_stat_config *config, - struct perf_tool *tool, - struct evlist *evlist, - perf_event__handler_t process, - bool attrs) +int perf_event__synthesize_stat_events(struct perf_stat_config *config, + struct perf_tool *tool, + struct evlist *evlist, + perf_event__handler_t process, + bool attrs) { int err; diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 14fe3e548229..0f9c9f6e2041 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -7,8 +7,9 @@ #include #include #include "rblist.h" -#include "event.h" +struct perf_cpu_map; +struct perf_stat_config; struct timespec; struct stats { @@ -210,11 +211,6 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp); int create_perf_stat_counter(struct evsel *evsel, struct perf_stat_config *config, struct target *target); -int perf_stat_synthesize_config(struct perf_stat_config *config, - struct perf_tool *tool, - struct evlist *evlist, - perf_event__handler_t process, - bool attrs); void perf_evlist__print_counters(struct evlist *evlist, struct perf_stat_config *config, From 9c9e754fb804828473c5131bb3e7df78bde396e6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Sep 2019 17:24:07 +0100 Subject: [PATCH 0666/2880] perf callchain: Remove needless event.h include All we need is a bunch of struct forward declarations and then add event.h to the only place that was getting it indirectly via callchain.h. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-qq2xhyuxcvx5vmxha9otjd8d@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/skip-callchain-idx.c | 1 + tools/perf/util/callchain.h | 5 ++++- tools/perf/util/evsel_fprintf.c | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c index fc9c2f5fcd52..3018a054526a 100644 --- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -13,6 +13,7 @@ #include "util/callchain.h" #include "util/debug.h" #include "util/dso.h" +#include "util/event.h" // struct ip_callchain #include "util/map.h" #include "util/symbol.h" diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index b042ceef4114..83398e5bbe4b 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -4,12 +4,15 @@ #include #include -#include "event.h" #include "map_symbol.h" #include "branch.h" +struct addr_location; struct evsel; +struct ip_callchain; struct map; +struct perf_sample; +struct thread; #define HELP_PAD "\t\t\t\t" diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index 496fec01f5d1..a8cbdf4c2753 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -4,6 +4,7 @@ #include #include #include "evsel.h" +#include "util/event.h" #include "callchain.h" #include "map.h" #include "strlist.h" From 5939cacc60d22161adba3d6c8b3fe88b76e0f6c0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 11 Sep 2019 12:54:33 +0100 Subject: [PATCH 0667/2880] perf python: Remove debug.h We only need to have the prototype for the eprintf() replacement we use in the python binding, provide it and avoid dragging debug.h as a dependency. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-s0gy4ur3drmhsknsddwjco59@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/python.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 96dd3333c911..0ba19dd75510 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -6,7 +6,6 @@ #include #include #include -#include "debug.h" #include "evlist.h" #include "callchain.h" #include "evsel.h" @@ -60,6 +59,8 @@ int parse_callchain_record(const char *arg __maybe_unused, */ int verbose; +int eprintf(int level, int var, const char *fmt, ...); + int eprintf(int level, int var, const char *fmt, ...) { va_list args; From 3793d4de06fac8dcd25bf91386cf88415dbe99ad Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 18 Sep 2019 10:09:54 -0300 Subject: [PATCH 0668/2880] perf hist: Add missing 'struct branch_stack' forward declaration Its needed, was being obtained indirectly, fix it. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-srzphk0ehptfn3zqmpkgsi65@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 34803e33dc80..6a186b668303 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -15,6 +15,7 @@ struct addr_location; struct map_symbol; struct mem_info; struct branch_info; +struct branch_stack; struct block_info; struct symbol; struct ui_progress; From 3f79132a47035dd4609cc5f47715331dfafaa1fa Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 18 Sep 2019 10:10:43 -0300 Subject: [PATCH 0669/2880] perf annotate: Add missing machine.h include directive We use what is defined there, were getting it by luck, indirectly, fix it. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-56g4jshmktniundmiw7h845k@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 4e4d2e76232e..553c651fa0a2 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -27,6 +27,7 @@ #include "util/sort.h" #include "util/hist.h" #include "util/dso.h" +#include "util/machine.h" #include "util/map.h" #include "util/session.h" #include "util/tool.h" From f12be047d981bb802d8cf78eb220db3ee97f0513 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 18 Sep 2019 10:11:20 -0300 Subject: [PATCH 0670/2880] perf sched: Add missing event.h include directive We use what is defined there, were getting it by luck, indirectly, fix it. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-e1cdt9557ctpvs3jb9c16qe6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 511e19a7aafa..f0b828c201cc 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -24,6 +24,7 @@ #include "util/trace-event.h" #include "util/debug.h" +#include "util/event.h" #include #include From bd23ac11fe9312bab40e129b402757fd7a23dc8e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 18 Sep 2019 10:12:07 -0300 Subject: [PATCH 0671/2880] perf auxtrace: Add missing 'struct perf_sample' forward declaration Its needed, was being obtained indirectly, fix it. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-c3k1il7sm28old4e22nwlm7l@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 37e70dc01436..1ed902a24a39 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -24,6 +24,7 @@ struct perf_session; struct evlist; struct perf_tool; struct perf_mmap; +struct perf_sample; struct option; struct record_opts; struct perf_record_auxtrace_info; From ea49e01cfabd73c94a61649cd04fa524a2beff3c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 18 Sep 2019 11:36:13 -0300 Subject: [PATCH 0672/2880] perf tools: Move event synthesizing routines to separate header Those are the only routines using the perf_event__handler_t typedef and are all related, so move to a separate header to reduce the header dependency tree, lots of places were getting event.h and even stdio.h, limits.h indirectly, so fix those as well. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-yvx9u1mf7baq6cu1abfhbqgs@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/archinsn.c | 1 + tools/perf/arch/x86/util/event.c | 2 + tools/perf/arch/x86/util/machine.c | 1 + tools/perf/arch/x86/util/tsc.c | 2 + tools/perf/builtin-inject.c | 1 + tools/perf/builtin-kvm.c | 1 + tools/perf/builtin-record.c | 1 + tools/perf/builtin-stat.c | 1 + tools/perf/builtin-top.c | 1 + tools/perf/builtin-trace.c | 1 + tools/perf/tests/code-reading.c | 1 + tools/perf/tests/cpumap.c | 1 + tools/perf/tests/dwarf-unwind.c | 1 + tools/perf/tests/event_update.c | 1 + tools/perf/tests/hists_common.c | 2 + tools/perf/tests/mmap-thread-lookup.c | 2 + tools/perf/tests/sample-parsing.c | 1 + tools/perf/tests/stat.c | 1 + tools/perf/tests/thread-map.c | 1 + tools/perf/ui/stdio/hist.c | 1 + tools/perf/util/auxtrace.c | 1 + tools/perf/util/auxtrace.h | 17 +--- tools/perf/util/bpf-event.c | 1 + tools/perf/util/bpf-event.h | 15 +--- tools/perf/util/callchain.c | 1 + tools/perf/util/event.c | 1 + tools/perf/util/event.h | 118 +------------------------- tools/perf/util/evsel.c | 1 + tools/perf/util/header.c | 1 + tools/perf/util/intel-bts.c | 1 + tools/perf/util/intel-pt.c | 1 + tools/perf/util/machine.c | 10 +++ tools/perf/util/machine.h | 15 ---- tools/perf/util/session.c | 1 + tools/perf/util/session.h | 5 -- tools/perf/util/stat.c | 1 + tools/perf/util/synthetic-events.h | 103 ++++++++++++++++++++++ tools/perf/util/tsc.h | 14 +-- 38 files changed, 154 insertions(+), 177 deletions(-) create mode 100644 tools/perf/util/synthetic-events.h diff --git a/tools/perf/arch/x86/util/archinsn.c b/tools/perf/arch/x86/util/archinsn.c index 9876c7a7ed7c..3e6791531ca5 100644 --- a/tools/perf/arch/x86/util/archinsn.c +++ b/tools/perf/arch/x86/util/archinsn.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "../../../../arch/x86/include/asm/insn.h" #include "archinsn.h" +#include "event.h" #include "machine.h" #include "thread.h" #include "symbol.h" diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c index a3a0b6884779..d357c625c09f 100644 --- a/tools/perf/arch/x86/util/event.c +++ b/tools/perf/arch/x86/util/event.c @@ -3,6 +3,8 @@ #include #include +#include "../../util/event.h" +#include "../../util/synthetic-events.h" #include "../../util/machine.h" #include "../../util/tool.h" #include "../../util/map.h" diff --git a/tools/perf/arch/x86/util/machine.c b/tools/perf/arch/x86/util/machine.c index 42418040bc07..f0c289862f9f 100644 --- a/tools/perf/arch/x86/util/machine.c +++ b/tools/perf/arch/x86/util/machine.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include "../../util/util.h" // page_size diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c index c5197a15119b..2f55afb14e1f 100644 --- a/tools/perf/arch/x86/util/tsc.c +++ b/tools/perf/arch/x86/util/tsc.c @@ -8,6 +8,8 @@ #include #include #include "../../../util/debug.h" +#include "../../../util/event.h" +#include "../../../util/synthetic-events.h" #include "../../../util/tsc.h" int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index c14f40b858bc..23a76cf3846f 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -21,6 +21,7 @@ #include "util/auxtrace.h" #include "util/jit.h" #include "util/symbol.h" +#include "util/synthetic-events.h" #include "util/thread.h" #include diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 0a4fcbe32bf6..ac6d6e061dc5 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -17,6 +17,7 @@ #include "util/debug.h" #include "util/tool.h" #include "util/stat.h" +#include "util/synthetic-events.h" #include "util/top.h" #include "util/data.h" #include "util/ordered-events.h" diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1447004eee8a..907d4d4677a3 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -38,6 +38,7 @@ #include "util/trigger.h" #include "util/perf-hooks.h" #include "util/cpu-set-sched.h" +#include "util/synthetic-events.h" #include "util/time-utils.h" #include "util/units.h" #include "util/bpf-event.h" diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index b55e8060810b..eece3d1e429a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -61,6 +61,7 @@ #include "util/tool.h" #include "util/string2.h" #include "util/metricgroup.h" +#include "util/synthetic-events.h" #include "util/target.h" #include "util/time-utils.h" #include "util/top.h" diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 726e3f2dd8c7..b052470f89b4 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -32,6 +32,7 @@ #include "util/map.h" #include "util/session.h" #include "util/symbol.h" +#include "util/synthetic-events.h" #include "util/top.h" #include "util/util.h" #include diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 0f633f0d6be8..f0f735093e21 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -28,6 +28,7 @@ #include "util/dso.h" #include "util/env.h" #include "util/event.h" +#include "util/synthetic-events.h" #include "util/evlist.h" #include "util/evswitch.h" #include diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 8d9020c46ca9..fd02c1f1d976 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -24,6 +24,7 @@ #include "symbol.h" #include "event.h" #include "record.h" +#include "util/synthetic-events.h" #include "thread.h" #include "tests.h" diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index 39493de50117..8a0d236202b0 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -3,6 +3,7 @@ #include #include "cpumap.h" #include "event.h" +#include "util/synthetic-events.h" #include #include #include diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index 4125255ff637..4f4ecbcbe87e 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -15,6 +15,7 @@ #include "symbol.h" #include "thread.h" #include "callchain.h" +#include "util/synthetic-events.h" #if defined (__x86_64__) || defined (__i386__) || defined (__powerpc__) #include "arch-tests.h" diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 317eb8c5ccd4..4bb772e2b73d 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -7,6 +7,7 @@ #include "evsel.h" #include "header.h" #include "machine.h" +#include "util/synthetic-events.h" #include "tool.h" #include "tests.h" #include "debug.h" diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c index de110d8f169b..6f34d08b84e5 100644 --- a/tools/perf/tests/hists_common.c +++ b/tools/perf/tests/hists_common.c @@ -2,6 +2,7 @@ #include #include "util/debug.h" #include "util/dso.h" +#include "util/event.h" // struct perf_sample #include "util/map.h" #include "util/symbol.h" #include "util/sort.h" @@ -10,6 +11,7 @@ #include "util/thread.h" #include "tests/hists_common.h" #include +#include static struct { u32 pid; diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c index f72889c13538..33b496d194f4 100644 --- a/tools/perf/tests/mmap-thread-lookup.c +++ b/tools/perf/tests/mmap-thread-lookup.c @@ -8,11 +8,13 @@ #include #include #include "debug.h" +#include "event.h" #include "tests.h" #include "machine.h" #include "thread_map.h" #include "map.h" #include "symbol.h" +#include "util/synthetic-events.h" #include "thread.h" #include "util.h" // page_size diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index e3b965e7a233..3a02426db9a6 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -12,6 +12,7 @@ #include "event.h" #include "evsel.h" #include "debug.h" +#include "util/synthetic-events.h" #include "tests.h" diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c index cc10b4116c9f..c1911501c39c 100644 --- a/tools/perf/tests/stat.c +++ b/tools/perf/tests/stat.c @@ -5,6 +5,7 @@ #include "stat.h" #include "counts.h" #include "debug.h" +#include "util/synthetic-events.h" static bool has_term(struct perf_record_stat_config *config, u64 tag, u64 val) diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index 39168c57943b..28f51c4bd373 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -8,6 +8,7 @@ #include "thread_map.h" #include "debug.h" #include "event.h" +#include "util/synthetic-events.h" #include #include diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 832ca6cfbe30..5365606e9dad 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -5,6 +5,7 @@ #include "../../util/callchain.h" #include "../../util/debug.h" +#include "../../util/event.h" #include "../../util/hist.h" #include "../../util/map.h" #include "../../util/map_groups.h" diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 1c0ff5acff83..0e8c89cf7cad 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -32,6 +32,7 @@ #include "pmu.h" #include "evsel.h" #include "symbol.h" +#include "util/synthetic-events.h" #include "thread_map.h" #include "asm/bug.h" #include "auxtrace.h" diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 1ed902a24a39..b110aec1da4d 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -11,14 +11,13 @@ #include #include #include +#include // FILE #include #include #include #include #include -#include "event.h" - union perf_event; struct perf_session; struct evlist; @@ -27,6 +26,7 @@ struct perf_mmap; struct perf_sample; struct option; struct record_opts; +struct perf_record_auxtrace_error; struct perf_record_auxtrace_info; struct events_stats; @@ -525,10 +525,6 @@ void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int int code, int cpu, pid_t pid, pid_t tid, u64 ip, const char *msg, u64 timestamp); -int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, - struct perf_tool *tool, - struct perf_session *session, - perf_event__handler_t process); int perf_event__process_auxtrace_info(struct perf_session *session, union perf_event *event); s64 perf_event__process_auxtrace(struct perf_session *session, @@ -605,15 +601,6 @@ void auxtrace_record__free(struct auxtrace_record *itr __maybe_unused) { } -static inline int -perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr __maybe_unused, - struct perf_tool *tool __maybe_unused, - struct perf_session *session __maybe_unused, - perf_event__handler_t process __maybe_unused) -{ - return -EINVAL; -} - static inline int auxtrace_record__options(struct auxtrace_record *itr __maybe_unused, struct evlist *evlist __maybe_unused, diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 7a3d4b125323..f7ed5d122e22 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -16,6 +16,7 @@ #include "map.h" #include "evlist.h" #include "record.h" +#include "util/synthetic-events.h" #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index a01c2fd68c03..81fdc88e6c1a 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -6,9 +6,9 @@ #include #include #include -#include "event.h" #include +struct bpf_prog_info; struct machine; union perf_event; struct perf_env; @@ -33,11 +33,6 @@ struct btf_node { #ifdef HAVE_LIBBPF_SUPPORT int machine__process_bpf(struct machine *machine, union perf_event *event, struct perf_sample *sample); - -int perf_event__synthesize_bpf_events(struct perf_session *session, - perf_event__handler_t process, - struct machine *machine, - struct record_opts *opts); int bpf_event__add_sb_event(struct evlist **evlist, struct perf_env *env); void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, @@ -51,14 +46,6 @@ static inline int machine__process_bpf(struct machine *machine __maybe_unused, return 0; } -static inline int perf_event__synthesize_bpf_events(struct perf_session *session __maybe_unused, - perf_event__handler_t process __maybe_unused, - struct machine *machine __maybe_unused, - struct record_opts *opts __maybe_unused) -{ - return 0; -} - static inline int bpf_event__add_sb_event(struct evlist **evlist __maybe_unused, struct perf_env *env __maybe_unused) { diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index c14646c1f2eb..9a9b56ed3f0a 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -23,6 +23,7 @@ #include "debug.h" #include "dso.h" +#include "event.h" #include "hist.h" #include "sort.h" #include "machine.h" diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index d65ae7cf9316..043a08fc7398 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -32,6 +32,7 @@ #include "stat.h" #include "session.h" #include "bpf-event.h" +#include "synthetic-events.h" #include "tool.h" #include "../perf.h" diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 89a2404170a0..a0a0c91cde4a 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -279,95 +279,13 @@ enum { void perf_event__print_totals(void); -struct evlist; -struct evsel; -struct perf_session; -struct perf_tool; -struct perf_thread_map; struct perf_cpu_map; +struct perf_record_stat_config; struct perf_stat_config; -struct perf_counts_values; +struct perf_tool; -typedef int (*perf_event__handler_t)(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine); - -int perf_event__synthesize_stat_events(struct perf_stat_config *config, - struct perf_tool *tool, - struct evlist *evlist, - perf_event__handler_t process, - bool attrs); -int perf_event__synthesize_attr(struct perf_tool *tool, - struct perf_event_attr *attr, u32 ids, u64 *id, - perf_event__handler_t process); -int perf_event__synthesize_attrs(struct perf_tool *tool, - struct evlist *evlist, - perf_event__handler_t process); -int perf_event__synthesize_build_id(struct perf_tool *tool, - struct dso *pos, u16 misc, - perf_event__handler_t process, - struct machine *machine); -int perf_event__synthesize_extra_attr(struct perf_tool *tool, - struct evlist *evsel_list, - perf_event__handler_t process, - bool is_pipe); -int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process); -int perf_event__synthesize_event_update_name(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process); -int perf_event__synthesize_event_update_scale(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process); -int perf_event__synthesize_event_update_unit(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process); -int perf_event__synthesize_features(struct perf_tool *tool, - struct perf_session *session, - struct evlist *evlist, - perf_event__handler_t process); -int perf_event__synthesize_tracing_data(struct perf_tool *tool, - int fd, struct evlist *evlist, - perf_event__handler_t process); -int perf_event__synthesize_thread_map(struct perf_tool *tool, - struct perf_thread_map *threads, - perf_event__handler_t process, - struct machine *machine, bool mmap_data); -int perf_event__synthesize_thread_map2(struct perf_tool *tool, - struct perf_thread_map *threads, - perf_event__handler_t process, - struct machine *machine); -int perf_event__synthesize_cpu_map(struct perf_tool *tool, - struct perf_cpu_map *cpus, - perf_event__handler_t process, - struct machine *machine); -int perf_event__synthesize_threads(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine, bool mmap_data, - unsigned int nr_threads_synthesize); -int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine); -int perf_event__synthesize_stat_config(struct perf_tool *tool, - struct perf_stat_config *config, - perf_event__handler_t process, - struct machine *machine); void perf_event__read_stat_config(struct perf_stat_config *config, struct perf_record_stat_config *event); -int perf_event__synthesize_stat(struct perf_tool *tool, - u32 cpu, u32 thread, u64 id, - struct perf_counts_values *count, - perf_event__handler_t process, - struct machine *machine); -int perf_event__synthesize_stat_round(struct perf_tool *tool, - u64 time, u64 type, - perf_event__handler_t process, - struct machine *machine); -int perf_event__synthesize_modules(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine); int perf_event__process_comm(struct perf_tool *tool, union perf_event *event, @@ -421,10 +339,6 @@ int perf_event__process_bpf(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_tool__process_synth_event(struct perf_tool *tool, - union perf_event *event, - struct machine *machine, - perf_event__handler_t process); int perf_event__process(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -446,34 +360,6 @@ void thread__resolve(struct thread *thread, struct addr_location *al, const char *perf_event__name(unsigned int id); -size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, - u64 read_format); -int perf_event__synthesize_sample(union perf_event *event, u64 type, - u64 read_format, - const struct perf_sample *sample); - -pid_t perf_event__synthesize_comm(struct perf_tool *tool, - union perf_event *event, pid_t pid, - perf_event__handler_t process, - struct machine *machine); - -int perf_event__synthesize_namespaces(struct perf_tool *tool, - union perf_event *event, - pid_t pid, pid_t tgid, - perf_event__handler_t process, - struct machine *machine); - -int perf_event__synthesize_mmap_events(struct perf_tool *tool, - union perf_event *event, - pid_t pid, pid_t tgid, - perf_event__handler_t process, - struct machine *machine, - bool mmap_data); - -int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine); - size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp); size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp); size_t perf_event__fprintf_mmap2(union perf_event *event, FILE *fp); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 5b40b840624c..5af025c80ec5 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -40,6 +40,7 @@ #include "trace-event.h" #include "stat.h" #include "string2.h" +#include "util/synthetic-events.h" #include "memswap.h" #include "util.h" #include "../perf-sys.h" diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d85827de1b60..a4a8342eba98 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -45,6 +45,7 @@ #include "util.h" // page_size, perf_exe() #include "cputopo.h" #include "bpf-event.h" +#include "util/synthetic-events.h" #include #include diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 15f87a09f4fe..3888d4cd3ed1 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -28,6 +28,7 @@ #include "auxtrace.h" #include "intel-pt-decoder/intel-pt-insn-decoder.h" #include "intel-bts.h" +#include "util/synthetic-events.h" #define MAX_TIMESTAMP (~0ULL) diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 9b56fb74bedf..bcdc0359f7cf 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -33,6 +33,7 @@ #include "tsc.h" #include "intel-pt.h" #include "config.h" +#include "util/synthetic-events.h" #include "time-utils.h" #include "../arch/x86/include/uapi/asm/perf_regs.h" diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index b4749d3eed08..132de5cfb9b9 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -20,6 +20,7 @@ #include "symbol.h" #include "sort.h" #include "strlist.h" +#include "util/synthetic-events.h" #include "target.h" #include "thread.h" #include "util.h" @@ -2624,6 +2625,15 @@ int __machine__synthesize_threads(struct machine *machine, struct perf_tool *too return 0; } +int machine__synthesize_threads(struct machine *machine, struct target *target, + struct perf_thread_map *threads, bool data_mmap, + unsigned int nr_threads_synthesize) +{ + return __machine__synthesize_threads(machine, NULL, target, threads, + perf_event__process, data_mmap, + nr_threads_synthesize); +} + pid_t machine__get_current_tid(struct machine *machine, int cpu) { int nr_cpus = min(machine->env->nr_cpus_online, MAX_NR_CPUS); diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index ffd391a925a6..18e13c0ccd6a 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -6,7 +6,6 @@ #include #include "map_groups.h" #include "dsos.h" -#include "event.h" #include "rwsem.h" struct addr_location; @@ -252,20 +251,6 @@ int machines__for_each_thread(struct machines *machines, int (*fn)(struct thread *thread, void *p), void *priv); -int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, - struct target *target, struct perf_thread_map *threads, - perf_event__handler_t process, bool data_mmap, - unsigned int nr_threads_synthesize); -static inline -int machine__synthesize_threads(struct machine *machine, struct target *target, - struct perf_thread_map *threads, bool data_mmap, - unsigned int nr_threads_synthesize) -{ - return __machine__synthesize_threads(machine, NULL, target, threads, - perf_event__process, data_mmap, - nr_threads_synthesize); -} - pid_t machine__get_current_tid(struct machine *machine, int cpu); int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid, pid_t tid); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 2b583e6adb49..6267613b551d 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -29,6 +29,7 @@ #include "thread-stack.h" #include "sample-raw.h" #include "stat.h" +#include "util/synthetic-events.h" #include "util.h" #include "ui/progress.h" #include "../perf.h" diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index b7aa076ab6fd..b4c9428c18f0 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -138,9 +138,4 @@ int perf_session__deliver_synth_event(struct perf_session *session, int perf_event__process_id_index(struct perf_session *session, union perf_event *event); -int perf_event__synthesize_id_index(struct perf_tool *tool, - perf_event__handler_t process, - struct evlist *evlist, - struct machine *machine); - #endif /* __PERF_SESSION_H */ diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 2e318d95c528..46c8a5027e12 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -12,6 +12,7 @@ #include "target.h" #include "evlist.h" #include "evsel.h" +#include "util/synthetic-events.h" #include "thread_map.h" #include diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h new file mode 100644 index 000000000000..baead0cdc381 --- /dev/null +++ b/tools/perf/util/synthetic-events.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_SYNTHETIC_EVENTS_H +#define __PERF_SYNTHETIC_EVENTS_H + +#include +#include // pid_t +#include +#include + +struct auxtrace_record; +struct dso; +struct evlist; +struct evsel; +struct machine; +struct perf_counts_values; +struct perf_cpu_map; +struct perf_event_attr; +struct perf_event_mmap_page; +struct perf_sample; +struct perf_session; +struct perf_stat_config; +struct perf_thread_map; +struct perf_tool; +struct record_opts; +struct target; + +union perf_event; + +typedef int (*perf_event__handler_t)(struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct machine *machine); + +int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process); +int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process); +int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_cpu_map(struct perf_tool *tool, struct perf_cpu_map *cpus, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evsel_list, perf_event__handler_t process, bool is_pipe); +int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process); +int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine); +int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_mmap_events(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine, bool mmap_data); +int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_namespaces(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_format, const struct perf_sample *sample); +int perf_event__synthesize_stat_config(struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process, bool attrs); +int perf_event__synthesize_stat_round(struct perf_tool *tool, u64 time, u64 type, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_stat(struct perf_tool *tool, u32 cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_thread_map2(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_thread_map(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, bool mmap_data); +int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data, unsigned int nr_threads_synthesize); +int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process); +int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +pid_t perf_event__synthesize_comm(struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine); + +int perf_tool__process_synth_event(struct perf_tool *tool, union perf_event *event, struct machine *machine, perf_event__handler_t process); + +size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, u64 read_format); + +int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, + struct target *target, struct perf_thread_map *threads, + perf_event__handler_t process, bool data_mmap, + unsigned int nr_threads_synthesize); +int machine__synthesize_threads(struct machine *machine, struct target *target, + struct perf_thread_map *threads, bool data_mmap, + unsigned int nr_threads_synthesize); + +#ifdef HAVE_AUXTRACE_SUPPORT +int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct perf_tool *tool, + struct perf_session *session, perf_event__handler_t process); + +#else // HAVE_AUXTRACE_SUPPORT + +#include + +static inline int +perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr __maybe_unused, + struct perf_tool *tool __maybe_unused, + struct perf_session *session __maybe_unused, + perf_event__handler_t process __maybe_unused) +{ + return -EINVAL; +} +#endif // HAVE_AUXTRACE_SUPPORT + +#ifdef HAVE_LIBBPF_SUPPORT +int perf_event__synthesize_bpf_events(struct perf_session *session, perf_event__handler_t process, + struct machine *machine, struct record_opts *opts); +#else // HAVE_LIBBPF_SUPPORT +static inline int perf_event__synthesize_bpf_events(struct perf_session *session __maybe_unused, + perf_event__handler_t process __maybe_unused, + struct machine *machine __maybe_unused, + struct record_opts *opts __maybe_unused) +{ + return 0; +} +#endif // HAVE_LIBBPF_SUPPORT + +#endif // __PERF_SYNTHETIC_EVENTS_H diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h index e0c3af34ac8d..3c5a632ee57c 100644 --- a/tools/perf/util/tsc.h +++ b/tools/perf/util/tsc.h @@ -4,13 +4,12 @@ #include -#include "event.h" - struct perf_tsc_conversion { u16 time_shift; u32 time_mult; u64 time_zero; }; + struct perf_event_mmap_page; int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, @@ -20,13 +19,4 @@ u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc); u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc); u64 rdtsc(void); -struct perf_event_mmap_page; -struct perf_tool; -struct machine; - -int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, - struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine); - -#endif +#endif // __PERF_TSC_H From 5cac8ea3e6e736664ee272f94d9099891e25f782 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 18 Sep 2019 12:28:41 -0300 Subject: [PATCH 0673/2880] perf memswap: Adopt 'struct u64_swap' from evsel.h As it is not used in evsel.h and is a memory swap struct, so fits better in memswap.h. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-wvzxu7a5l3m868ywwphrnnqo@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.h | 5 ----- tools/perf/util/memswap.h | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 68321d10eb2d..74df298acb31 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -179,11 +179,6 @@ struct evsel { } side_band; }; -union u64_swap { - u64 val64; - u32 val32[2]; -}; - struct perf_missing_features { bool sample_id_all; bool exclude_guest; diff --git a/tools/perf/util/memswap.h b/tools/perf/util/memswap.h index 1e29ff903ca9..2c38e8c2d548 100644 --- a/tools/perf/util/memswap.h +++ b/tools/perf/util/memswap.h @@ -2,6 +2,13 @@ #ifndef PERF_MEMSWAP_H_ #define PERF_MEMSWAP_H_ +#include + +union u64_swap { + u64 val64; + u32 val32[2]; +}; + void mem_bswap_64(void *src, int byte_size); void mem_bswap_32(void *src, int byte_size); From 055c67ed39887c5563e9540470a4617c1b772aec Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 18 Sep 2019 16:08:52 -0300 Subject: [PATCH 0674/2880] perf tools: Move event synthesizing routines to separate .c file For better grouping, in time we may end up making most of these static, i.e. generalizing the 'perf record' synthesizing code so that based on the target it can do the right thing and call the needed synthesizers. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-s9zxxhk40s95pjng9panet16@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 9 - tools/perf/util/Build | 1 + tools/perf/util/cs-etm.c | 1 + tools/perf/util/event.c | 1108 +--------------- tools/perf/util/evsel.c | 278 ---- tools/perf/util/header.c | 393 +----- tools/perf/util/header.h | 18 + tools/perf/util/machine.c | 25 - tools/perf/util/namespaces.c | 18 + tools/perf/util/namespaces.h | 2 + tools/perf/util/session.c | 71 -- tools/perf/util/stat.c | 43 - tools/perf/util/synthetic-events.c | 1884 ++++++++++++++++++++++++++++ 13 files changed, 1928 insertions(+), 1923 deletions(-) create mode 100644 tools/perf/util/synthetic-events.c diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 907d4d4677a3..4bd11c918e73 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1181,15 +1181,6 @@ static void workload_exec_failed_signal(int signo __maybe_unused, static void snapshot_sig_handler(int sig); static void alarm_sig_handler(int sig); -int __weak -perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused, - struct perf_tool *tool __maybe_unused, - perf_event__handler_t process __maybe_unused, - struct machine *machine __maybe_unused) -{ - return 0; -} - static const struct perf_event_mmap_page * perf_evlist__pick_pc(struct evlist *evlist) { diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 0b4d8e0d474c..fd89d6a8cd65 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -86,6 +86,7 @@ perf-y += stat-display.o perf-y += record.o perf-y += srcline.o perf-y += srccode.o +perf-y += synthetic-events.o perf-y += data.o perf-y += tsc.o perf-y += cloexec.o diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index f87b9c1c9f9a..6021974577d5 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -35,6 +35,7 @@ #include "thread.h" #include "thread-stack.h" #include +#include "util/synthetic-events.h" #define MAX_TIMESTAMP (~0ULL) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 043a08fc7398..fc1e5a991008 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -1,4 +1,3 @@ -#include #include #include #include @@ -9,7 +8,6 @@ #include #include #include /* To get things like MAP_HUGETLB even on older libc headers */ -#include #include #include #include "cpumap.h" @@ -26,18 +24,16 @@ #include "time-utils.h" #include #include "map.h" +#include "util/namespaces.h" #include "symbol.h" #include "symbol/kallsyms.h" #include "asm/bug.h" #include "stat.h" #include "session.h" #include "bpf-event.h" -#include "synthetic-events.h" #include "tool.h" #include "../perf.h" -#define DEFAULT_PROC_MAP_PARSE_TIMEOUT 500 - static const char *perf_event__names[] = { [0] = "TOTAL", [PERF_RECORD_MMAP] = "MMAP", @@ -78,18 +74,6 @@ static const char *perf_event__names[] = { [PERF_RECORD_COMPRESSED] = "COMPRESSED", }; -static const char *perf_ns__names[] = { - [NET_NS_INDEX] = "net", - [UTS_NS_INDEX] = "uts", - [IPC_NS_INDEX] = "ipc", - [PID_NS_INDEX] = "pid", - [USER_NS_INDEX] = "user", - [MNT_NS_INDEX] = "mnt", - [CGROUP_NS_INDEX] = "cgroup", -}; - -unsigned int proc_map_timeout = DEFAULT_PROC_MAP_PARSE_TIMEOUT; - const char *perf_event__name(unsigned int id) { if (id >= ARRAY_SIZE(perf_event__names)) @@ -99,775 +83,6 @@ const char *perf_event__name(unsigned int id) return perf_event__names[id]; } -static const char *perf_ns__name(unsigned int id) -{ - if (id >= ARRAY_SIZE(perf_ns__names)) - return "UNKNOWN"; - return perf_ns__names[id]; -} - -int perf_tool__process_synth_event(struct perf_tool *tool, - union perf_event *event, - struct machine *machine, - perf_event__handler_t process) -{ - struct perf_sample synth_sample = { - .pid = -1, - .tid = -1, - .time = -1, - .stream_id = -1, - .cpu = -1, - .period = 1, - .cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK, - }; - - return process(tool, event, &synth_sample, machine); -}; - -/* - * Assumes that the first 4095 bytes of /proc/pid/stat contains - * the comm, tgid and ppid. - */ -static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len, - pid_t *tgid, pid_t *ppid) -{ - char filename[PATH_MAX]; - char bf[4096]; - int fd; - size_t size = 0; - ssize_t n; - char *name, *tgids, *ppids; - - *tgid = -1; - *ppid = -1; - - snprintf(filename, sizeof(filename), "/proc/%d/status", pid); - - fd = open(filename, O_RDONLY); - if (fd < 0) { - pr_debug("couldn't open %s\n", filename); - return -1; - } - - n = read(fd, bf, sizeof(bf) - 1); - close(fd); - if (n <= 0) { - pr_warning("Couldn't get COMM, tigd and ppid for pid %d\n", - pid); - return -1; - } - bf[n] = '\0'; - - name = strstr(bf, "Name:"); - tgids = strstr(bf, "Tgid:"); - ppids = strstr(bf, "PPid:"); - - if (name) { - char *nl; - - name = skip_spaces(name + 5); /* strlen("Name:") */ - nl = strchr(name, '\n'); - if (nl) - *nl = '\0'; - - size = strlen(name); - if (size >= len) - size = len - 1; - memcpy(comm, name, size); - comm[size] = '\0'; - } else { - pr_debug("Name: string not found for pid %d\n", pid); - } - - if (tgids) { - tgids += 5; /* strlen("Tgid:") */ - *tgid = atoi(tgids); - } else { - pr_debug("Tgid: string not found for pid %d\n", pid); - } - - if (ppids) { - ppids += 5; /* strlen("PPid:") */ - *ppid = atoi(ppids); - } else { - pr_debug("PPid: string not found for pid %d\n", pid); - } - - return 0; -} - -static int perf_event__prepare_comm(union perf_event *event, pid_t pid, - struct machine *machine, - pid_t *tgid, pid_t *ppid) -{ - size_t size; - - *ppid = -1; - - memset(&event->comm, 0, sizeof(event->comm)); - - if (machine__is_host(machine)) { - if (perf_event__get_comm_ids(pid, event->comm.comm, - sizeof(event->comm.comm), - tgid, ppid) != 0) { - return -1; - } - } else { - *tgid = machine->pid; - } - - if (*tgid < 0) - return -1; - - event->comm.pid = *tgid; - event->comm.header.type = PERF_RECORD_COMM; - - size = strlen(event->comm.comm) + 1; - size = PERF_ALIGN(size, sizeof(u64)); - memset(event->comm.comm + size, 0, machine->id_hdr_size); - event->comm.header.size = (sizeof(event->comm) - - (sizeof(event->comm.comm) - size) + - machine->id_hdr_size); - event->comm.tid = pid; - - return 0; -} - -pid_t perf_event__synthesize_comm(struct perf_tool *tool, - union perf_event *event, pid_t pid, - perf_event__handler_t process, - struct machine *machine) -{ - pid_t tgid, ppid; - - if (perf_event__prepare_comm(event, pid, machine, &tgid, &ppid) != 0) - return -1; - - if (perf_tool__process_synth_event(tool, event, machine, process) != 0) - return -1; - - return tgid; -} - -static void perf_event__get_ns_link_info(pid_t pid, const char *ns, - struct perf_ns_link_info *ns_link_info) -{ - struct stat64 st; - char proc_ns[128]; - - sprintf(proc_ns, "/proc/%u/ns/%s", pid, ns); - if (stat64(proc_ns, &st) == 0) { - ns_link_info->dev = st.st_dev; - ns_link_info->ino = st.st_ino; - } -} - -int perf_event__synthesize_namespaces(struct perf_tool *tool, - union perf_event *event, - pid_t pid, pid_t tgid, - perf_event__handler_t process, - struct machine *machine) -{ - u32 idx; - struct perf_ns_link_info *ns_link_info; - - if (!tool || !tool->namespace_events) - return 0; - - memset(&event->namespaces, 0, (sizeof(event->namespaces) + - (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + - machine->id_hdr_size)); - - event->namespaces.pid = tgid; - event->namespaces.tid = pid; - - event->namespaces.nr_namespaces = NR_NAMESPACES; - - ns_link_info = event->namespaces.link_info; - - for (idx = 0; idx < event->namespaces.nr_namespaces; idx++) - perf_event__get_ns_link_info(pid, perf_ns__name(idx), - &ns_link_info[idx]); - - event->namespaces.header.type = PERF_RECORD_NAMESPACES; - - event->namespaces.header.size = (sizeof(event->namespaces) + - (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + - machine->id_hdr_size); - - if (perf_tool__process_synth_event(tool, event, machine, process) != 0) - return -1; - - return 0; -} - -static int perf_event__synthesize_fork(struct perf_tool *tool, - union perf_event *event, - pid_t pid, pid_t tgid, pid_t ppid, - perf_event__handler_t process, - struct machine *machine) -{ - memset(&event->fork, 0, sizeof(event->fork) + machine->id_hdr_size); - - /* - * for main thread set parent to ppid from status file. For other - * threads set parent pid to main thread. ie., assume main thread - * spawns all threads in a process - */ - if (tgid == pid) { - event->fork.ppid = ppid; - event->fork.ptid = ppid; - } else { - event->fork.ppid = tgid; - event->fork.ptid = tgid; - } - event->fork.pid = tgid; - event->fork.tid = pid; - event->fork.header.type = PERF_RECORD_FORK; - event->fork.header.misc = PERF_RECORD_MISC_FORK_EXEC; - - event->fork.header.size = (sizeof(event->fork) + machine->id_hdr_size); - - if (perf_tool__process_synth_event(tool, event, machine, process) != 0) - return -1; - - return 0; -} - -int perf_event__synthesize_mmap_events(struct perf_tool *tool, - union perf_event *event, - pid_t pid, pid_t tgid, - perf_event__handler_t process, - struct machine *machine, - bool mmap_data) -{ - char filename[PATH_MAX]; - FILE *fp; - unsigned long long t; - bool truncation = false; - unsigned long long timeout = proc_map_timeout * 1000000ULL; - int rc = 0; - const char *hugetlbfs_mnt = hugetlbfs__mountpoint(); - int hugetlbfs_mnt_len = hugetlbfs_mnt ? strlen(hugetlbfs_mnt) : 0; - - if (machine__is_default_guest(machine)) - return 0; - - snprintf(filename, sizeof(filename), "%s/proc/%d/task/%d/maps", - machine->root_dir, pid, pid); - - fp = fopen(filename, "r"); - if (fp == NULL) { - /* - * We raced with a task exiting - just return: - */ - pr_debug("couldn't open %s\n", filename); - return -1; - } - - event->header.type = PERF_RECORD_MMAP2; - t = rdclock(); - - while (1) { - char bf[BUFSIZ]; - char prot[5]; - char execname[PATH_MAX]; - char anonstr[] = "//anon"; - unsigned int ino; - size_t size; - ssize_t n; - - if (fgets(bf, sizeof(bf), fp) == NULL) - break; - - if ((rdclock() - t) > timeout) { - pr_warning("Reading %s time out. " - "You may want to increase " - "the time limit by --proc-map-timeout\n", - filename); - truncation = true; - goto out; - } - - /* ensure null termination since stack will be reused. */ - strcpy(execname, ""); - - /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ - n = sscanf(bf, "%"PRI_lx64"-%"PRI_lx64" %s %"PRI_lx64" %x:%x %u %[^\n]\n", - &event->mmap2.start, &event->mmap2.len, prot, - &event->mmap2.pgoff, &event->mmap2.maj, - &event->mmap2.min, - &ino, execname); - - /* - * Anon maps don't have the execname. - */ - if (n < 7) - continue; - - event->mmap2.ino = (u64)ino; - - /* - * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c - */ - if (machine__is_host(machine)) - event->header.misc = PERF_RECORD_MISC_USER; - else - event->header.misc = PERF_RECORD_MISC_GUEST_USER; - - /* map protection and flags bits */ - event->mmap2.prot = 0; - event->mmap2.flags = 0; - if (prot[0] == 'r') - event->mmap2.prot |= PROT_READ; - if (prot[1] == 'w') - event->mmap2.prot |= PROT_WRITE; - if (prot[2] == 'x') - event->mmap2.prot |= PROT_EXEC; - - if (prot[3] == 's') - event->mmap2.flags |= MAP_SHARED; - else - event->mmap2.flags |= MAP_PRIVATE; - - if (prot[2] != 'x') { - if (!mmap_data || prot[0] != 'r') - continue; - - event->header.misc |= PERF_RECORD_MISC_MMAP_DATA; - } - -out: - if (truncation) - event->header.misc |= PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT; - - if (!strcmp(execname, "")) - strcpy(execname, anonstr); - - if (hugetlbfs_mnt_len && - !strncmp(execname, hugetlbfs_mnt, hugetlbfs_mnt_len)) { - strcpy(execname, anonstr); - event->mmap2.flags |= MAP_HUGETLB; - } - - size = strlen(execname) + 1; - memcpy(event->mmap2.filename, execname, size); - size = PERF_ALIGN(size, sizeof(u64)); - event->mmap2.len -= event->mmap.start; - event->mmap2.header.size = (sizeof(event->mmap2) - - (sizeof(event->mmap2.filename) - size)); - memset(event->mmap2.filename + size, 0, machine->id_hdr_size); - event->mmap2.header.size += machine->id_hdr_size; - event->mmap2.pid = tgid; - event->mmap2.tid = pid; - - if (perf_tool__process_synth_event(tool, event, machine, process) != 0) { - rc = -1; - break; - } - - if (truncation) - break; - } - - fclose(fp); - return rc; -} - -int perf_event__synthesize_modules(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine) -{ - int rc = 0; - struct map *pos; - struct maps *maps = machine__kernel_maps(machine); - union perf_event *event = zalloc((sizeof(event->mmap) + - machine->id_hdr_size)); - if (event == NULL) { - pr_debug("Not enough memory synthesizing mmap event " - "for kernel modules\n"); - return -1; - } - - event->header.type = PERF_RECORD_MMAP; - - /* - * kernel uses 0 for user space maps, see kernel/perf_event.c - * __perf_event_mmap - */ - if (machine__is_host(machine)) - event->header.misc = PERF_RECORD_MISC_KERNEL; - else - event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; - - for (pos = maps__first(maps); pos; pos = map__next(pos)) { - size_t size; - - if (!__map__is_kmodule(pos)) - continue; - - size = PERF_ALIGN(pos->dso->long_name_len + 1, sizeof(u64)); - event->mmap.header.type = PERF_RECORD_MMAP; - event->mmap.header.size = (sizeof(event->mmap) - - (sizeof(event->mmap.filename) - size)); - memset(event->mmap.filename + size, 0, machine->id_hdr_size); - event->mmap.header.size += machine->id_hdr_size; - event->mmap.start = pos->start; - event->mmap.len = pos->end - pos->start; - event->mmap.pid = machine->pid; - - memcpy(event->mmap.filename, pos->dso->long_name, - pos->dso->long_name_len + 1); - if (perf_tool__process_synth_event(tool, event, machine, process) != 0) { - rc = -1; - break; - } - } - - free(event); - return rc; -} - -static int __event__synthesize_thread(union perf_event *comm_event, - union perf_event *mmap_event, - union perf_event *fork_event, - union perf_event *namespaces_event, - pid_t pid, int full, - perf_event__handler_t process, - struct perf_tool *tool, - struct machine *machine, - bool mmap_data) -{ - char filename[PATH_MAX]; - DIR *tasks; - struct dirent *dirent; - pid_t tgid, ppid; - int rc = 0; - - /* special case: only send one comm event using passed in pid */ - if (!full) { - tgid = perf_event__synthesize_comm(tool, comm_event, pid, - process, machine); - - if (tgid == -1) - return -1; - - if (perf_event__synthesize_namespaces(tool, namespaces_event, pid, - tgid, process, machine) < 0) - return -1; - - /* - * send mmap only for thread group leader - * see thread__init_map_groups - */ - if (pid == tgid && - perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid, - process, machine, mmap_data)) - return -1; - - return 0; - } - - if (machine__is_default_guest(machine)) - return 0; - - snprintf(filename, sizeof(filename), "%s/proc/%d/task", - machine->root_dir, pid); - - tasks = opendir(filename); - if (tasks == NULL) { - pr_debug("couldn't open %s\n", filename); - return 0; - } - - while ((dirent = readdir(tasks)) != NULL) { - char *end; - pid_t _pid; - - _pid = strtol(dirent->d_name, &end, 10); - if (*end) - continue; - - rc = -1; - if (perf_event__prepare_comm(comm_event, _pid, machine, - &tgid, &ppid) != 0) - break; - - if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid, - ppid, process, machine) < 0) - break; - - if (perf_event__synthesize_namespaces(tool, namespaces_event, _pid, - tgid, process, machine) < 0) - break; - - /* - * Send the prepared comm event - */ - if (perf_tool__process_synth_event(tool, comm_event, machine, process) != 0) - break; - - rc = 0; - if (_pid == pid) { - /* process the parent's maps too */ - rc = perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid, - process, machine, mmap_data); - if (rc) - break; - } - } - - closedir(tasks); - return rc; -} - -int perf_event__synthesize_thread_map(struct perf_tool *tool, - struct perf_thread_map *threads, - perf_event__handler_t process, - struct machine *machine, - bool mmap_data) -{ - union perf_event *comm_event, *mmap_event, *fork_event; - union perf_event *namespaces_event; - int err = -1, thread, j; - - comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); - if (comm_event == NULL) - goto out; - - mmap_event = malloc(sizeof(mmap_event->mmap2) + machine->id_hdr_size); - if (mmap_event == NULL) - goto out_free_comm; - - fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size); - if (fork_event == NULL) - goto out_free_mmap; - - namespaces_event = malloc(sizeof(namespaces_event->namespaces) + - (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + - machine->id_hdr_size); - if (namespaces_event == NULL) - goto out_free_fork; - - err = 0; - for (thread = 0; thread < threads->nr; ++thread) { - if (__event__synthesize_thread(comm_event, mmap_event, - fork_event, namespaces_event, - perf_thread_map__pid(threads, thread), 0, - process, tool, machine, - mmap_data)) { - err = -1; - break; - } - - /* - * comm.pid is set to thread group id by - * perf_event__synthesize_comm - */ - if ((int) comm_event->comm.pid != perf_thread_map__pid(threads, thread)) { - bool need_leader = true; - - /* is thread group leader in thread_map? */ - for (j = 0; j < threads->nr; ++j) { - if ((int) comm_event->comm.pid == perf_thread_map__pid(threads, j)) { - need_leader = false; - break; - } - } - - /* if not, generate events for it */ - if (need_leader && - __event__synthesize_thread(comm_event, mmap_event, - fork_event, namespaces_event, - comm_event->comm.pid, 0, - process, tool, machine, - mmap_data)) { - err = -1; - break; - } - } - } - free(namespaces_event); -out_free_fork: - free(fork_event); -out_free_mmap: - free(mmap_event); -out_free_comm: - free(comm_event); -out: - return err; -} - -static int __perf_event__synthesize_threads(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine, - bool mmap_data, - struct dirent **dirent, - int start, - int num) -{ - union perf_event *comm_event, *mmap_event, *fork_event; - union perf_event *namespaces_event; - int err = -1; - char *end; - pid_t pid; - int i; - - comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); - if (comm_event == NULL) - goto out; - - mmap_event = malloc(sizeof(mmap_event->mmap2) + machine->id_hdr_size); - if (mmap_event == NULL) - goto out_free_comm; - - fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size); - if (fork_event == NULL) - goto out_free_mmap; - - namespaces_event = malloc(sizeof(namespaces_event->namespaces) + - (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + - machine->id_hdr_size); - if (namespaces_event == NULL) - goto out_free_fork; - - for (i = start; i < start + num; i++) { - if (!isdigit(dirent[i]->d_name[0])) - continue; - - pid = (pid_t)strtol(dirent[i]->d_name, &end, 10); - /* only interested in proper numerical dirents */ - if (*end) - continue; - /* - * We may race with exiting thread, so don't stop just because - * one thread couldn't be synthesized. - */ - __event__synthesize_thread(comm_event, mmap_event, fork_event, - namespaces_event, pid, 1, process, - tool, machine, mmap_data); - } - err = 0; - - free(namespaces_event); -out_free_fork: - free(fork_event); -out_free_mmap: - free(mmap_event); -out_free_comm: - free(comm_event); -out: - return err; -} - -struct synthesize_threads_arg { - struct perf_tool *tool; - perf_event__handler_t process; - struct machine *machine; - bool mmap_data; - struct dirent **dirent; - int num; - int start; -}; - -static void *synthesize_threads_worker(void *arg) -{ - struct synthesize_threads_arg *args = arg; - - __perf_event__synthesize_threads(args->tool, args->process, - args->machine, args->mmap_data, - args->dirent, - args->start, args->num); - return NULL; -} - -int perf_event__synthesize_threads(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine, - bool mmap_data, - unsigned int nr_threads_synthesize) -{ - struct synthesize_threads_arg *args = NULL; - pthread_t *synthesize_threads = NULL; - char proc_path[PATH_MAX]; - struct dirent **dirent; - int num_per_thread; - int m, n, i, j; - int thread_nr; - int base = 0; - int err = -1; - - - if (machine__is_default_guest(machine)) - return 0; - - snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir); - n = scandir(proc_path, &dirent, 0, alphasort); - if (n < 0) - return err; - - if (nr_threads_synthesize == UINT_MAX) - thread_nr = sysconf(_SC_NPROCESSORS_ONLN); - else - thread_nr = nr_threads_synthesize; - - if (thread_nr <= 1) { - err = __perf_event__synthesize_threads(tool, process, - machine, mmap_data, - dirent, base, n); - goto free_dirent; - } - if (thread_nr > n) - thread_nr = n; - - synthesize_threads = calloc(sizeof(pthread_t), thread_nr); - if (synthesize_threads == NULL) - goto free_dirent; - - args = calloc(sizeof(*args), thread_nr); - if (args == NULL) - goto free_threads; - - num_per_thread = n / thread_nr; - m = n % thread_nr; - for (i = 0; i < thread_nr; i++) { - args[i].tool = tool; - args[i].process = process; - args[i].machine = machine; - args[i].mmap_data = mmap_data; - args[i].dirent = dirent; - } - for (i = 0; i < m; i++) { - args[i].num = num_per_thread + 1; - args[i].start = i * args[i].num; - } - if (i != 0) - base = args[i-1].start + args[i-1].num; - for (j = i; j < thread_nr; j++) { - args[j].num = num_per_thread; - args[j].start = base + (j - i) * args[i].num; - } - - for (i = 0; i < thread_nr; i++) { - if (pthread_create(&synthesize_threads[i], NULL, - synthesize_threads_worker, &args[i])) - goto out_join; - } - err = 0; -out_join: - for (i = 0; i < thread_nr; i++) - pthread_join(synthesize_threads[i], NULL); - free(args); -free_threads: - free(synthesize_threads); -free_dirent: - for (i = 0; i < n; i++) - zfree(&dirent[i]); - free(dirent); - - return err; -} - struct process_symbol_args { const char *name; u64 start; @@ -902,327 +117,6 @@ int kallsyms__get_function_start(const char *kallsyms_filename, return 0; } -int __weak perf_event__synthesize_extra_kmaps(struct perf_tool *tool __maybe_unused, - perf_event__handler_t process __maybe_unused, - struct machine *machine __maybe_unused) -{ - return 0; -} - -static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine) -{ - size_t size; - struct map *map = machine__kernel_map(machine); - struct kmap *kmap; - int err; - union perf_event *event; - - if (map == NULL) - return -1; - - kmap = map__kmap(map); - if (!kmap->ref_reloc_sym) - return -1; - - /* - * We should get this from /sys/kernel/sections/.text, but till that is - * available use this, and after it is use this as a fallback for older - * kernels. - */ - event = zalloc((sizeof(event->mmap) + machine->id_hdr_size)); - if (event == NULL) { - pr_debug("Not enough memory synthesizing mmap event " - "for kernel modules\n"); - return -1; - } - - if (machine__is_host(machine)) { - /* - * kernel uses PERF_RECORD_MISC_USER for user space maps, - * see kernel/perf_event.c __perf_event_mmap - */ - event->header.misc = PERF_RECORD_MISC_KERNEL; - } else { - event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; - } - - size = snprintf(event->mmap.filename, sizeof(event->mmap.filename), - "%s%s", machine->mmap_name, kmap->ref_reloc_sym->name) + 1; - size = PERF_ALIGN(size, sizeof(u64)); - event->mmap.header.type = PERF_RECORD_MMAP; - event->mmap.header.size = (sizeof(event->mmap) - - (sizeof(event->mmap.filename) - size) + machine->id_hdr_size); - event->mmap.pgoff = kmap->ref_reloc_sym->addr; - event->mmap.start = map->start; - event->mmap.len = map->end - event->mmap.start; - event->mmap.pid = machine->pid; - - err = perf_tool__process_synth_event(tool, event, machine, process); - free(event); - - return err; -} - -int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine) -{ - int err; - - err = __perf_event__synthesize_kernel_mmap(tool, process, machine); - if (err < 0) - return err; - - return perf_event__synthesize_extra_kmaps(tool, process, machine); -} - -int perf_event__synthesize_thread_map2(struct perf_tool *tool, - struct perf_thread_map *threads, - perf_event__handler_t process, - struct machine *machine) -{ - union perf_event *event; - int i, err, size; - - size = sizeof(event->thread_map); - size += threads->nr * sizeof(event->thread_map.entries[0]); - - event = zalloc(size); - if (!event) - return -ENOMEM; - - event->header.type = PERF_RECORD_THREAD_MAP; - event->header.size = size; - event->thread_map.nr = threads->nr; - - for (i = 0; i < threads->nr; i++) { - struct perf_record_thread_map_entry *entry = &event->thread_map.entries[i]; - char *comm = perf_thread_map__comm(threads, i); - - if (!comm) - comm = (char *) ""; - - entry->pid = perf_thread_map__pid(threads, i); - strncpy((char *) &entry->comm, comm, sizeof(entry->comm)); - } - - err = process(tool, event, NULL, machine); - - free(event); - return err; -} - -static void synthesize_cpus(struct cpu_map_entries *cpus, - struct perf_cpu_map *map) -{ - int i; - - cpus->nr = map->nr; - - for (i = 0; i < map->nr; i++) - cpus->cpu[i] = map->map[i]; -} - -static void synthesize_mask(struct perf_record_record_cpu_map *mask, - struct perf_cpu_map *map, int max) -{ - int i; - - mask->nr = BITS_TO_LONGS(max); - mask->long_size = sizeof(long); - - for (i = 0; i < map->nr; i++) - set_bit(map->map[i], mask->mask); -} - -static size_t cpus_size(struct perf_cpu_map *map) -{ - return sizeof(struct cpu_map_entries) + map->nr * sizeof(u16); -} - -static size_t mask_size(struct perf_cpu_map *map, int *max) -{ - int i; - - *max = 0; - - for (i = 0; i < map->nr; i++) { - /* bit possition of the cpu is + 1 */ - int bit = map->map[i] + 1; - - if (bit > *max) - *max = bit; - } - - return sizeof(struct perf_record_record_cpu_map) + BITS_TO_LONGS(*max) * sizeof(long); -} - -void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int *max) -{ - size_t size_cpus, size_mask; - bool is_dummy = perf_cpu_map__empty(map); - - /* - * Both array and mask data have variable size based - * on the number of cpus and their actual values. - * The size of the 'struct perf_record_cpu_map_data' is: - * - * array = size of 'struct cpu_map_entries' + - * number of cpus * sizeof(u64) - * - * mask = size of 'struct perf_record_record_cpu_map' + - * maximum cpu bit converted to size of longs - * - * and finaly + the size of 'struct perf_record_cpu_map_data'. - */ - size_cpus = cpus_size(map); - size_mask = mask_size(map, max); - - if (is_dummy || (size_cpus < size_mask)) { - *size += size_cpus; - *type = PERF_CPU_MAP__CPUS; - } else { - *size += size_mask; - *type = PERF_CPU_MAP__MASK; - } - - *size += sizeof(struct perf_record_cpu_map_data); - *size = PERF_ALIGN(*size, sizeof(u64)); - return zalloc(*size); -} - -void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct perf_cpu_map *map, - u16 type, int max) -{ - data->type = type; - - switch (type) { - case PERF_CPU_MAP__CPUS: - synthesize_cpus((struct cpu_map_entries *) data->data, map); - break; - case PERF_CPU_MAP__MASK: - synthesize_mask((struct perf_record_record_cpu_map *)data->data, map, max); - default: - break; - }; -} - -static struct perf_record_cpu_map *cpu_map_event__new(struct perf_cpu_map *map) -{ - size_t size = sizeof(struct perf_record_cpu_map); - struct perf_record_cpu_map *event; - int max; - u16 type; - - event = cpu_map_data__alloc(map, &size, &type, &max); - if (!event) - return NULL; - - event->header.type = PERF_RECORD_CPU_MAP; - event->header.size = size; - event->data.type = type; - - cpu_map_data__synthesize(&event->data, map, type, max); - return event; -} - -int perf_event__synthesize_cpu_map(struct perf_tool *tool, - struct perf_cpu_map *map, - perf_event__handler_t process, - struct machine *machine) -{ - struct perf_record_cpu_map *event; - int err; - - event = cpu_map_event__new(map); - if (!event) - return -ENOMEM; - - err = process(tool, (union perf_event *) event, NULL, machine); - - free(event); - return err; -} - -int perf_event__synthesize_stat_config(struct perf_tool *tool, - struct perf_stat_config *config, - perf_event__handler_t process, - struct machine *machine) -{ - struct perf_record_stat_config *event; - int size, i = 0, err; - - size = sizeof(*event); - size += (PERF_STAT_CONFIG_TERM__MAX * sizeof(event->data[0])); - - event = zalloc(size); - if (!event) - return -ENOMEM; - - event->header.type = PERF_RECORD_STAT_CONFIG; - event->header.size = size; - event->nr = PERF_STAT_CONFIG_TERM__MAX; - -#define ADD(__term, __val) \ - event->data[i].tag = PERF_STAT_CONFIG_TERM__##__term; \ - event->data[i].val = __val; \ - i++; - - ADD(AGGR_MODE, config->aggr_mode) - ADD(INTERVAL, config->interval) - ADD(SCALE, config->scale) - - WARN_ONCE(i != PERF_STAT_CONFIG_TERM__MAX, - "stat config terms unbalanced\n"); -#undef ADD - - err = process(tool, (union perf_event *) event, NULL, machine); - - free(event); - return err; -} - -int perf_event__synthesize_stat(struct perf_tool *tool, - u32 cpu, u32 thread, u64 id, - struct perf_counts_values *count, - perf_event__handler_t process, - struct machine *machine) -{ - struct perf_record_stat event; - - event.header.type = PERF_RECORD_STAT; - event.header.size = sizeof(event); - event.header.misc = 0; - - event.id = id; - event.cpu = cpu; - event.thread = thread; - event.val = count->val; - event.ena = count->ena; - event.run = count->run; - - return process(tool, (union perf_event *) &event, NULL, machine); -} - -int perf_event__synthesize_stat_round(struct perf_tool *tool, - u64 evtime, u64 type, - perf_event__handler_t process, - struct machine *machine) -{ - struct perf_record_stat_round event; - - event.header.type = PERF_RECORD_STAT_ROUND; - event.header.size = sizeof(event); - event.header.misc = 0; - - event.time = evtime; - event.type = type; - - return process(tool, (union perf_event *) &event, NULL, machine); -} - void perf_event__read_stat_config(struct perf_stat_config *config, struct perf_record_stat_config *event) { diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 5af025c80ec5..8e335d168503 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -40,7 +40,6 @@ #include "trace-event.h" #include "stat.h" #include "string2.h" -#include "util/synthetic-events.h" #include "memswap.h" #include "util.h" #include "../perf-sys.h" @@ -2421,283 +2420,6 @@ int perf_evsel__parse_sample_timestamp(struct evsel *evsel, return 0; } -size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, - u64 read_format) -{ - size_t sz, result = sizeof(struct perf_record_sample); - - if (type & PERF_SAMPLE_IDENTIFIER) - result += sizeof(u64); - - if (type & PERF_SAMPLE_IP) - result += sizeof(u64); - - if (type & PERF_SAMPLE_TID) - result += sizeof(u64); - - if (type & PERF_SAMPLE_TIME) - result += sizeof(u64); - - if (type & PERF_SAMPLE_ADDR) - result += sizeof(u64); - - if (type & PERF_SAMPLE_ID) - result += sizeof(u64); - - if (type & PERF_SAMPLE_STREAM_ID) - result += sizeof(u64); - - if (type & PERF_SAMPLE_CPU) - result += sizeof(u64); - - if (type & PERF_SAMPLE_PERIOD) - result += sizeof(u64); - - if (type & PERF_SAMPLE_READ) { - result += sizeof(u64); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - result += sizeof(u64); - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - result += sizeof(u64); - /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */ - if (read_format & PERF_FORMAT_GROUP) { - sz = sample->read.group.nr * - sizeof(struct sample_read_value); - result += sz; - } else { - result += sizeof(u64); - } - } - - if (type & PERF_SAMPLE_CALLCHAIN) { - sz = (sample->callchain->nr + 1) * sizeof(u64); - result += sz; - } - - if (type & PERF_SAMPLE_RAW) { - result += sizeof(u32); - result += sample->raw_size; - } - - if (type & PERF_SAMPLE_BRANCH_STACK) { - sz = sample->branch_stack->nr * sizeof(struct branch_entry); - sz += sizeof(u64); - result += sz; - } - - if (type & PERF_SAMPLE_REGS_USER) { - if (sample->user_regs.abi) { - result += sizeof(u64); - sz = hweight64(sample->user_regs.mask) * sizeof(u64); - result += sz; - } else { - result += sizeof(u64); - } - } - - if (type & PERF_SAMPLE_STACK_USER) { - sz = sample->user_stack.size; - result += sizeof(u64); - if (sz) { - result += sz; - result += sizeof(u64); - } - } - - if (type & PERF_SAMPLE_WEIGHT) - result += sizeof(u64); - - if (type & PERF_SAMPLE_DATA_SRC) - result += sizeof(u64); - - if (type & PERF_SAMPLE_TRANSACTION) - result += sizeof(u64); - - if (type & PERF_SAMPLE_REGS_INTR) { - if (sample->intr_regs.abi) { - result += sizeof(u64); - sz = hweight64(sample->intr_regs.mask) * sizeof(u64); - result += sz; - } else { - result += sizeof(u64); - } - } - - if (type & PERF_SAMPLE_PHYS_ADDR) - result += sizeof(u64); - - return result; -} - -int perf_event__synthesize_sample(union perf_event *event, u64 type, - u64 read_format, - const struct perf_sample *sample) -{ - __u64 *array; - size_t sz; - /* - * used for cross-endian analysis. See git commit 65014ab3 - * for why this goofiness is needed. - */ - union u64_swap u; - - array = event->sample.array; - - if (type & PERF_SAMPLE_IDENTIFIER) { - *array = sample->id; - array++; - } - - if (type & PERF_SAMPLE_IP) { - *array = sample->ip; - array++; - } - - if (type & PERF_SAMPLE_TID) { - u.val32[0] = sample->pid; - u.val32[1] = sample->tid; - *array = u.val64; - array++; - } - - if (type & PERF_SAMPLE_TIME) { - *array = sample->time; - array++; - } - - if (type & PERF_SAMPLE_ADDR) { - *array = sample->addr; - array++; - } - - if (type & PERF_SAMPLE_ID) { - *array = sample->id; - array++; - } - - if (type & PERF_SAMPLE_STREAM_ID) { - *array = sample->stream_id; - array++; - } - - if (type & PERF_SAMPLE_CPU) { - u.val32[0] = sample->cpu; - u.val32[1] = 0; - *array = u.val64; - array++; - } - - if (type & PERF_SAMPLE_PERIOD) { - *array = sample->period; - array++; - } - - if (type & PERF_SAMPLE_READ) { - if (read_format & PERF_FORMAT_GROUP) - *array = sample->read.group.nr; - else - *array = sample->read.one.value; - array++; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - *array = sample->read.time_enabled; - array++; - } - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - *array = sample->read.time_running; - array++; - } - - /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */ - if (read_format & PERF_FORMAT_GROUP) { - sz = sample->read.group.nr * - sizeof(struct sample_read_value); - memcpy(array, sample->read.group.values, sz); - array = (void *)array + sz; - } else { - *array = sample->read.one.id; - array++; - } - } - - if (type & PERF_SAMPLE_CALLCHAIN) { - sz = (sample->callchain->nr + 1) * sizeof(u64); - memcpy(array, sample->callchain, sz); - array = (void *)array + sz; - } - - if (type & PERF_SAMPLE_RAW) { - u.val32[0] = sample->raw_size; - *array = u.val64; - array = (void *)array + sizeof(u32); - - memcpy(array, sample->raw_data, sample->raw_size); - array = (void *)array + sample->raw_size; - } - - if (type & PERF_SAMPLE_BRANCH_STACK) { - sz = sample->branch_stack->nr * sizeof(struct branch_entry); - sz += sizeof(u64); - memcpy(array, sample->branch_stack, sz); - array = (void *)array + sz; - } - - if (type & PERF_SAMPLE_REGS_USER) { - if (sample->user_regs.abi) { - *array++ = sample->user_regs.abi; - sz = hweight64(sample->user_regs.mask) * sizeof(u64); - memcpy(array, sample->user_regs.regs, sz); - array = (void *)array + sz; - } else { - *array++ = 0; - } - } - - if (type & PERF_SAMPLE_STACK_USER) { - sz = sample->user_stack.size; - *array++ = sz; - if (sz) { - memcpy(array, sample->user_stack.data, sz); - array = (void *)array + sz; - *array++ = sz; - } - } - - if (type & PERF_SAMPLE_WEIGHT) { - *array = sample->weight; - array++; - } - - if (type & PERF_SAMPLE_DATA_SRC) { - *array = sample->data_src; - array++; - } - - if (type & PERF_SAMPLE_TRANSACTION) { - *array = sample->transaction; - array++; - } - - if (type & PERF_SAMPLE_REGS_INTR) { - if (sample->intr_regs.abi) { - *array++ = sample->intr_regs.abi; - sz = hweight64(sample->intr_regs.mask) * sizeof(u64); - memcpy(array, sample->intr_regs.regs, sz); - array = (void *)array + sz; - } else { - *array++ = 0; - } - } - - if (type & PERF_SAMPLE_PHYS_ADDR) { - *array = sample->phys_addr; - array++; - } - - return 0; -} - struct tep_format_field *perf_evsel__field(struct evsel *evsel, const char *name) { return tep_find_field(evsel->tp_format, name); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index a4a8342eba98..5722ff717777 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -45,7 +45,6 @@ #include "util.h" // page_size, perf_exe() #include "cputopo.h" #include "bpf-event.h" -#include "util/synthetic-events.h" #include #include @@ -72,15 +71,6 @@ struct perf_file_attr { struct perf_file_section ids; }; -struct feat_fd { - struct perf_header *ph; - int fd; - void *buf; /* Either buf != NULL or fd >= 0 */ - ssize_t offset; - size_t size; - struct evsel *events; -}; - void perf_header__set_feat(struct perf_header *header, int feat) { set_bit(feat, header->adds_features); @@ -2825,15 +2815,6 @@ static int process_compressed(struct feat_fd *ff, return 0; } -struct feature_ops { - int (*write)(struct feat_fd *ff, struct evlist *evlist); - void (*print)(struct feat_fd *ff, FILE *fp); - int (*process)(struct feat_fd *ff, void *data); - const char *name; - bool full_only; - bool synthesize; -}; - #define FEAT_OPR(n, func, __full_only) \ [HEADER_##n] = { \ .name = __stringify(n), \ @@ -2860,8 +2841,10 @@ struct feature_ops { #define process_branch_stack NULL #define process_stat NULL +// Only used in util/synthetic-events.c +const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE]; -static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { +const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPN(TRACING_DATA, tracing_data, false), FEAT_OPN(BUILD_ID, build_id, false), FEAT_OPR(HOSTNAME, hostname, false), @@ -3658,105 +3641,6 @@ out_delete_evlist: return -ENOMEM; } -int perf_event__synthesize_attr(struct perf_tool *tool, - struct perf_event_attr *attr, u32 ids, u64 *id, - perf_event__handler_t process) -{ - union perf_event *ev; - size_t size; - int err; - - size = sizeof(struct perf_event_attr); - size = PERF_ALIGN(size, sizeof(u64)); - size += sizeof(struct perf_event_header); - size += ids * sizeof(u64); - - ev = zalloc(size); - - if (ev == NULL) - return -ENOMEM; - - ev->attr.attr = *attr; - memcpy(ev->attr.id, id, ids * sizeof(u64)); - - ev->attr.header.type = PERF_RECORD_HEADER_ATTR; - ev->attr.header.size = (u16)size; - - if (ev->attr.header.size == size) - err = process(tool, ev, NULL, NULL); - else - err = -E2BIG; - - free(ev); - - return err; -} - -int perf_event__synthesize_features(struct perf_tool *tool, - struct perf_session *session, - struct evlist *evlist, - perf_event__handler_t process) -{ - struct perf_header *header = &session->header; - struct feat_fd ff; - struct perf_record_header_feature *fe; - size_t sz, sz_hdr; - int feat, ret; - - sz_hdr = sizeof(fe->header); - sz = sizeof(union perf_event); - /* get a nice alignment */ - sz = PERF_ALIGN(sz, page_size); - - memset(&ff, 0, sizeof(ff)); - - ff.buf = malloc(sz); - if (!ff.buf) - return -ENOMEM; - - ff.size = sz - sz_hdr; - ff.ph = &session->header; - - for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) { - if (!feat_ops[feat].synthesize) { - pr_debug("No record header feature for header :%d\n", feat); - continue; - } - - ff.offset = sizeof(*fe); - - ret = feat_ops[feat].write(&ff, evlist); - if (ret || ff.offset <= (ssize_t)sizeof(*fe)) { - pr_debug("Error writing feature\n"); - continue; - } - /* ff.buf may have changed due to realloc in do_write() */ - fe = ff.buf; - memset(fe, 0, sizeof(*fe)); - - fe->feat_id = feat; - fe->header.type = PERF_RECORD_HEADER_FEATURE; - fe->header.size = ff.offset; - - ret = process(tool, ff.buf, NULL, NULL); - if (ret) { - free(ff.buf); - return ret; - } - } - - /* Send HEADER_LAST_FEATURE mark. */ - fe = ff.buf; - fe->feat_id = HEADER_LAST_FEATURE; - fe->header.type = PERF_RECORD_HEADER_FEATURE; - fe->header.size = sizeof(*fe); - - ret = process(tool, ff.buf, NULL, NULL); - - free(ff.buf); - return ret; -} - int perf_event__process_feature(struct perf_session *session, union perf_event *event) { @@ -3799,113 +3683,6 @@ int perf_event__process_feature(struct perf_session *session, return 0; } -static struct perf_record_event_update * -event_update_event__new(size_t size, u64 type, u64 id) -{ - struct perf_record_event_update *ev; - - size += sizeof(*ev); - size = PERF_ALIGN(size, sizeof(u64)); - - ev = zalloc(size); - if (ev) { - ev->header.type = PERF_RECORD_EVENT_UPDATE; - ev->header.size = (u16)size; - ev->type = type; - ev->id = id; - } - return ev; -} - -int -perf_event__synthesize_event_update_unit(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process) -{ - struct perf_record_event_update *ev; - size_t size = strlen(evsel->unit); - int err; - - ev = event_update_event__new(size + 1, PERF_EVENT_UPDATE__UNIT, evsel->id[0]); - if (ev == NULL) - return -ENOMEM; - - strlcpy(ev->data, evsel->unit, size + 1); - err = process(tool, (union perf_event *)ev, NULL, NULL); - free(ev); - return err; -} - -int -perf_event__synthesize_event_update_scale(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process) -{ - struct perf_record_event_update *ev; - struct perf_record_event_update_scale *ev_data; - int err; - - ev = event_update_event__new(sizeof(*ev_data), PERF_EVENT_UPDATE__SCALE, evsel->id[0]); - if (ev == NULL) - return -ENOMEM; - - ev_data = (struct perf_record_event_update_scale *)ev->data; - ev_data->scale = evsel->scale; - err = process(tool, (union perf_event*) ev, NULL, NULL); - free(ev); - return err; -} - -int -perf_event__synthesize_event_update_name(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process) -{ - struct perf_record_event_update *ev; - size_t len = strlen(evsel->name); - int err; - - ev = event_update_event__new(len + 1, PERF_EVENT_UPDATE__NAME, evsel->id[0]); - if (ev == NULL) - return -ENOMEM; - - strlcpy(ev->data, evsel->name, len + 1); - err = process(tool, (union perf_event*) ev, NULL, NULL); - free(ev); - return err; -} - -int -perf_event__synthesize_event_update_cpus(struct perf_tool *tool, - struct evsel *evsel, - perf_event__handler_t process) -{ - size_t size = sizeof(struct perf_record_event_update); - struct perf_record_event_update *ev; - int max, err; - u16 type; - - if (!evsel->core.own_cpus) - return 0; - - ev = cpu_map_data__alloc(evsel->core.own_cpus, &size, &type, &max); - if (!ev) - return -ENOMEM; - - ev->header.type = PERF_RECORD_EVENT_UPDATE; - ev->header.size = (u16)size; - ev->type = PERF_EVENT_UPDATE__CPUS; - ev->id = evsel->id[0]; - - cpu_map_data__synthesize((struct perf_record_cpu_map_data *)ev->data, - evsel->core.own_cpus, - type, max); - - err = process(tool, (union perf_event*) ev, NULL, NULL); - free(ev); - return err; -} - size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp) { struct perf_record_event_update *ev = &event->event_update; @@ -3945,93 +3722,6 @@ size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp) return ret; } -int perf_event__synthesize_attrs(struct perf_tool *tool, - struct evlist *evlist, - perf_event__handler_t process) -{ - struct evsel *evsel; - int err = 0; - - evlist__for_each_entry(evlist, evsel) { - err = perf_event__synthesize_attr(tool, &evsel->core.attr, evsel->ids, - evsel->id, process); - if (err) { - pr_debug("failed to create perf header attribute\n"); - return err; - } - } - - return err; -} - -static bool has_unit(struct evsel *counter) -{ - return counter->unit && *counter->unit; -} - -static bool has_scale(struct evsel *counter) -{ - return counter->scale != 1; -} - -int perf_event__synthesize_extra_attr(struct perf_tool *tool, - struct evlist *evsel_list, - perf_event__handler_t process, - bool is_pipe) -{ - struct evsel *counter; - int err; - - /* - * Synthesize other events stuff not carried within - * attr event - unit, scale, name - */ - evlist__for_each_entry(evsel_list, counter) { - if (!counter->supported) - continue; - - /* - * Synthesize unit and scale only if it's defined. - */ - if (has_unit(counter)) { - err = perf_event__synthesize_event_update_unit(tool, counter, process); - if (err < 0) { - pr_err("Couldn't synthesize evsel unit.\n"); - return err; - } - } - - if (has_scale(counter)) { - err = perf_event__synthesize_event_update_scale(tool, counter, process); - if (err < 0) { - pr_err("Couldn't synthesize evsel counter.\n"); - return err; - } - } - - if (counter->core.own_cpus) { - err = perf_event__synthesize_event_update_cpus(tool, counter, process); - if (err < 0) { - pr_err("Couldn't synthesize evsel cpus.\n"); - return err; - } - } - - /* - * Name is needed only for pipe output, - * perf.data carries event names. - */ - if (is_pipe) { - err = perf_event__synthesize_event_update_name(tool, counter, process); - if (err < 0) { - pr_err("Couldn't synthesize evsel name.\n"); - return err; - } - } - } - return 0; -} - int perf_event__process_attr(struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist) @@ -4116,55 +3806,6 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, return 0; } -int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, - struct evlist *evlist, - perf_event__handler_t process) -{ - union perf_event ev; - struct tracing_data *tdata; - ssize_t size = 0, aligned_size = 0, padding; - struct feat_fd ff; - int err __maybe_unused = 0; - - /* - * We are going to store the size of the data followed - * by the data contents. Since the fd descriptor is a pipe, - * we cannot seek back to store the size of the data once - * we know it. Instead we: - * - * - write the tracing data to the temp file - * - get/write the data size to pipe - * - write the tracing data from the temp file - * to the pipe - */ - tdata = tracing_data_get(&evlist->core.entries, fd, true); - if (!tdata) - return -1; - - memset(&ev, 0, sizeof(ev)); - - ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA; - size = tdata->size; - aligned_size = PERF_ALIGN(size, sizeof(u64)); - padding = aligned_size - size; - ev.tracing_data.header.size = sizeof(ev.tracing_data); - ev.tracing_data.size = aligned_size; - - process(tool, &ev, NULL, NULL); - - /* - * The put function will copy all the tracing data - * stored in temp file to the pipe. - */ - tracing_data_put(tdata); - - ff = (struct feat_fd){ .fd = fd }; - if (write_padded(&ff, NULL, 0, padding)) - return -1; - - return aligned_size; -} - int perf_event__process_tracing_data(struct perf_session *session, union perf_event *event) { @@ -4204,34 +3845,6 @@ int perf_event__process_tracing_data(struct perf_session *session, return size_read + padding; } -int perf_event__synthesize_build_id(struct perf_tool *tool, - struct dso *pos, u16 misc, - perf_event__handler_t process, - struct machine *machine) -{ - union perf_event ev; - size_t len; - int err = 0; - - if (!pos->hit) - return err; - - memset(&ev, 0, sizeof(ev)); - - len = pos->long_name_len + 1; - len = PERF_ALIGN(len, NAME_ALIGN); - memcpy(&ev.build_id.build_id, pos->build_id, sizeof(pos->build_id)); - ev.build_id.header.type = PERF_RECORD_HEADER_BUILD_ID; - ev.build_id.header.misc = misc; - ev.build_id.pid = machine->pid; - ev.build_id.header.size = sizeof(ev.build_id) + len; - memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len); - - err = process(tool, &ev, NULL, machine); - - return err; -} - int perf_event__process_build_id(struct perf_session *session, union perf_event *event) { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 999dac41871e..ca53a929e9fd 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -92,6 +92,24 @@ struct perf_header { struct perf_env env; }; +struct feat_fd { + struct perf_header *ph; + int fd; + void *buf; /* Either buf != NULL or fd >= 0 */ + ssize_t offset; + size_t size; + struct evsel *events; +}; + +struct perf_header_feature_ops { + int (*write)(struct feat_fd *ff, struct evlist *evlist); + void (*print)(struct feat_fd *ff, FILE *fp); + int (*process)(struct feat_fd *ff, void *data); + const char *name; + bool full_only; + bool synthesize; +}; + struct evlist; struct perf_session; struct perf_tool; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 132de5cfb9b9..0535338f2d7a 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -20,7 +20,6 @@ #include "symbol.h" #include "sort.h" #include "strlist.h" -#include "util/synthetic-events.h" #include "target.h" #include "thread.h" #include "util.h" @@ -2610,30 +2609,6 @@ int machines__for_each_thread(struct machines *machines, return rc; } -int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, - struct target *target, struct perf_thread_map *threads, - perf_event__handler_t process, bool data_mmap, - unsigned int nr_threads_synthesize) -{ - if (target__has_task(target)) - return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap); - else if (target__has_cpu(target)) - return perf_event__synthesize_threads(tool, process, - machine, data_mmap, - nr_threads_synthesize); - /* command specified */ - return 0; -} - -int machine__synthesize_threads(struct machine *machine, struct target *target, - struct perf_thread_map *threads, bool data_mmap, - unsigned int nr_threads_synthesize) -{ - return __machine__synthesize_threads(machine, NULL, target, threads, - perf_event__process, data_mmap, - nr_threads_synthesize); -} - pid_t machine__get_current_tid(struct machine *machine, int cpu) { int nr_cpus = min(machine->env->nr_cpus_online, MAX_NR_CPUS); diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c index 99be15dd2b6b..285d6f30d912 100644 --- a/tools/perf/util/namespaces.c +++ b/tools/perf/util/namespaces.c @@ -17,8 +17,26 @@ #include #include #include +#include #include +static const char *perf_ns__names[] = { + [NET_NS_INDEX] = "net", + [UTS_NS_INDEX] = "uts", + [IPC_NS_INDEX] = "ipc", + [PID_NS_INDEX] = "pid", + [USER_NS_INDEX] = "user", + [MNT_NS_INDEX] = "mnt", + [CGROUP_NS_INDEX] = "cgroup", +}; + +const char *perf_ns__name(unsigned int id) +{ + if (id >= ARRAY_SIZE(perf_ns__names)) + return "UNKNOWN"; + return perf_ns__names[id]; +} + struct namespaces *namespaces__new(struct perf_record_namespaces *event) { struct namespaces *namespaces; diff --git a/tools/perf/util/namespaces.h b/tools/perf/util/namespaces.h index 40edef56cb52..4b33f684eddd 100644 --- a/tools/perf/util/namespaces.h +++ b/tools/perf/util/namespaces.h @@ -66,4 +66,6 @@ static inline void __nsinfo__zput(struct nsinfo **nsip) #define nsinfo__zput(nsi) __nsinfo__zput(&nsi) +const char *perf_ns__name(unsigned int id); + #endif /* __PERF_NAMESPACES_H */ diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 6267613b551d..58b5bc34ba12 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -29,7 +29,6 @@ #include "thread-stack.h" #include "sample-raw.h" #include "stat.h" -#include "util/synthetic-events.h" #include "util.h" #include "ui/progress.h" #include "../perf.h" @@ -2413,73 +2412,3 @@ int perf_event__process_id_index(struct perf_session *session, } return 0; } - -int perf_event__synthesize_id_index(struct perf_tool *tool, - perf_event__handler_t process, - struct evlist *evlist, - struct machine *machine) -{ - union perf_event *ev; - struct evsel *evsel; - size_t nr = 0, i = 0, sz, max_nr, n; - int err; - - pr_debug2("Synthesizing id index\n"); - - max_nr = (UINT16_MAX - sizeof(struct perf_record_id_index)) / - sizeof(struct id_index_entry); - - evlist__for_each_entry(evlist, evsel) - nr += evsel->ids; - - n = nr > max_nr ? max_nr : nr; - sz = sizeof(struct perf_record_id_index) + n * sizeof(struct id_index_entry); - ev = zalloc(sz); - if (!ev) - return -ENOMEM; - - ev->id_index.header.type = PERF_RECORD_ID_INDEX; - ev->id_index.header.size = sz; - ev->id_index.nr = n; - - evlist__for_each_entry(evlist, evsel) { - u32 j; - - for (j = 0; j < evsel->ids; j++) { - struct id_index_entry *e; - struct perf_sample_id *sid; - - if (i >= n) { - err = process(tool, ev, NULL, machine); - if (err) - goto out_err; - nr -= n; - i = 0; - } - - e = &ev->id_index.entries[i++]; - - e->id = evsel->id[j]; - - sid = perf_evlist__id2sid(evlist, e->id); - if (!sid) { - free(ev); - return -ENOENT; - } - - e->idx = sid->idx; - e->cpu = sid->cpu; - e->tid = sid->tid; - } - } - - sz = sizeof(struct perf_record_id_index) + nr * sizeof(struct id_index_entry); - ev->id_index.header.size = sz; - ev->id_index.nr = nr; - - err = process(tool, ev, NULL, machine); -out_err: - free(ev); - - return err; -} diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 46c8a5027e12..06571209cb0b 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -12,7 +12,6 @@ #include "target.h" #include "evlist.h" #include "evsel.h" -#include "util/synthetic-events.h" #include "thread_map.h" #include @@ -495,45 +494,3 @@ int create_perf_stat_counter(struct evsel *evsel, return perf_evsel__open_per_thread(evsel, evsel->core.threads); } - -int perf_event__synthesize_stat_events(struct perf_stat_config *config, - struct perf_tool *tool, - struct evlist *evlist, - perf_event__handler_t process, - bool attrs) -{ - int err; - - if (attrs) { - err = perf_event__synthesize_attrs(tool, evlist, process); - if (err < 0) { - pr_err("Couldn't synthesize attrs.\n"); - return err; - } - } - - err = perf_event__synthesize_extra_attr(tool, evlist, process, - attrs); - - err = perf_event__synthesize_thread_map2(tool, evlist->core.threads, - process, NULL); - if (err < 0) { - pr_err("Couldn't synthesize thread map.\n"); - return err; - } - - err = perf_event__synthesize_cpu_map(tool, evlist->core.cpus, - process, NULL); - if (err < 0) { - pr_err("Couldn't synthesize thread map.\n"); - return err; - } - - err = perf_event__synthesize_stat_config(tool, config, process, NULL); - if (err < 0) { - pr_err("Couldn't synthesize config.\n"); - return err; - } - - return 0; -} diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c new file mode 100644 index 000000000000..8322028a9a97 --- /dev/null +++ b/tools/perf/util/synthetic-events.c @@ -0,0 +1,1884 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "util/debug.h" +#include "util/dso.h" +#include "util/event.h" +#include "util/evlist.h" +#include "util/machine.h" +#include "util/map.h" +#include "util/map_symbol.h" +#include "util/branch.h" +#include "util/memswap.h" +#include "util/namespaces.h" +#include "util/session.h" +#include "util/stat.h" +#include "util/symbol.h" +#include "util/synthetic-events.h" +#include "util/target.h" +#include "util/time-utils.h" +#include "util/util.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* To get things like MAP_HUGETLB even on older libc headers */ +#include +#include +#include +#include +#include + +#define DEFAULT_PROC_MAP_PARSE_TIMEOUT 500 + +unsigned int proc_map_timeout = DEFAULT_PROC_MAP_PARSE_TIMEOUT; + +int perf_tool__process_synth_event(struct perf_tool *tool, + union perf_event *event, + struct machine *machine, + perf_event__handler_t process) +{ + struct perf_sample synth_sample = { + .pid = -1, + .tid = -1, + .time = -1, + .stream_id = -1, + .cpu = -1, + .period = 1, + .cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK, + }; + + return process(tool, event, &synth_sample, machine); +}; + +/* + * Assumes that the first 4095 bytes of /proc/pid/stat contains + * the comm, tgid and ppid. + */ +static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len, + pid_t *tgid, pid_t *ppid) +{ + char filename[PATH_MAX]; + char bf[4096]; + int fd; + size_t size = 0; + ssize_t n; + char *name, *tgids, *ppids; + + *tgid = -1; + *ppid = -1; + + snprintf(filename, sizeof(filename), "/proc/%d/status", pid); + + fd = open(filename, O_RDONLY); + if (fd < 0) { + pr_debug("couldn't open %s\n", filename); + return -1; + } + + n = read(fd, bf, sizeof(bf) - 1); + close(fd); + if (n <= 0) { + pr_warning("Couldn't get COMM, tigd and ppid for pid %d\n", + pid); + return -1; + } + bf[n] = '\0'; + + name = strstr(bf, "Name:"); + tgids = strstr(bf, "Tgid:"); + ppids = strstr(bf, "PPid:"); + + if (name) { + char *nl; + + name = skip_spaces(name + 5); /* strlen("Name:") */ + nl = strchr(name, '\n'); + if (nl) + *nl = '\0'; + + size = strlen(name); + if (size >= len) + size = len - 1; + memcpy(comm, name, size); + comm[size] = '\0'; + } else { + pr_debug("Name: string not found for pid %d\n", pid); + } + + if (tgids) { + tgids += 5; /* strlen("Tgid:") */ + *tgid = atoi(tgids); + } else { + pr_debug("Tgid: string not found for pid %d\n", pid); + } + + if (ppids) { + ppids += 5; /* strlen("PPid:") */ + *ppid = atoi(ppids); + } else { + pr_debug("PPid: string not found for pid %d\n", pid); + } + + return 0; +} + +static int perf_event__prepare_comm(union perf_event *event, pid_t pid, + struct machine *machine, + pid_t *tgid, pid_t *ppid) +{ + size_t size; + + *ppid = -1; + + memset(&event->comm, 0, sizeof(event->comm)); + + if (machine__is_host(machine)) { + if (perf_event__get_comm_ids(pid, event->comm.comm, + sizeof(event->comm.comm), + tgid, ppid) != 0) { + return -1; + } + } else { + *tgid = machine->pid; + } + + if (*tgid < 0) + return -1; + + event->comm.pid = *tgid; + event->comm.header.type = PERF_RECORD_COMM; + + size = strlen(event->comm.comm) + 1; + size = PERF_ALIGN(size, sizeof(u64)); + memset(event->comm.comm + size, 0, machine->id_hdr_size); + event->comm.header.size = (sizeof(event->comm) - + (sizeof(event->comm.comm) - size) + + machine->id_hdr_size); + event->comm.tid = pid; + + return 0; +} + +pid_t perf_event__synthesize_comm(struct perf_tool *tool, + union perf_event *event, pid_t pid, + perf_event__handler_t process, + struct machine *machine) +{ + pid_t tgid, ppid; + + if (perf_event__prepare_comm(event, pid, machine, &tgid, &ppid) != 0) + return -1; + + if (perf_tool__process_synth_event(tool, event, machine, process) != 0) + return -1; + + return tgid; +} + +static void perf_event__get_ns_link_info(pid_t pid, const char *ns, + struct perf_ns_link_info *ns_link_info) +{ + struct stat64 st; + char proc_ns[128]; + + sprintf(proc_ns, "/proc/%u/ns/%s", pid, ns); + if (stat64(proc_ns, &st) == 0) { + ns_link_info->dev = st.st_dev; + ns_link_info->ino = st.st_ino; + } +} + +int perf_event__synthesize_namespaces(struct perf_tool *tool, + union perf_event *event, + pid_t pid, pid_t tgid, + perf_event__handler_t process, + struct machine *machine) +{ + u32 idx; + struct perf_ns_link_info *ns_link_info; + + if (!tool || !tool->namespace_events) + return 0; + + memset(&event->namespaces, 0, (sizeof(event->namespaces) + + (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + + machine->id_hdr_size)); + + event->namespaces.pid = tgid; + event->namespaces.tid = pid; + + event->namespaces.nr_namespaces = NR_NAMESPACES; + + ns_link_info = event->namespaces.link_info; + + for (idx = 0; idx < event->namespaces.nr_namespaces; idx++) + perf_event__get_ns_link_info(pid, perf_ns__name(idx), + &ns_link_info[idx]); + + event->namespaces.header.type = PERF_RECORD_NAMESPACES; + + event->namespaces.header.size = (sizeof(event->namespaces) + + (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + + machine->id_hdr_size); + + if (perf_tool__process_synth_event(tool, event, machine, process) != 0) + return -1; + + return 0; +} + +static int perf_event__synthesize_fork(struct perf_tool *tool, + union perf_event *event, + pid_t pid, pid_t tgid, pid_t ppid, + perf_event__handler_t process, + struct machine *machine) +{ + memset(&event->fork, 0, sizeof(event->fork) + machine->id_hdr_size); + + /* + * for main thread set parent to ppid from status file. For other + * threads set parent pid to main thread. ie., assume main thread + * spawns all threads in a process + */ + if (tgid == pid) { + event->fork.ppid = ppid; + event->fork.ptid = ppid; + } else { + event->fork.ppid = tgid; + event->fork.ptid = tgid; + } + event->fork.pid = tgid; + event->fork.tid = pid; + event->fork.header.type = PERF_RECORD_FORK; + event->fork.header.misc = PERF_RECORD_MISC_FORK_EXEC; + + event->fork.header.size = (sizeof(event->fork) + machine->id_hdr_size); + + if (perf_tool__process_synth_event(tool, event, machine, process) != 0) + return -1; + + return 0; +} + +int perf_event__synthesize_mmap_events(struct perf_tool *tool, + union perf_event *event, + pid_t pid, pid_t tgid, + perf_event__handler_t process, + struct machine *machine, + bool mmap_data) +{ + char filename[PATH_MAX]; + FILE *fp; + unsigned long long t; + bool truncation = false; + unsigned long long timeout = proc_map_timeout * 1000000ULL; + int rc = 0; + const char *hugetlbfs_mnt = hugetlbfs__mountpoint(); + int hugetlbfs_mnt_len = hugetlbfs_mnt ? strlen(hugetlbfs_mnt) : 0; + + if (machine__is_default_guest(machine)) + return 0; + + snprintf(filename, sizeof(filename), "%s/proc/%d/task/%d/maps", + machine->root_dir, pid, pid); + + fp = fopen(filename, "r"); + if (fp == NULL) { + /* + * We raced with a task exiting - just return: + */ + pr_debug("couldn't open %s\n", filename); + return -1; + } + + event->header.type = PERF_RECORD_MMAP2; + t = rdclock(); + + while (1) { + char bf[BUFSIZ]; + char prot[5]; + char execname[PATH_MAX]; + char anonstr[] = "//anon"; + unsigned int ino; + size_t size; + ssize_t n; + + if (fgets(bf, sizeof(bf), fp) == NULL) + break; + + if ((rdclock() - t) > timeout) { + pr_warning("Reading %s time out. " + "You may want to increase " + "the time limit by --proc-map-timeout\n", + filename); + truncation = true; + goto out; + } + + /* ensure null termination since stack will be reused. */ + strcpy(execname, ""); + + /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ + n = sscanf(bf, "%"PRI_lx64"-%"PRI_lx64" %s %"PRI_lx64" %x:%x %u %[^\n]\n", + &event->mmap2.start, &event->mmap2.len, prot, + &event->mmap2.pgoff, &event->mmap2.maj, + &event->mmap2.min, + &ino, execname); + + /* + * Anon maps don't have the execname. + */ + if (n < 7) + continue; + + event->mmap2.ino = (u64)ino; + + /* + * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c + */ + if (machine__is_host(machine)) + event->header.misc = PERF_RECORD_MISC_USER; + else + event->header.misc = PERF_RECORD_MISC_GUEST_USER; + + /* map protection and flags bits */ + event->mmap2.prot = 0; + event->mmap2.flags = 0; + if (prot[0] == 'r') + event->mmap2.prot |= PROT_READ; + if (prot[1] == 'w') + event->mmap2.prot |= PROT_WRITE; + if (prot[2] == 'x') + event->mmap2.prot |= PROT_EXEC; + + if (prot[3] == 's') + event->mmap2.flags |= MAP_SHARED; + else + event->mmap2.flags |= MAP_PRIVATE; + + if (prot[2] != 'x') { + if (!mmap_data || prot[0] != 'r') + continue; + + event->header.misc |= PERF_RECORD_MISC_MMAP_DATA; + } + +out: + if (truncation) + event->header.misc |= PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT; + + if (!strcmp(execname, "")) + strcpy(execname, anonstr); + + if (hugetlbfs_mnt_len && + !strncmp(execname, hugetlbfs_mnt, hugetlbfs_mnt_len)) { + strcpy(execname, anonstr); + event->mmap2.flags |= MAP_HUGETLB; + } + + size = strlen(execname) + 1; + memcpy(event->mmap2.filename, execname, size); + size = PERF_ALIGN(size, sizeof(u64)); + event->mmap2.len -= event->mmap.start; + event->mmap2.header.size = (sizeof(event->mmap2) - + (sizeof(event->mmap2.filename) - size)); + memset(event->mmap2.filename + size, 0, machine->id_hdr_size); + event->mmap2.header.size += machine->id_hdr_size; + event->mmap2.pid = tgid; + event->mmap2.tid = pid; + + if (perf_tool__process_synth_event(tool, event, machine, process) != 0) { + rc = -1; + break; + } + + if (truncation) + break; + } + + fclose(fp); + return rc; +} + +int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, + struct machine *machine) +{ + int rc = 0; + struct map *pos; + struct maps *maps = machine__kernel_maps(machine); + union perf_event *event = zalloc((sizeof(event->mmap) + + machine->id_hdr_size)); + if (event == NULL) { + pr_debug("Not enough memory synthesizing mmap event " + "for kernel modules\n"); + return -1; + } + + event->header.type = PERF_RECORD_MMAP; + + /* + * kernel uses 0 for user space maps, see kernel/perf_event.c + * __perf_event_mmap + */ + if (machine__is_host(machine)) + event->header.misc = PERF_RECORD_MISC_KERNEL; + else + event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; + + for (pos = maps__first(maps); pos; pos = map__next(pos)) { + size_t size; + + if (!__map__is_kmodule(pos)) + continue; + + size = PERF_ALIGN(pos->dso->long_name_len + 1, sizeof(u64)); + event->mmap.header.type = PERF_RECORD_MMAP; + event->mmap.header.size = (sizeof(event->mmap) - + (sizeof(event->mmap.filename) - size)); + memset(event->mmap.filename + size, 0, machine->id_hdr_size); + event->mmap.header.size += machine->id_hdr_size; + event->mmap.start = pos->start; + event->mmap.len = pos->end - pos->start; + event->mmap.pid = machine->pid; + + memcpy(event->mmap.filename, pos->dso->long_name, + pos->dso->long_name_len + 1); + if (perf_tool__process_synth_event(tool, event, machine, process) != 0) { + rc = -1; + break; + } + } + + free(event); + return rc; +} + +static int __event__synthesize_thread(union perf_event *comm_event, + union perf_event *mmap_event, + union perf_event *fork_event, + union perf_event *namespaces_event, + pid_t pid, int full, perf_event__handler_t process, + struct perf_tool *tool, struct machine *machine, bool mmap_data) +{ + char filename[PATH_MAX]; + DIR *tasks; + struct dirent *dirent; + pid_t tgid, ppid; + int rc = 0; + + /* special case: only send one comm event using passed in pid */ + if (!full) { + tgid = perf_event__synthesize_comm(tool, comm_event, pid, + process, machine); + + if (tgid == -1) + return -1; + + if (perf_event__synthesize_namespaces(tool, namespaces_event, pid, + tgid, process, machine) < 0) + return -1; + + /* + * send mmap only for thread group leader + * see thread__init_map_groups + */ + if (pid == tgid && + perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid, + process, machine, mmap_data)) + return -1; + + return 0; + } + + if (machine__is_default_guest(machine)) + return 0; + + snprintf(filename, sizeof(filename), "%s/proc/%d/task", + machine->root_dir, pid); + + tasks = opendir(filename); + if (tasks == NULL) { + pr_debug("couldn't open %s\n", filename); + return 0; + } + + while ((dirent = readdir(tasks)) != NULL) { + char *end; + pid_t _pid; + + _pid = strtol(dirent->d_name, &end, 10); + if (*end) + continue; + + rc = -1; + if (perf_event__prepare_comm(comm_event, _pid, machine, + &tgid, &ppid) != 0) + break; + + if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid, + ppid, process, machine) < 0) + break; + + if (perf_event__synthesize_namespaces(tool, namespaces_event, _pid, + tgid, process, machine) < 0) + break; + + /* + * Send the prepared comm event + */ + if (perf_tool__process_synth_event(tool, comm_event, machine, process) != 0) + break; + + rc = 0; + if (_pid == pid) { + /* process the parent's maps too */ + rc = perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid, + process, machine, mmap_data); + if (rc) + break; + } + } + + closedir(tasks); + return rc; +} + +int perf_event__synthesize_thread_map(struct perf_tool *tool, + struct perf_thread_map *threads, + perf_event__handler_t process, + struct machine *machine, + bool mmap_data) +{ + union perf_event *comm_event, *mmap_event, *fork_event; + union perf_event *namespaces_event; + int err = -1, thread, j; + + comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); + if (comm_event == NULL) + goto out; + + mmap_event = malloc(sizeof(mmap_event->mmap2) + machine->id_hdr_size); + if (mmap_event == NULL) + goto out_free_comm; + + fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size); + if (fork_event == NULL) + goto out_free_mmap; + + namespaces_event = malloc(sizeof(namespaces_event->namespaces) + + (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + + machine->id_hdr_size); + if (namespaces_event == NULL) + goto out_free_fork; + + err = 0; + for (thread = 0; thread < threads->nr; ++thread) { + if (__event__synthesize_thread(comm_event, mmap_event, + fork_event, namespaces_event, + perf_thread_map__pid(threads, thread), 0, + process, tool, machine, + mmap_data)) { + err = -1; + break; + } + + /* + * comm.pid is set to thread group id by + * perf_event__synthesize_comm + */ + if ((int) comm_event->comm.pid != perf_thread_map__pid(threads, thread)) { + bool need_leader = true; + + /* is thread group leader in thread_map? */ + for (j = 0; j < threads->nr; ++j) { + if ((int) comm_event->comm.pid == perf_thread_map__pid(threads, j)) { + need_leader = false; + break; + } + } + + /* if not, generate events for it */ + if (need_leader && + __event__synthesize_thread(comm_event, mmap_event, + fork_event, namespaces_event, + comm_event->comm.pid, 0, + process, tool, machine, + mmap_data)) { + err = -1; + break; + } + } + } + free(namespaces_event); +out_free_fork: + free(fork_event); +out_free_mmap: + free(mmap_event); +out_free_comm: + free(comm_event); +out: + return err; +} + +static int __perf_event__synthesize_threads(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine, + bool mmap_data, + struct dirent **dirent, + int start, + int num) +{ + union perf_event *comm_event, *mmap_event, *fork_event; + union perf_event *namespaces_event; + int err = -1; + char *end; + pid_t pid; + int i; + + comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); + if (comm_event == NULL) + goto out; + + mmap_event = malloc(sizeof(mmap_event->mmap2) + machine->id_hdr_size); + if (mmap_event == NULL) + goto out_free_comm; + + fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size); + if (fork_event == NULL) + goto out_free_mmap; + + namespaces_event = malloc(sizeof(namespaces_event->namespaces) + + (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) + + machine->id_hdr_size); + if (namespaces_event == NULL) + goto out_free_fork; + + for (i = start; i < start + num; i++) { + if (!isdigit(dirent[i]->d_name[0])) + continue; + + pid = (pid_t)strtol(dirent[i]->d_name, &end, 10); + /* only interested in proper numerical dirents */ + if (*end) + continue; + /* + * We may race with exiting thread, so don't stop just because + * one thread couldn't be synthesized. + */ + __event__synthesize_thread(comm_event, mmap_event, fork_event, + namespaces_event, pid, 1, process, + tool, machine, mmap_data); + } + err = 0; + + free(namespaces_event); +out_free_fork: + free(fork_event); +out_free_mmap: + free(mmap_event); +out_free_comm: + free(comm_event); +out: + return err; +} + +struct synthesize_threads_arg { + struct perf_tool *tool; + perf_event__handler_t process; + struct machine *machine; + bool mmap_data; + struct dirent **dirent; + int num; + int start; +}; + +static void *synthesize_threads_worker(void *arg) +{ + struct synthesize_threads_arg *args = arg; + + __perf_event__synthesize_threads(args->tool, args->process, + args->machine, args->mmap_data, + args->dirent, + args->start, args->num); + return NULL; +} + +int perf_event__synthesize_threads(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine, + bool mmap_data, + unsigned int nr_threads_synthesize) +{ + struct synthesize_threads_arg *args = NULL; + pthread_t *synthesize_threads = NULL; + char proc_path[PATH_MAX]; + struct dirent **dirent; + int num_per_thread; + int m, n, i, j; + int thread_nr; + int base = 0; + int err = -1; + + + if (machine__is_default_guest(machine)) + return 0; + + snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir); + n = scandir(proc_path, &dirent, 0, alphasort); + if (n < 0) + return err; + + if (nr_threads_synthesize == UINT_MAX) + thread_nr = sysconf(_SC_NPROCESSORS_ONLN); + else + thread_nr = nr_threads_synthesize; + + if (thread_nr <= 1) { + err = __perf_event__synthesize_threads(tool, process, + machine, mmap_data, + dirent, base, n); + goto free_dirent; + } + if (thread_nr > n) + thread_nr = n; + + synthesize_threads = calloc(sizeof(pthread_t), thread_nr); + if (synthesize_threads == NULL) + goto free_dirent; + + args = calloc(sizeof(*args), thread_nr); + if (args == NULL) + goto free_threads; + + num_per_thread = n / thread_nr; + m = n % thread_nr; + for (i = 0; i < thread_nr; i++) { + args[i].tool = tool; + args[i].process = process; + args[i].machine = machine; + args[i].mmap_data = mmap_data; + args[i].dirent = dirent; + } + for (i = 0; i < m; i++) { + args[i].num = num_per_thread + 1; + args[i].start = i * args[i].num; + } + if (i != 0) + base = args[i-1].start + args[i-1].num; + for (j = i; j < thread_nr; j++) { + args[j].num = num_per_thread; + args[j].start = base + (j - i) * args[i].num; + } + + for (i = 0; i < thread_nr; i++) { + if (pthread_create(&synthesize_threads[i], NULL, + synthesize_threads_worker, &args[i])) + goto out_join; + } + err = 0; +out_join: + for (i = 0; i < thread_nr; i++) + pthread_join(synthesize_threads[i], NULL); + free(args); +free_threads: + free(synthesize_threads); +free_dirent: + for (i = 0; i < n; i++) + zfree(&dirent[i]); + free(dirent); + + return err; +} + +int __weak perf_event__synthesize_extra_kmaps(struct perf_tool *tool __maybe_unused, + perf_event__handler_t process __maybe_unused, + struct machine *machine __maybe_unused) +{ + return 0; +} + +static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine) +{ + size_t size; + struct map *map = machine__kernel_map(machine); + struct kmap *kmap; + int err; + union perf_event *event; + + if (map == NULL) + return -1; + + kmap = map__kmap(map); + if (!kmap->ref_reloc_sym) + return -1; + + /* + * We should get this from /sys/kernel/sections/.text, but till that is + * available use this, and after it is use this as a fallback for older + * kernels. + */ + event = zalloc((sizeof(event->mmap) + machine->id_hdr_size)); + if (event == NULL) { + pr_debug("Not enough memory synthesizing mmap event " + "for kernel modules\n"); + return -1; + } + + if (machine__is_host(machine)) { + /* + * kernel uses PERF_RECORD_MISC_USER for user space maps, + * see kernel/perf_event.c __perf_event_mmap + */ + event->header.misc = PERF_RECORD_MISC_KERNEL; + } else { + event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; + } + + size = snprintf(event->mmap.filename, sizeof(event->mmap.filename), + "%s%s", machine->mmap_name, kmap->ref_reloc_sym->name) + 1; + size = PERF_ALIGN(size, sizeof(u64)); + event->mmap.header.type = PERF_RECORD_MMAP; + event->mmap.header.size = (sizeof(event->mmap) - + (sizeof(event->mmap.filename) - size) + machine->id_hdr_size); + event->mmap.pgoff = kmap->ref_reloc_sym->addr; + event->mmap.start = map->start; + event->mmap.len = map->end - event->mmap.start; + event->mmap.pid = machine->pid; + + err = perf_tool__process_synth_event(tool, event, machine, process); + free(event); + + return err; +} + +int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine) +{ + int err; + + err = __perf_event__synthesize_kernel_mmap(tool, process, machine); + if (err < 0) + return err; + + return perf_event__synthesize_extra_kmaps(tool, process, machine); +} + +int perf_event__synthesize_thread_map2(struct perf_tool *tool, + struct perf_thread_map *threads, + perf_event__handler_t process, + struct machine *machine) +{ + union perf_event *event; + int i, err, size; + + size = sizeof(event->thread_map); + size += threads->nr * sizeof(event->thread_map.entries[0]); + + event = zalloc(size); + if (!event) + return -ENOMEM; + + event->header.type = PERF_RECORD_THREAD_MAP; + event->header.size = size; + event->thread_map.nr = threads->nr; + + for (i = 0; i < threads->nr; i++) { + struct perf_record_thread_map_entry *entry = &event->thread_map.entries[i]; + char *comm = perf_thread_map__comm(threads, i); + + if (!comm) + comm = (char *) ""; + + entry->pid = perf_thread_map__pid(threads, i); + strncpy((char *) &entry->comm, comm, sizeof(entry->comm)); + } + + err = process(tool, event, NULL, machine); + + free(event); + return err; +} + +static void synthesize_cpus(struct cpu_map_entries *cpus, + struct perf_cpu_map *map) +{ + int i; + + cpus->nr = map->nr; + + for (i = 0; i < map->nr; i++) + cpus->cpu[i] = map->map[i]; +} + +static void synthesize_mask(struct perf_record_record_cpu_map *mask, + struct perf_cpu_map *map, int max) +{ + int i; + + mask->nr = BITS_TO_LONGS(max); + mask->long_size = sizeof(long); + + for (i = 0; i < map->nr; i++) + set_bit(map->map[i], mask->mask); +} + +static size_t cpus_size(struct perf_cpu_map *map) +{ + return sizeof(struct cpu_map_entries) + map->nr * sizeof(u16); +} + +static size_t mask_size(struct perf_cpu_map *map, int *max) +{ + int i; + + *max = 0; + + for (i = 0; i < map->nr; i++) { + /* bit possition of the cpu is + 1 */ + int bit = map->map[i] + 1; + + if (bit > *max) + *max = bit; + } + + return sizeof(struct perf_record_record_cpu_map) + BITS_TO_LONGS(*max) * sizeof(long); +} + +void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int *max) +{ + size_t size_cpus, size_mask; + bool is_dummy = perf_cpu_map__empty(map); + + /* + * Both array and mask data have variable size based + * on the number of cpus and their actual values. + * The size of the 'struct perf_record_cpu_map_data' is: + * + * array = size of 'struct cpu_map_entries' + + * number of cpus * sizeof(u64) + * + * mask = size of 'struct perf_record_record_cpu_map' + + * maximum cpu bit converted to size of longs + * + * and finaly + the size of 'struct perf_record_cpu_map_data'. + */ + size_cpus = cpus_size(map); + size_mask = mask_size(map, max); + + if (is_dummy || (size_cpus < size_mask)) { + *size += size_cpus; + *type = PERF_CPU_MAP__CPUS; + } else { + *size += size_mask; + *type = PERF_CPU_MAP__MASK; + } + + *size += sizeof(struct perf_record_cpu_map_data); + *size = PERF_ALIGN(*size, sizeof(u64)); + return zalloc(*size); +} + +void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct perf_cpu_map *map, + u16 type, int max) +{ + data->type = type; + + switch (type) { + case PERF_CPU_MAP__CPUS: + synthesize_cpus((struct cpu_map_entries *) data->data, map); + break; + case PERF_CPU_MAP__MASK: + synthesize_mask((struct perf_record_record_cpu_map *)data->data, map, max); + default: + break; + }; +} + +static struct perf_record_cpu_map *cpu_map_event__new(struct perf_cpu_map *map) +{ + size_t size = sizeof(struct perf_record_cpu_map); + struct perf_record_cpu_map *event; + int max; + u16 type; + + event = cpu_map_data__alloc(map, &size, &type, &max); + if (!event) + return NULL; + + event->header.type = PERF_RECORD_CPU_MAP; + event->header.size = size; + event->data.type = type; + + cpu_map_data__synthesize(&event->data, map, type, max); + return event; +} + +int perf_event__synthesize_cpu_map(struct perf_tool *tool, + struct perf_cpu_map *map, + perf_event__handler_t process, + struct machine *machine) +{ + struct perf_record_cpu_map *event; + int err; + + event = cpu_map_event__new(map); + if (!event) + return -ENOMEM; + + err = process(tool, (union perf_event *) event, NULL, machine); + + free(event); + return err; +} + +int perf_event__synthesize_stat_config(struct perf_tool *tool, + struct perf_stat_config *config, + perf_event__handler_t process, + struct machine *machine) +{ + struct perf_record_stat_config *event; + int size, i = 0, err; + + size = sizeof(*event); + size += (PERF_STAT_CONFIG_TERM__MAX * sizeof(event->data[0])); + + event = zalloc(size); + if (!event) + return -ENOMEM; + + event->header.type = PERF_RECORD_STAT_CONFIG; + event->header.size = size; + event->nr = PERF_STAT_CONFIG_TERM__MAX; + +#define ADD(__term, __val) \ + event->data[i].tag = PERF_STAT_CONFIG_TERM__##__term; \ + event->data[i].val = __val; \ + i++; + + ADD(AGGR_MODE, config->aggr_mode) + ADD(INTERVAL, config->interval) + ADD(SCALE, config->scale) + + WARN_ONCE(i != PERF_STAT_CONFIG_TERM__MAX, + "stat config terms unbalanced\n"); +#undef ADD + + err = process(tool, (union perf_event *) event, NULL, machine); + + free(event); + return err; +} + +int perf_event__synthesize_stat(struct perf_tool *tool, + u32 cpu, u32 thread, u64 id, + struct perf_counts_values *count, + perf_event__handler_t process, + struct machine *machine) +{ + struct perf_record_stat event; + + event.header.type = PERF_RECORD_STAT; + event.header.size = sizeof(event); + event.header.misc = 0; + + event.id = id; + event.cpu = cpu; + event.thread = thread; + event.val = count->val; + event.ena = count->ena; + event.run = count->run; + + return process(tool, (union perf_event *) &event, NULL, machine); +} + +int perf_event__synthesize_stat_round(struct perf_tool *tool, + u64 evtime, u64 type, + perf_event__handler_t process, + struct machine *machine) +{ + struct perf_record_stat_round event; + + event.header.type = PERF_RECORD_STAT_ROUND; + event.header.size = sizeof(event); + event.header.misc = 0; + + event.time = evtime; + event.type = type; + + return process(tool, (union perf_event *) &event, NULL, machine); +} + +size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, u64 read_format) +{ + size_t sz, result = sizeof(struct perf_record_sample); + + if (type & PERF_SAMPLE_IDENTIFIER) + result += sizeof(u64); + + if (type & PERF_SAMPLE_IP) + result += sizeof(u64); + + if (type & PERF_SAMPLE_TID) + result += sizeof(u64); + + if (type & PERF_SAMPLE_TIME) + result += sizeof(u64); + + if (type & PERF_SAMPLE_ADDR) + result += sizeof(u64); + + if (type & PERF_SAMPLE_ID) + result += sizeof(u64); + + if (type & PERF_SAMPLE_STREAM_ID) + result += sizeof(u64); + + if (type & PERF_SAMPLE_CPU) + result += sizeof(u64); + + if (type & PERF_SAMPLE_PERIOD) + result += sizeof(u64); + + if (type & PERF_SAMPLE_READ) { + result += sizeof(u64); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + result += sizeof(u64); + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + result += sizeof(u64); + /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */ + if (read_format & PERF_FORMAT_GROUP) { + sz = sample->read.group.nr * + sizeof(struct sample_read_value); + result += sz; + } else { + result += sizeof(u64); + } + } + + if (type & PERF_SAMPLE_CALLCHAIN) { + sz = (sample->callchain->nr + 1) * sizeof(u64); + result += sz; + } + + if (type & PERF_SAMPLE_RAW) { + result += sizeof(u32); + result += sample->raw_size; + } + + if (type & PERF_SAMPLE_BRANCH_STACK) { + sz = sample->branch_stack->nr * sizeof(struct branch_entry); + sz += sizeof(u64); + result += sz; + } + + if (type & PERF_SAMPLE_REGS_USER) { + if (sample->user_regs.abi) { + result += sizeof(u64); + sz = hweight64(sample->user_regs.mask) * sizeof(u64); + result += sz; + } else { + result += sizeof(u64); + } + } + + if (type & PERF_SAMPLE_STACK_USER) { + sz = sample->user_stack.size; + result += sizeof(u64); + if (sz) { + result += sz; + result += sizeof(u64); + } + } + + if (type & PERF_SAMPLE_WEIGHT) + result += sizeof(u64); + + if (type & PERF_SAMPLE_DATA_SRC) + result += sizeof(u64); + + if (type & PERF_SAMPLE_TRANSACTION) + result += sizeof(u64); + + if (type & PERF_SAMPLE_REGS_INTR) { + if (sample->intr_regs.abi) { + result += sizeof(u64); + sz = hweight64(sample->intr_regs.mask) * sizeof(u64); + result += sz; + } else { + result += sizeof(u64); + } + } + + if (type & PERF_SAMPLE_PHYS_ADDR) + result += sizeof(u64); + + return result; +} + +int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_format, + const struct perf_sample *sample) +{ + __u64 *array; + size_t sz; + /* + * used for cross-endian analysis. See git commit 65014ab3 + * for why this goofiness is needed. + */ + union u64_swap u; + + array = event->sample.array; + + if (type & PERF_SAMPLE_IDENTIFIER) { + *array = sample->id; + array++; + } + + if (type & PERF_SAMPLE_IP) { + *array = sample->ip; + array++; + } + + if (type & PERF_SAMPLE_TID) { + u.val32[0] = sample->pid; + u.val32[1] = sample->tid; + *array = u.val64; + array++; + } + + if (type & PERF_SAMPLE_TIME) { + *array = sample->time; + array++; + } + + if (type & PERF_SAMPLE_ADDR) { + *array = sample->addr; + array++; + } + + if (type & PERF_SAMPLE_ID) { + *array = sample->id; + array++; + } + + if (type & PERF_SAMPLE_STREAM_ID) { + *array = sample->stream_id; + array++; + } + + if (type & PERF_SAMPLE_CPU) { + u.val32[0] = sample->cpu; + u.val32[1] = 0; + *array = u.val64; + array++; + } + + if (type & PERF_SAMPLE_PERIOD) { + *array = sample->period; + array++; + } + + if (type & PERF_SAMPLE_READ) { + if (read_format & PERF_FORMAT_GROUP) + *array = sample->read.group.nr; + else + *array = sample->read.one.value; + array++; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + *array = sample->read.time_enabled; + array++; + } + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + *array = sample->read.time_running; + array++; + } + + /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */ + if (read_format & PERF_FORMAT_GROUP) { + sz = sample->read.group.nr * + sizeof(struct sample_read_value); + memcpy(array, sample->read.group.values, sz); + array = (void *)array + sz; + } else { + *array = sample->read.one.id; + array++; + } + } + + if (type & PERF_SAMPLE_CALLCHAIN) { + sz = (sample->callchain->nr + 1) * sizeof(u64); + memcpy(array, sample->callchain, sz); + array = (void *)array + sz; + } + + if (type & PERF_SAMPLE_RAW) { + u.val32[0] = sample->raw_size; + *array = u.val64; + array = (void *)array + sizeof(u32); + + memcpy(array, sample->raw_data, sample->raw_size); + array = (void *)array + sample->raw_size; + } + + if (type & PERF_SAMPLE_BRANCH_STACK) { + sz = sample->branch_stack->nr * sizeof(struct branch_entry); + sz += sizeof(u64); + memcpy(array, sample->branch_stack, sz); + array = (void *)array + sz; + } + + if (type & PERF_SAMPLE_REGS_USER) { + if (sample->user_regs.abi) { + *array++ = sample->user_regs.abi; + sz = hweight64(sample->user_regs.mask) * sizeof(u64); + memcpy(array, sample->user_regs.regs, sz); + array = (void *)array + sz; + } else { + *array++ = 0; + } + } + + if (type & PERF_SAMPLE_STACK_USER) { + sz = sample->user_stack.size; + *array++ = sz; + if (sz) { + memcpy(array, sample->user_stack.data, sz); + array = (void *)array + sz; + *array++ = sz; + } + } + + if (type & PERF_SAMPLE_WEIGHT) { + *array = sample->weight; + array++; + } + + if (type & PERF_SAMPLE_DATA_SRC) { + *array = sample->data_src; + array++; + } + + if (type & PERF_SAMPLE_TRANSACTION) { + *array = sample->transaction; + array++; + } + + if (type & PERF_SAMPLE_REGS_INTR) { + if (sample->intr_regs.abi) { + *array++ = sample->intr_regs.abi; + sz = hweight64(sample->intr_regs.mask) * sizeof(u64); + memcpy(array, sample->intr_regs.regs, sz); + array = (void *)array + sz; + } else { + *array++ = 0; + } + } + + if (type & PERF_SAMPLE_PHYS_ADDR) { + *array = sample->phys_addr; + array++; + } + + return 0; +} + +int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, + struct evlist *evlist, struct machine *machine) +{ + union perf_event *ev; + struct evsel *evsel; + size_t nr = 0, i = 0, sz, max_nr, n; + int err; + + pr_debug2("Synthesizing id index\n"); + + max_nr = (UINT16_MAX - sizeof(struct perf_record_id_index)) / + sizeof(struct id_index_entry); + + evlist__for_each_entry(evlist, evsel) + nr += evsel->ids; + + n = nr > max_nr ? max_nr : nr; + sz = sizeof(struct perf_record_id_index) + n * sizeof(struct id_index_entry); + ev = zalloc(sz); + if (!ev) + return -ENOMEM; + + ev->id_index.header.type = PERF_RECORD_ID_INDEX; + ev->id_index.header.size = sz; + ev->id_index.nr = n; + + evlist__for_each_entry(evlist, evsel) { + u32 j; + + for (j = 0; j < evsel->ids; j++) { + struct id_index_entry *e; + struct perf_sample_id *sid; + + if (i >= n) { + err = process(tool, ev, NULL, machine); + if (err) + goto out_err; + nr -= n; + i = 0; + } + + e = &ev->id_index.entries[i++]; + + e->id = evsel->id[j]; + + sid = perf_evlist__id2sid(evlist, e->id); + if (!sid) { + free(ev); + return -ENOENT; + } + + e->idx = sid->idx; + e->cpu = sid->cpu; + e->tid = sid->tid; + } + } + + sz = sizeof(struct perf_record_id_index) + nr * sizeof(struct id_index_entry); + ev->id_index.header.size = sz; + ev->id_index.nr = nr; + + err = process(tool, ev, NULL, machine); +out_err: + free(ev); + + return err; +} + +int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, + struct target *target, struct perf_thread_map *threads, + perf_event__handler_t process, bool data_mmap, + unsigned int nr_threads_synthesize) +{ + if (target__has_task(target)) + return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap); + else if (target__has_cpu(target)) + return perf_event__synthesize_threads(tool, process, + machine, data_mmap, + nr_threads_synthesize); + /* command specified */ + return 0; +} + +int machine__synthesize_threads(struct machine *machine, struct target *target, + struct perf_thread_map *threads, bool data_mmap, + unsigned int nr_threads_synthesize) +{ + return __machine__synthesize_threads(machine, NULL, target, threads, + perf_event__process, data_mmap, + nr_threads_synthesize); +} + +static struct perf_record_event_update *event_update_event__new(size_t size, u64 type, u64 id) +{ + struct perf_record_event_update *ev; + + size += sizeof(*ev); + size = PERF_ALIGN(size, sizeof(u64)); + + ev = zalloc(size); + if (ev) { + ev->header.type = PERF_RECORD_EVENT_UPDATE; + ev->header.size = (u16)size; + ev->type = type; + ev->id = id; + } + return ev; +} + +int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evsel *evsel, + perf_event__handler_t process) +{ + size_t size = strlen(evsel->unit); + struct perf_record_event_update *ev; + int err; + + ev = event_update_event__new(size + 1, PERF_EVENT_UPDATE__UNIT, evsel->id[0]); + if (ev == NULL) + return -ENOMEM; + + strlcpy(ev->data, evsel->unit, size + 1); + err = process(tool, (union perf_event *)ev, NULL, NULL); + free(ev); + return err; +} + +int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evsel *evsel, + perf_event__handler_t process) +{ + struct perf_record_event_update *ev; + struct perf_record_event_update_scale *ev_data; + int err; + + ev = event_update_event__new(sizeof(*ev_data), PERF_EVENT_UPDATE__SCALE, evsel->id[0]); + if (ev == NULL) + return -ENOMEM; + + ev_data = (struct perf_record_event_update_scale *)ev->data; + ev_data->scale = evsel->scale; + err = process(tool, (union perf_event *)ev, NULL, NULL); + free(ev); + return err; +} + +int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evsel *evsel, + perf_event__handler_t process) +{ + struct perf_record_event_update *ev; + size_t len = strlen(evsel->name); + int err; + + ev = event_update_event__new(len + 1, PERF_EVENT_UPDATE__NAME, evsel->id[0]); + if (ev == NULL) + return -ENOMEM; + + strlcpy(ev->data, evsel->name, len + 1); + err = process(tool, (union perf_event *)ev, NULL, NULL); + free(ev); + return err; +} + +int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, + perf_event__handler_t process) +{ + size_t size = sizeof(struct perf_record_event_update); + struct perf_record_event_update *ev; + int max, err; + u16 type; + + if (!evsel->core.own_cpus) + return 0; + + ev = cpu_map_data__alloc(evsel->core.own_cpus, &size, &type, &max); + if (!ev) + return -ENOMEM; + + ev->header.type = PERF_RECORD_EVENT_UPDATE; + ev->header.size = (u16)size; + ev->type = PERF_EVENT_UPDATE__CPUS; + ev->id = evsel->id[0]; + + cpu_map_data__synthesize((struct perf_record_cpu_map_data *)ev->data, + evsel->core.own_cpus, type, max); + + err = process(tool, (union perf_event *)ev, NULL, NULL); + free(ev); + return err; +} + +int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, + perf_event__handler_t process) +{ + struct evsel *evsel; + int err = 0; + + evlist__for_each_entry(evlist, evsel) { + err = perf_event__synthesize_attr(tool, &evsel->core.attr, evsel->ids, + evsel->id, process); + if (err) { + pr_debug("failed to create perf header attribute\n"); + return err; + } + } + + return err; +} + +static bool has_unit(struct evsel *evsel) +{ + return evsel->unit && *evsel->unit; +} + +static bool has_scale(struct evsel *evsel) +{ + return evsel->scale != 1; +} + +int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evsel_list, + perf_event__handler_t process, bool is_pipe) +{ + struct evsel *evsel; + int err; + + /* + * Synthesize other events stuff not carried within + * attr event - unit, scale, name + */ + evlist__for_each_entry(evsel_list, evsel) { + if (!evsel->supported) + continue; + + /* + * Synthesize unit and scale only if it's defined. + */ + if (has_unit(evsel)) { + err = perf_event__synthesize_event_update_unit(tool, evsel, process); + if (err < 0) { + pr_err("Couldn't synthesize evsel unit.\n"); + return err; + } + } + + if (has_scale(evsel)) { + err = perf_event__synthesize_event_update_scale(tool, evsel, process); + if (err < 0) { + pr_err("Couldn't synthesize evsel evsel.\n"); + return err; + } + } + + if (evsel->core.own_cpus) { + err = perf_event__synthesize_event_update_cpus(tool, evsel, process); + if (err < 0) { + pr_err("Couldn't synthesize evsel cpus.\n"); + return err; + } + } + + /* + * Name is needed only for pipe output, + * perf.data carries event names. + */ + if (is_pipe) { + err = perf_event__synthesize_event_update_name(tool, evsel, process); + if (err < 0) { + pr_err("Couldn't synthesize evsel name.\n"); + return err; + } + } + } + return 0; +} + +int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, + u32 ids, u64 *id, perf_event__handler_t process) +{ + union perf_event *ev; + size_t size; + int err; + + size = sizeof(struct perf_event_attr); + size = PERF_ALIGN(size, sizeof(u64)); + size += sizeof(struct perf_event_header); + size += ids * sizeof(u64); + + ev = zalloc(size); + + if (ev == NULL) + return -ENOMEM; + + ev->attr.attr = *attr; + memcpy(ev->attr.id, id, ids * sizeof(u64)); + + ev->attr.header.type = PERF_RECORD_HEADER_ATTR; + ev->attr.header.size = (u16)size; + + if (ev->attr.header.size == size) + err = process(tool, ev, NULL, NULL); + else + err = -E2BIG; + + free(ev); + + return err; +} + +int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist, + perf_event__handler_t process) +{ + union perf_event ev; + struct tracing_data *tdata; + ssize_t size = 0, aligned_size = 0, padding; + struct feat_fd ff; + + /* + * We are going to store the size of the data followed + * by the data contents. Since the fd descriptor is a pipe, + * we cannot seek back to store the size of the data once + * we know it. Instead we: + * + * - write the tracing data to the temp file + * - get/write the data size to pipe + * - write the tracing data from the temp file + * to the pipe + */ + tdata = tracing_data_get(&evlist->core.entries, fd, true); + if (!tdata) + return -1; + + memset(&ev, 0, sizeof(ev)); + + ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA; + size = tdata->size; + aligned_size = PERF_ALIGN(size, sizeof(u64)); + padding = aligned_size - size; + ev.tracing_data.header.size = sizeof(ev.tracing_data); + ev.tracing_data.size = aligned_size; + + process(tool, &ev, NULL, NULL); + + /* + * The put function will copy all the tracing data + * stored in temp file to the pipe. + */ + tracing_data_put(tdata); + + ff = (struct feat_fd){ .fd = fd }; + if (write_padded(&ff, NULL, 0, padding)) + return -1; + + return aligned_size; +} + +int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc, + perf_event__handler_t process, struct machine *machine) +{ + union perf_event ev; + size_t len; + + if (!pos->hit) + return 0; + + memset(&ev, 0, sizeof(ev)); + + len = pos->long_name_len + 1; + len = PERF_ALIGN(len, NAME_ALIGN); + memcpy(&ev.build_id.build_id, pos->build_id, sizeof(pos->build_id)); + ev.build_id.header.type = PERF_RECORD_HEADER_BUILD_ID; + ev.build_id.header.misc = misc; + ev.build_id.pid = machine->pid; + ev.build_id.header.size = sizeof(ev.build_id) + len; + memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len); + + return process(tool, &ev, NULL, machine); +} + +int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct perf_tool *tool, + struct evlist *evlist, perf_event__handler_t process, bool attrs) +{ + int err; + + if (attrs) { + err = perf_event__synthesize_attrs(tool, evlist, process); + if (err < 0) { + pr_err("Couldn't synthesize attrs.\n"); + return err; + } + } + + err = perf_event__synthesize_extra_attr(tool, evlist, process, attrs); + err = perf_event__synthesize_thread_map2(tool, evlist->core.threads, process, NULL); + if (err < 0) { + pr_err("Couldn't synthesize thread map.\n"); + return err; + } + + err = perf_event__synthesize_cpu_map(tool, evlist->core.cpus, process, NULL); + if (err < 0) { + pr_err("Couldn't synthesize thread map.\n"); + return err; + } + + err = perf_event__synthesize_stat_config(tool, config, process, NULL); + if (err < 0) { + pr_err("Couldn't synthesize config.\n"); + return err; + } + + return 0; +} + +int __weak perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused, + struct perf_tool *tool __maybe_unused, + perf_event__handler_t process __maybe_unused, + struct machine *machine __maybe_unused) +{ + return 0; +} + +extern const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE]; + +int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, + struct evlist *evlist, perf_event__handler_t process) +{ + struct perf_header *header = &session->header; + struct perf_record_header_feature *fe; + struct feat_fd ff; + size_t sz, sz_hdr; + int feat, ret; + + sz_hdr = sizeof(fe->header); + sz = sizeof(union perf_event); + /* get a nice alignment */ + sz = PERF_ALIGN(sz, page_size); + + memset(&ff, 0, sizeof(ff)); + + ff.buf = malloc(sz); + if (!ff.buf) + return -ENOMEM; + + ff.size = sz - sz_hdr; + ff.ph = &session->header; + + for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) { + if (!feat_ops[feat].synthesize) { + pr_debug("No record header feature for header :%d\n", feat); + continue; + } + + ff.offset = sizeof(*fe); + + ret = feat_ops[feat].write(&ff, evlist); + if (ret || ff.offset <= (ssize_t)sizeof(*fe)) { + pr_debug("Error writing feature\n"); + continue; + } + /* ff.buf may have changed due to realloc in do_write() */ + fe = ff.buf; + memset(fe, 0, sizeof(*fe)); + + fe->feat_id = feat; + fe->header.type = PERF_RECORD_HEADER_FEATURE; + fe->header.size = ff.offset; + + ret = process(tool, ff.buf, NULL, NULL); + if (ret) { + free(ff.buf); + return ret; + } + } + + /* Send HEADER_LAST_FEATURE mark. */ + fe = ff.buf; + fe->feat_id = HEADER_LAST_FEATURE; + fe->header.type = PERF_RECORD_HEADER_FEATURE; + fe->header.size = sizeof(*fe); + + ret = process(tool, ff.buf, NULL, NULL); + + free(ff.buf); + return ret; +} From b295c3e39c1383e06ba1db4dd836018502e2ff3a Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 18 Sep 2019 16:34:07 +0300 Subject: [PATCH 0675/2880] tools lib traceevent: Convert remaining %p[fF] users to %p[sS] There are no in-kernel %p[fF] users left. Convert the traceevent tool, too, to align with the kernel. Signed-off-by: Sakari Ailus Cc: Andy Shevchenko Cc: devicetree@vger.kernel.org Cc: Heikki Krogerus Cc: Jiri Olsa Cc: Joe Perches Cc: linux-acpi@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: Namhyung Kim Cc: Petr Mladek Cc: Rafael J. Wysocki Cc: Rob Herring Cc: Steven Rostedt (VMware) Cc: Tzvetomir Stoyanov Link: http://lore.kernel.org/lkml/20190918133419.7969-2-sakari.ailus@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../Documentation/libtraceevent-func_apis.txt | 10 +++++----- tools/lib/traceevent/event-parse.c | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tools/lib/traceevent/Documentation/libtraceevent-func_apis.txt b/tools/lib/traceevent/Documentation/libtraceevent-func_apis.txt index 38bfea30a5f6..f6aca0df2151 100644 --- a/tools/lib/traceevent/Documentation/libtraceevent-func_apis.txt +++ b/tools/lib/traceevent/Documentation/libtraceevent-func_apis.txt @@ -59,12 +59,12 @@ parser context. The _tep_register_function()_ function registers a function name mapped to an address and (optional) module. This mapping is used in case the function tracer -or events have "%pF" or "%pS" parameter in its format string. It is common to -pass in the kallsyms function names with their corresponding addresses with this +or events have "%pS" parameter in its format string. It is common to pass in +the kallsyms function names with their corresponding addresses with this function. The _tep_ argument is the trace event parser context. The _name_ is -the name of the function, the string is copied internally. The _addr_ is -the start address of the function. The _mod_ is the kernel module -the function may be in (NULL for none). +the name of the function, the string is copied internally. The _addr_ is the +start address of the function. The _mod_ is the kernel module the function may +be in (NULL for none). The _tep_register_print_string()_ function registers a string by the address it was stored in the kernel. Some strings internal to the kernel with static diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index bb22238debfe..6f842af4550b 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -4367,10 +4367,20 @@ static struct tep_print_arg *make_bprint_args(char *fmt, void *data, int size, s switch (*ptr) { case 's': case 'S': - case 'f': - case 'F': case 'x': break; + case 'f': + case 'F': + /* + * Pre-5.5 kernels use %pf and + * %pF for printing symbols + * while kernels since 5.5 use + * %pfw for fwnodes. So check + * %p[fF] isn't followed by 'w'. + */ + if (ptr[1] != 'w') + break; + /* fall through */ default: /* * Older kernels do not process @@ -4487,12 +4497,12 @@ get_bprint_format(void *data, int size __maybe_unused, printk = find_printk(tep, addr); if (!printk) { - if (asprintf(&format, "%%pf: (NO FORMAT FOUND at %llx)\n", addr) < 0) + if (asprintf(&format, "%%ps: (NO FORMAT FOUND at %llx)\n", addr) < 0) return NULL; return format; } - if (asprintf(&format, "%s: %s", "%pf", printk->printk) < 0) + if (asprintf(&format, "%s: %s", "%ps", printk->printk) < 0) return NULL; return format; From b63fd11cced17fcb8e133def29001b0f6aaa5e06 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 4 Sep 2019 15:17:37 +0530 Subject: [PATCH 0676/2880] perf stat: Reset previous counts on repeat with interval When using 'perf stat' with repeat and interval option, it shows wrong values for events. The wrong values will be shown for the first interval on the second and subsequent repetitions. Without the fix: # perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5 2.000282489 53 faults 2.000282489 513 sched:sched_switch 4.005478208 3,721 faults 4.005478208 2,666 sched:sched_switch 5.025470933 395 faults 5.025470933 1,307 sched:sched_switch 2.009602825 1,84,46,74,40,73,70,95,47,520 faults <------ 2.009602825 1,84,46,74,40,73,70,95,49,568 sched:sched_switch <------ 4.019612206 4,730 faults 4.019612206 2,746 sched:sched_switch 5.039615484 3,953 faults 5.039615484 1,496 sched:sched_switch 2.000274620 1,84,46,74,40,73,70,95,47,520 faults <------ 2.000274620 1,84,46,74,40,73,70,95,47,520 sched:sched_switch <------ 4.000480342 4,282 faults 4.000480342 2,303 sched:sched_switch 5.000916811 1,322 faults 5.000916811 1,064 sched:sched_switch # prev_raw_counts is allocated when using intervals. This is used when calculating the difference in the counts of events when using interval. The current counts are stored in prev_raw_counts to calculate the differences in the next iteration. On the first interval of the second and subsequent repetitions, prev_raw_counts would be the values stored in the last interval of the previous repetitions, while the current counts will only be for the first interval of the current repetition. Hence there is a possibility of events showing up as big number. Fix this by resetting prev_raw_counts whenever perf stat repeats the command. With the fix: # perf stat -r 3 -I 2000 -e faults -e sched:sched_switch -a sleep 5 2.019349347 2,597 faults 2.019349347 2,753 sched:sched_switch 4.019577372 3,098 faults 4.019577372 2,532 sched:sched_switch 5.019415481 1,879 faults 5.019415481 1,356 sched:sched_switch 2.000178813 8,468 faults 2.000178813 2,254 sched:sched_switch 4.000404621 7,440 faults 4.000404621 1,266 sched:sched_switch 5.040196079 2,458 faults 5.040196079 556 sched:sched_switch 2.000191939 6,870 faults 2.000191939 1,170 sched:sched_switch 4.000414103 541 faults 4.000414103 902 sched:sched_switch 5.000809863 450 faults 5.000809863 364 sched:sched_switch # Committer notes: This was broken since the cset introducing the --interval feature, i.e. --repeat + --interval wasn't tested at that point, add the Fixes tag so that automatic scripts can pick this up. Fixes: 13370a9b5bb8 ("perf stat: Add interval printing") Signed-off-by: Srikar Dronamraju Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Ravi Bangoria Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Stephane Eranian Cc: stable@vger.kernel.org # v3.9+ Link: http://lore.kernel.org/lkml/20190904094738.9558-2-srikar@linux.vnet.ibm.com [ Fixed up conflicts with libperf, i.e. some perf_{evsel,evlist} lost the 'perf' prefix ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 3 +++ tools/perf/util/stat.c | 17 +++++++++++++++++ tools/perf/util/stat.h | 1 + 3 files changed, 21 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index eece3d1e429a..fa4b148ecfca 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1952,6 +1952,9 @@ int cmd_stat(int argc, const char **argv) fprintf(output, "[ perf stat: executing run #%d ... ]\n", run_idx + 1); + if (run_idx != 0) + perf_evlist__reset_prev_raw_counts(evsel_list); + status = run_perf_stat(argc, argv, run_idx); if (forever && status != -1) { print_counters(NULL, argc, argv); diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 06571209cb0b..fcd54342c04c 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -162,6 +162,15 @@ static void perf_evsel__free_prev_raw_counts(struct evsel *evsel) evsel->prev_raw_counts = NULL; } +static void perf_evsel__reset_prev_raw_counts(struct evsel *evsel) +{ + if (evsel->prev_raw_counts) { + evsel->prev_raw_counts->aggr.val = 0; + evsel->prev_raw_counts->aggr.ena = 0; + evsel->prev_raw_counts->aggr.run = 0; + } +} + static int perf_evsel__alloc_stats(struct evsel *evsel, bool alloc_raw) { int ncpus = perf_evsel__nr_cpus(evsel); @@ -212,6 +221,14 @@ void perf_evlist__reset_stats(struct evlist *evlist) } } +void perf_evlist__reset_prev_raw_counts(struct evlist *evlist) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) + perf_evsel__reset_prev_raw_counts(evsel); +} + static void zero_per_pkg(struct evsel *counter) { if (counter->per_pkg_mask) diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 0f9c9f6e2041..edbeb2f63e8d 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -193,6 +193,7 @@ void perf_stat__collect_metric_expr(struct evlist *); int perf_evlist__alloc_stats(struct evlist *evlist, bool alloc_raw); void perf_evlist__free_stats(struct evlist *evlist); void perf_evlist__reset_stats(struct evlist *evlist); +void perf_evlist__reset_prev_raw_counts(struct evlist *evlist); int perf_stat_process_counter(struct perf_stat_config *config, struct evsel *counter); From 443f2d5ba13d65ccfd879460f77941875159d154 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 4 Sep 2019 15:17:38 +0530 Subject: [PATCH 0677/2880] perf stat: Fix a segmentation fault when using repeat forever Observe a segmentation fault when 'perf stat' is asked to repeat forever with the interval option. Without fix: # perf stat -r 0 -I 5000 -e cycles -a sleep 10 # time counts unit events 5.000211692 3,13,89,82,34,157 cycles 10.000380119 1,53,98,52,22,294 cycles 10.040467280 17,16,79,265 cycles Segmentation fault This problem was only observed when we use forever option aka -r 0 and works with limited repeats. Calling print_counter with ts being set to NULL, is not a correct option when interval is set. Hence avoid print_counter(NULL,..) if interval is set. With fix: # perf stat -r 0 -I 5000 -e cycles -a sleep 10 # time counts unit events 5.019866622 3,15,14,43,08,697 cycles 10.039865756 3,15,16,31,95,261 cycles 10.059950628 1,26,05,47,158 cycles 5.009902655 3,14,52,62,33,932 cycles 10.019880228 3,14,52,22,89,154 cycles 10.030543876 66,90,18,333 cycles 5.009848281 3,14,51,98,25,437 cycles 10.029854402 3,15,14,93,04,918 cycles 5.009834177 3,14,51,95,92,316 cycles Committer notes: Did the 'git bisect' to find the cset introducing the problem to add the Fixes tag below, and at that time the problem reproduced as: (gdb) run stat -r0 -I500 sleep 1 Program received signal SIGSEGV, Segmentation fault. print_interval (prefix=prefix@entry=0x7fffffffc8d0 "", ts=ts@entry=0x0) at builtin-stat.c:866 866 sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep); (gdb) bt #0 print_interval (prefix=prefix@entry=0x7fffffffc8d0 "", ts=ts@entry=0x0) at builtin-stat.c:866 #1 0x000000000041860a in print_counters (ts=ts@entry=0x0, argc=argc@entry=2, argv=argv@entry=0x7fffffffd640) at builtin-stat.c:938 #2 0x0000000000419a7f in cmd_stat (argc=2, argv=0x7fffffffd640, prefix=) at builtin-stat.c:1411 #3 0x000000000045c65a in run_builtin (p=p@entry=0x6291b8 , argc=argc@entry=5, argv=argv@entry=0x7fffffffd640) at perf.c:370 #4 0x000000000045c893 in handle_internal_command (argc=5, argv=0x7fffffffd640) at perf.c:429 #5 0x000000000045c8f1 in run_argv (argcp=argcp@entry=0x7fffffffd4ac, argv=argv@entry=0x7fffffffd4a0) at perf.c:473 #6 0x000000000045cac9 in main (argc=, argv=) at perf.c:588 (gdb) Mostly the same as just before this patch: Program received signal SIGSEGV, Segmentation fault. 0x00000000005874a7 in print_interval (config=0xa1f2a0 , evlist=0xbc9b90, prefix=0x7fffffffd1c0 "`", ts=0x0) at util/stat-display.c:964 964 sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, config->csv_sep); (gdb) bt #0 0x00000000005874a7 in print_interval (config=0xa1f2a0 , evlist=0xbc9b90, prefix=0x7fffffffd1c0 "`", ts=0x0) at util/stat-display.c:964 #1 0x0000000000588047 in perf_evlist__print_counters (evlist=0xbc9b90, config=0xa1f2a0 , _target=0xa1f0c0 , ts=0x0, argc=2, argv=0x7fffffffd670) at util/stat-display.c:1172 #2 0x000000000045390f in print_counters (ts=0x0, argc=2, argv=0x7fffffffd670) at builtin-stat.c:656 #3 0x0000000000456bb5 in cmd_stat (argc=2, argv=0x7fffffffd670) at builtin-stat.c:1960 #4 0x00000000004dd2e0 in run_builtin (p=0xa30e00 , argc=5, argv=0x7fffffffd670) at perf.c:310 #5 0x00000000004dd54d in handle_internal_command (argc=5, argv=0x7fffffffd670) at perf.c:362 #6 0x00000000004dd694 in run_argv (argcp=0x7fffffffd4cc, argv=0x7fffffffd4c0) at perf.c:406 #7 0x00000000004dda11 in main (argc=5, argv=0x7fffffffd670) at perf.c:531 (gdb) Fixes: d4f63a4741a8 ("perf stat: Introduce print_counters function") Signed-off-by: Srikar Dronamraju Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Ravi Bangoria Cc: Namhyung Kim Cc: Naveen N. Rao Cc: stable@vger.kernel.org # v4.2+ Link: http://lore.kernel.org/lkml/20190904094738.9558-3-srikar@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index fa4b148ecfca..60cdd383af81 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1956,7 +1956,7 @@ int cmd_stat(int argc, const char **argv) perf_evlist__reset_prev_raw_counts(evsel_list); status = run_perf_stat(argc, argv, run_idx); - if (forever && status != -1) { + if (forever && status != -1 && !interval) { print_counters(NULL, argc, argv); perf_stat__reset_stats(); } From ce095c9ac293d1683f9fedb214811727dff0d508 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Sep 2019 16:21:48 +0100 Subject: [PATCH 0678/2880] perf test: Fix spelling mistake "allos" -> "allocate" There is a spelling mistake in a TEST_ASSERT_VAL message. Fix it. Signed-off-by: Colin King Reviewed-by: Mukesh Ojha Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Link: http://lore.kernel.org/lkml/20190911152148.17031-1-colin.king@canonical.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/event_update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 4bb772e2b73d..0497d900ced2 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -94,7 +94,7 @@ int test__event_update(struct test *test __maybe_unused, int subtest __maybe_unu evsel = perf_evlist__first(evlist); - TEST_ASSERT_VAL("failed to allos ids", + TEST_ASSERT_VAL("failed to allocate ids", !perf_evsel__alloc_id(evsel, 1, 1)); perf_evlist__id_add(evlist, evsel, 0, 0, 123); From 8067b3da970baa12e6045400fdf009673b8dd3c2 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Thu, 18 Jul 2019 23:47:47 +0530 Subject: [PATCH 0679/2880] perf kvm: Move kvm-stat header file from conditional inclusion to common include section Move kvm-stat header file to the common include section, and make the definitions in the header file under the conditional inclusion `#ifdef HAVE_KVM_STAT_SUPPORT`. This helps to define other 'perf kvm' related function prototypes in kvm-stat header file, which may not need kvm-stat support. Signed-off-by: Anju T Sudhakar Reviewed-By: Ravi Bangoria Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20190718181749.30612-1-anju@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kvm.c | 2 +- tools/perf/util/kvm-stat.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index ac6d6e061dc5..2b822be87673 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -21,6 +21,7 @@ #include "util/top.h" #include "util/data.h" #include "util/ordered-events.h" +#include "util/kvm-stat.h" #include "ui/ui.h" #include @@ -59,7 +60,6 @@ static const char *get_filename_for_perf_kvm(void) } #ifdef HAVE_KVM_STAT_SUPPORT -#include "util/kvm-stat.h" void exit_event_get_key(struct evsel *evsel, struct perf_sample *sample, diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index 46913637085b..8fd6ec20662c 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -2,6 +2,8 @@ #ifndef __PERF_KVM_STAT_H #define __PERF_KVM_STAT_H +#ifdef HAVE_KVM_STAT_SUPPORT + #include "tool.h" #include "stat.h" #include "record.h" @@ -144,5 +146,6 @@ extern const int decode_str_len; extern const char *kvm_exit_reason; extern const char *kvm_entry_trace; extern const char *kvm_exit_trace; +#endif /* HAVE_KVM_STAT_SUPPORT */ #endif /* __PERF_KVM_STAT_H */ From 124eb5f82bf9395419b20205c4dcc1b8fcda7f29 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Thu, 18 Jul 2019 23:47:48 +0530 Subject: [PATCH 0680/2880] perf kvm: Add arch neutral function to choose event for perf kvm record 'perf kvm record' uses 'cycles'(if the user did not specify any event) as the default event to profile the guest. This will not provide any proper samples from the guest incase of powerpc architecture, since in powerpc the PMUs are controlled by the guest rather than the host. Patch adds a function to pick an arch specific event for 'perf kvm record', instead of selecting 'cycles' as a default event for all architectures. For powerpc this function checks for any user specified event, and if there isn't any it returns invalid instead of proceeding with 'cycles' event. Signed-off-by: Anju T Sudhakar Reviewed-by: Ravi Bangoria Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20190718181749.30612-2-anju@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/kvm-stat.c | 37 +++++++++++++++++++++++++ tools/perf/builtin-kvm.c | 12 +++++++- tools/perf/util/kvm-stat.h | 1 + 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index f0dbf7b075c8..ec5b771029e4 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -8,6 +8,7 @@ #include "book3s_hv_exits.h" #include "book3s_hcalls.h" +#include #define NR_TPS 4 @@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) return ret; } + +/* + * Incase of powerpc architecture, pmu registers are programmable + * by guest kernel. So monitoring guest via host may not provide + * valid samples. It is better to fail the "perf kvm record" + * with default "cycles" event to monitor guest in powerpc. + * + * Function to parse the arguments and return appropriate values. + */ +int kvm_add_default_arch_event(int *argc, const char **argv) +{ + const char **tmp; + bool event = false; + int i, j = *argc; + + const struct option event_options[] = { + OPT_BOOLEAN('e', "event", &event, NULL), + OPT_END() + }; + + tmp = calloc(j + 1, sizeof(char *)); + if (!tmp) + return -EINVAL; + + for (i = 0; i < j; i++) + tmp[i] = argv[i]; + + parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN); + if (!event) { + free(tmp); + return -EINVAL; + } + + free(tmp); + return 0; +} diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 2b822be87673..6e3e36658900 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1514,11 +1514,21 @@ perf_stat: } #endif /* HAVE_KVM_STAT_SUPPORT */ +int __weak kvm_add_default_arch_event(int *argc __maybe_unused, + const char **argv __maybe_unused) +{ + return 0; +} + static int __cmd_record(const char *file_name, int argc, const char **argv) { - int rec_argc, i = 0, j; + int rec_argc, i = 0, j, ret; const char **rec_argv; + ret = kvm_add_default_arch_event(&argc, argv); + if (ret) + return -EINVAL; + rec_argc = argc + 2; rec_argv = calloc(rec_argc + 1, sizeof(char *)); rec_argv[i++] = strdup("record"); diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index 8fd6ec20662c..6f0fa05b62b6 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -148,4 +148,5 @@ extern const char *kvm_entry_trace; extern const char *kvm_exit_trace; #endif /* HAVE_KVM_STAT_SUPPORT */ +extern int kvm_add_default_arch_event(int *argc, const char **argv); #endif /* __PERF_KVM_STAT_H */ From 2bff2b828502b5e5d5ea5a52643d3542053df03f Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Thu, 18 Jul 2019 23:47:49 +0530 Subject: [PATCH 0681/2880] perf kvm stat: Set 'trace_cycles' as default event for 'perf kvm record' in powerpc Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record' in powerpc. Signed-off-by: Anju T Sudhakar Reviewed-by: Ravi Bangoria Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20190718181749.30612-3-anju@linux.vnet.ibm.com [ Add missing pmu.h header, needed because this patch uses pmu_have_event() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/kvm-stat.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index ec5b771029e4..9cc1c4a9dec4 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -5,6 +5,7 @@ #include "util/debug.h" #include "util/evsel.h" #include "util/evlist.h" +#include "util/pmu.h" #include "book3s_hv_exits.h" #include "book3s_hcalls.h" @@ -177,8 +178,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) /* * Incase of powerpc architecture, pmu registers are programmable * by guest kernel. So monitoring guest via host may not provide - * valid samples. It is better to fail the "perf kvm record" - * with default "cycles" event to monitor guest in powerpc. + * valid samples with default 'cycles' event. It is better to use + * 'trace_imc/trace_cycles' event for guest profiling, since it + * can track the guest instruction pointer in the trace-record. * * Function to parse the arguments and return appropriate values. */ @@ -202,8 +204,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv) parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN); if (!event) { - free(tmp); - return -EINVAL; + if (pmu_have_event("trace_imc", "trace_cycles")) { + argv[j++] = strdup("-e"); + argv[j++] = strdup("trace_imc/trace_cycles/"); + *argc += 2; + } else { + free(tmp); + return -EINVAL; + } } free(tmp); From 7b485d175631be676424aedb8cd2f66d0c93da78 Mon Sep 17 00:00:00 2001 From: "Shih-Yuan Lee (FourDollars)" Date: Fri, 20 Sep 2019 21:40:53 +0800 Subject: [PATCH 0682/2880] ALSA: hda - Add laptop imic fixup for ASUS M9V laptop The same fixup to enable laptop imic is needed for ASUS M9V with AD1986A codec like another HP machine. Signed-off-by: Shih-Yuan Lee (FourDollars) Cc: Link: https://lore.kernel.org/r/20190920134052.GA8035@localhost Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_analog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c index e283966bdbb1..bc9dd8e6fd86 100644 --- a/sound/pci/hda/patch_analog.c +++ b/sound/pci/hda/patch_analog.c @@ -357,6 +357,7 @@ static const struct hda_fixup ad1986a_fixups[] = { static const struct snd_pci_quirk ad1986a_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x30af, "HP B2800", AD1986A_FIXUP_LAPTOP_IMIC), + SND_PCI_QUIRK(0x1043, 0x1153, "ASUS M9V", AD1986A_FIXUP_LAPTOP_IMIC), SND_PCI_QUIRK(0x1043, 0x1443, "ASUS Z99He", AD1986A_FIXUP_EAPD), SND_PCI_QUIRK(0x1043, 0x1447, "ASUS A8JN", AD1986A_FIXUP_EAPD), SND_PCI_QUIRK_MASK(0x1043, 0xff00, 0x8100, "ASUS P5", AD1986A_FIXUP_3STACK), From f110d252ae7985def3cd0fd8b03979180a8633a0 Mon Sep 17 00:00:00 2001 From: Srikanth Krishnakar Date: Fri, 20 Sep 2019 15:01:56 +0530 Subject: [PATCH 0683/2880] platform/x86: pmc_atom: Add Siemens SIMATIC IPC277E to critclk_systems DMI table The SIMATIC IPC277E uses the PMC clock for on-board components and gets stuck during boot if the clock is disabled. Therefore, add this device to the critical systems list. Tested on SIMATIC IPC277E. Fixes: 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL") Reviewed-by: Jan Kiszka Cc: Cedric Hombourger Signed-off-by: Srikanth Krishnakar Signed-off-by: Andy Shevchenko --- drivers/platform/x86/pmc_atom.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/platform/x86/pmc_atom.c b/drivers/platform/x86/pmc_atom.c index 9aca5e7ce6d0..07d1b911e72f 100644 --- a/drivers/platform/x86/pmc_atom.c +++ b/drivers/platform/x86/pmc_atom.c @@ -422,6 +422,13 @@ static const struct dmi_system_id critclk_systems[] = { DMI_MATCH(DMI_PRODUCT_VERSION, "6ES7647-8B"), }, }, + { + .ident = "SIMATIC IPC277E", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "SIEMENS AG"), + DMI_MATCH(DMI_PRODUCT_VERSION, "6AV7882-0"), + }, + }, { /*sentinel*/ } }; From 24a8d78a9affb63e5ced313ccde6888fe96edc6e Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Fri, 20 Sep 2019 13:02:33 +0300 Subject: [PATCH 0684/2880] platform/x86: i2c-multi-instantiate: Derive the device name from parent When naming the new devices, instead of using the ACPI ID in the name as base, using the parent device's name. That makes it possible to support multiple multi-instance i2c devices of the same type in the same system. This fixes an issue seen on some Intel Kaby Lake based boards: sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:15.0/i2c_designware.0/i2c-0/i2c-INT3515-tps6598x.0' Fixes: 2336dfadfb1e ("platform/x86: i2c-multi-instantiate: Allow to have same slaves") Cc: stable@vger.kernel.org Signed-off-by: Heikki Krogerus Reviewed-by: Hans de Goede Signed-off-by: Andy Shevchenko --- drivers/platform/x86/i2c-multi-instantiate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/i2c-multi-instantiate.c b/drivers/platform/x86/i2c-multi-instantiate.c index 61fe341a85aa..ea68f6ed66ae 100644 --- a/drivers/platform/x86/i2c-multi-instantiate.c +++ b/drivers/platform/x86/i2c-multi-instantiate.c @@ -90,7 +90,7 @@ static int i2c_multi_inst_probe(struct platform_device *pdev) for (i = 0; i < multi->num_clients && inst_data[i].type; i++) { memset(&board_info, 0, sizeof(board_info)); strlcpy(board_info.type, inst_data[i].type, I2C_NAME_SIZE); - snprintf(name, sizeof(name), "%s-%s.%d", match->id, + snprintf(name, sizeof(name), "%s-%s.%d", dev_name(dev), inst_data[i].type, i); board_info.dev_name = name; switch (inst_data[i].flags & IRQ_RESOURCE_TYPE) { From b47613da3b71ca59cc6924bad2d74974f86fab92 Mon Sep 17 00:00:00 2001 From: Xiang Wang Date: Fri, 6 Sep 2019 11:56:09 +0800 Subject: [PATCH 0685/2880] arch/riscv: disable excess harts before picking main boot hart Harts with id greater than or equal to CONFIG_NR_CPUS need to be disabled. But the kernel can pick any hart as the main hart. So, before picking the main hart, the kernel must disable harts with ids greater than or equal to CONFIG_NR_CPUS. Signed-off-by: Xiang Wang Reviewed-by: Palmer Dabbelt Reviewed-by: Anup Patel [paul.walmsley@sifive.com: updated to apply; cleaned up patch description] Signed-off-by: Paul Walmsley --- arch/riscv/kernel/head.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 15a9189f91ad..72f89b7590dd 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -63,6 +63,11 @@ _start_kernel: li t0, SR_FS csrc CSR_SSTATUS, t0 +#ifdef CONFIG_SMP + li t0, CONFIG_NR_CPUS + bgeu a0, t0, .Lsecondary_park +#endif + /* Pick one hart to run the main boot sequence */ la a3, hart_lottery li a2, 1 @@ -154,9 +159,6 @@ relocate: .Lsecondary_start: #ifdef CONFIG_SMP - li a1, CONFIG_NR_CPUS - bgeu a0, a1, .Lsecondary_park - /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park csrw CSR_STVEC, a3 From dee04eee9182dae91801d0db5bb2acfd5365a749 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 4 Sep 2019 16:13:26 +0000 Subject: [PATCH 0686/2880] KVM: RISC-V: Add KVM_REG_RISCV for ONE_REG interface We will be using ONE_REG interface accessing VCPU registers from user-space hence we add KVM_REG_RISCV for RISC-V VCPU registers. Signed-off-by: Anup Patel Acked-by: Paolo Bonzini Reviewed-by: Paolo Bonzini Reviewed-by: Alexander Graf Signed-off-by: Paul Walmsley --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 233efbb1c81c..18a2b43097f8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1145,6 +1145,7 @@ struct kvm_dirty_tlb { #define KVM_REG_S390 0x5000000000000000ULL #define KVM_REG_ARM64 0x6000000000000000ULL #define KVM_REG_MIPS 0x7000000000000000ULL +#define KVM_REG_RISCV 0x8000000000000000ULL #define KVM_REG_SIZE_SHIFT 52 #define KVM_REG_SIZE_MASK 0x00f0000000000000ULL From d3d7a0ce020e2d14967159b5351158c80b681760 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 4 Sep 2019 16:14:06 +0000 Subject: [PATCH 0687/2880] RISC-V: Export kernel symbols for kvm Export a few symbols used by kvm module. Without this, kvm cannot be compiled as a module. Signed-off-by: Atish Patra Signed-off-by: Anup Patel Acked-by: Paolo Bonzini Reviewed-by: Paolo Bonzini Reviewed-by: Alexander Graf [paul.walmsley@sifive.com: updated to apply; clarified short patch description] Signed-off-by: Paul Walmsley --- arch/riscv/kernel/smp.c | 1 + arch/riscv/kernel/time.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 3836760d7aaf..b18cd6c8e8fb 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -206,3 +206,4 @@ void smp_send_reschedule(int cpu) { send_ipi_single(cpu, IPI_RESCHEDULE); } +EXPORT_SYMBOL_GPL(smp_send_reschedule); diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 541a2b885814..9dd1f2e64db1 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -9,6 +9,7 @@ #include unsigned long riscv_timebase; +EXPORT_SYMBOL_GPL(riscv_timebase); void __init time_init(void) { From 3bcca2a5a933e05db628ba731567de86ba7ed372 Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Thu, 5 Sep 2019 05:46:14 -0700 Subject: [PATCH 0688/2880] riscv: dts: sifive: Add ethernet0 to the aliases node U-Boot expects this alias to be in place in order to fix up the mac address of the ethernet node. Signed-off-by: Bin Meng Reviewed-by: Christoph Hellwig Signed-off-by: Paul Walmsley --- arch/riscv/boot/dts/sifive/fu540-c000.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index 5a29211d396e..ae5c42d6943a 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -13,6 +13,7 @@ aliases { serial0 = &uart0; serial1 = &uart1; + ethernet0 = ð0; }; chosen { From c81007116bd23e9e2103c267184dc38d3acc1099 Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Thu, 5 Sep 2019 05:45:53 -0700 Subject: [PATCH 0689/2880] riscv: dts: sifive: Drop "clock-frequency" property of cpu nodes The "clock-frequency" property of cpu nodes isn't required. Drop it. Signed-off-by: Bin Meng Reviewed-by: Christoph Hellwig Signed-off-by: Paul Walmsley --- arch/riscv/boot/dts/sifive/fu540-c000.dtsi | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index ae5c42d6943a..afa43c7ea369 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -61,7 +61,6 @@ }; }; cpu2: cpu@2 { - clock-frequency = <0>; compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; d-cache-block-size = <64>; d-cache-sets = <64>; @@ -85,7 +84,6 @@ }; }; cpu3: cpu@3 { - clock-frequency = <0>; compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; d-cache-block-size = <64>; d-cache-sets = <64>; @@ -109,7 +107,6 @@ }; }; cpu4: cpu@4 { - clock-frequency = <0>; compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; d-cache-block-size = <64>; d-cache-sets = <64>; From c82dd6d078a2bb29d41eda032bb96d05699a524d Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 16 Sep 2019 16:47:41 +0800 Subject: [PATCH 0690/2880] riscv: Avoid interrupts being erroneously enabled in handle_exception() When the handle_exception function addresses an exception, the interrupts will be unconditionally enabled after finishing the context save. However, It may erroneously enable the interrupts if the interrupts are disabled before entering the handle_exception. For example, one of the WARN_ON() condition is satisfied in the scheduling where the interrupt is disabled and rq.lock is locked. The WARN_ON will trigger a break exception and the handle_exception function will enable the interrupts before entering do_trap_break function. During the procedure, if a timer interrupt is pending, it will be taken when interrupts are enabled. In this case, it may cause a deadlock problem if the rq.lock is locked again in the timer ISR. Hence, the handle_exception() can only enable interrupts when the state of sstatus.SPIE is 1. This patch is tested on HiFive Unleashed board. Signed-off-by: Vincent Chen Reviewed-by: Palmer Dabbelt [paul.walmsley@sifive.com: updated to apply] Fixes: bcae803a21317 ("RISC-V: Enable IRQ during exception handling") Cc: David Abdurachmanov Cc: stable@vger.kernel.org Signed-off-by: Paul Walmsley --- arch/riscv/kernel/entry.S | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 74ccfd464071..da7aa88113c2 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -166,9 +166,13 @@ ENTRY(handle_exception) move a0, sp /* pt_regs */ tail do_IRQ 1: - /* Exceptions run with interrupts enabled */ + /* Exceptions run with interrupts enabled or disabled + depending on the state of sstatus.SR_SPIE */ + andi t0, s1, SR_SPIE + beqz t0, 1f csrs CSR_SSTATUS, SR_SIE +1: /* Handle syscalls */ li t0, EXC_SYSCALL beq s4, t0, handle_syscall From 7f49fd5d7acd82163685559045d04d49433581cc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Sep 2019 16:33:16 +1000 Subject: [PATCH 0691/2880] nfsd: handle drc over-allocation gracefully. Currently, if there are more clients than allowed for by the space allocation in set_max_drc(), we fail a SESSION_CREATE request with NFS4ERR_DELAY. This means that the client retries indefinitely, which isn't a user-friendly response. The RFC requires NFS4ERR_NOSPC, but that would at best result in a clean failure on the client, which is not much more friendly. The current space allocation is a best-guess and doesn't provide any guarantees, we could still run out of space when trying to allocate drc space. So fail more gracefully - always give out at least one slot. If all clients used all the space in all slots, we might start getting memory pressure, but that is possible anyway. So ensure 'num' is always at least 1, and remove the test for it being zero. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index da1093dd5016..8622d6dd45c0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1575,14 +1575,25 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) unsigned long avail, total_avail; spin_lock(&nfsd_drc_lock); - total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + if (nfsd_drc_max_mem > nfsd_drc_mem_used) + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + else + /* We have handed out more space than we chose in + * set_max_drc() to allow. That isn't really a + * problem as long as that doesn't make us think we + * have lots more due to integer overflow. + */ + total_avail = 0; avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* * Never use more than a third of the remaining memory, - * unless it's the only way to give this client a slot: + * unless it's the only way to give this client a slot. + * Give the client one slot even if that would require + * over-allocation--it is better than failure. */ avail = clamp_t(unsigned long, avail, slotsize, total_avail/3); num = min_t(int, num, avail / slotsize); + num = max_t(int, num, 1); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -3174,10 +3185,10 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs * performance. When short on memory we therefore prefer to * decrease number of slots instead of their size. Clients that * request larger slots than they need will get poor results: + * Note that we always allow at least one slot, because our + * accounting is soft and provides no guarantees either way. */ ca->maxreqs = nfsd4_get_drc_mem(ca); - if (!ca->maxreqs) - return nfserr_jukebox; return nfs_ok; } From 2030ca560c5f24138c1f0f8c8a89f1ac3b560613 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Sep 2019 16:36:45 +1000 Subject: [PATCH 0692/2880] nfsd: degraded slot-count more gracefully as allocation nears exhaustion. This original code in nfsd4_get_drc_mem() would hand out 30 slots (approximately NFSD_MAX_MEM_PER_SESSION bytes at slightly over 2K per slot) to each requesting client until it ran out of space, then it would possibly give one last client a reduced allocation, then fail the allocation. Since commit de766e570413 ("nfsd: give out fewer session slots as limit approaches") the last 90 slots to be given to about 12 clients with quickly reducing slot counts (better than just 3 clients). This still seems unnecessarily hasty. A subsequent patch allows over-allocation so every client gets at least one slot, but that might be a bit restrictive. The requested number of nfsd threads is the best guide we have to the expected number of clients, so use that - if it is at least 8. 256 threads on a 256Meg machine - which is a lot for a tiny machine - would result in nfsd_drc_max_mem being 2Meg, so 8K (3 slots) would be available for the first client, and over 200 clients would get more than 1 slot. So I don't think this change will be too debilitating on poorly configured machines, though it does mean that a sensible configuration is a little more important. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8622d6dd45c0..c65aeaa812d4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1568,11 +1568,12 @@ static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) * re-negotiate active sessions and reduce their slot usage to make * room for new connections. For now we just fail the create session. */ -static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) +static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; unsigned long avail, total_avail; + unsigned int scale_factor; spin_lock(&nfsd_drc_lock); if (nfsd_drc_max_mem > nfsd_drc_mem_used) @@ -1586,12 +1587,18 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) total_avail = 0; avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* - * Never use more than a third of the remaining memory, + * Never use more than a fraction of the remaining memory, * unless it's the only way to give this client a slot. + * The chosen fraction is either 1/8 or 1/number of threads, + * whichever is smaller. This ensures there are adequate + * slots to support multiple clients per thread. * Give the client one slot even if that would require * over-allocation--it is better than failure. */ - avail = clamp_t(unsigned long, avail, slotsize, total_avail/3); + scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads); + + avail = clamp_t(unsigned long, avail, slotsize, + total_avail/scale_factor); num = min_t(int, num, avail / slotsize); num = max_t(int, num, 1); nfsd_drc_mem_used += num * slotsize; @@ -3188,7 +3195,7 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs * Note that we always allow at least one slot, because our * accounting is soft and provides no guarantees either way. */ - ca->maxreqs = nfsd4_get_drc_mem(ca); + ca->maxreqs = nfsd4_get_drc_mem(ca, nn); return nfs_ok; } From b117b9b48b24cbb090dd3844b563b464d60a6278 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Sep 2019 11:09:24 +0200 Subject: [PATCH 0693/2880] perf tests: Fix static build test Disable the potentional shared library features, which breaks static build if they are enabled and detected: jvmti and vdso libraries. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190905090924.GA1949@krava Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 6b3afed5d910..c850d1664c56 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -100,7 +100,7 @@ make_install_info := install-info make_install_pdf := install-pdf make_install_prefix := install prefix=/tmp/krava make_install_prefix_slash := install prefix=/tmp/krava/ -make_static := LDFLAGS=-static +make_static := LDFLAGS=-static NO_PERF_READ_VDSO32=1 NO_PERF_READ_VDSOX32=1 NO_JVMTI=1 # all the NO_* variable combined make_minimal := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_NEWT=1 NO_GTK2=1 From 7b678ccdf5f608c408e1368e525ed191ca456bcf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 20 Sep 2019 14:40:30 -0300 Subject: [PATCH 0694/2880] tools headers uapi: Sync prctl.h with the kernel sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To get the changes in: 63f0c6037965 ("arm64: Introduce prctl() options to control the tagged user addresses ABI") that introduces prctl options that then automagically gets catched by the prctl cmd table generator, and thus supported in the 'perf trace' prctl beautifier for the 'option' argument: $ tools/perf/trace/beauty/prctl_option.sh > after $ diff -u before after --- before 2019-09-20 14:38:41.386720870 -0300 +++ after 2019-09-20 14:40:02.583990802 -0300 @@ -49,6 +49,8 @@ [52] = "GET_SPECULATION_CTRL", [53] = "SET_SPECULATION_CTRL", [54] = "PAC_RESET_KEYS", + [55] = "SET_TAGGED_ADDR_CTRL", + [56] = "GET_TAGGED_ADDR_CTRL", }; static const char *prctl_set_mm_options[] = { [1] = "START_CODE", $ For now just the translation of 55 and 56 to the respecting strings are done, more work needed to allow for filters to be used using strings. This, for instance, already works: # perf record -e syscalls:sys_enter_close --filter="fd==4" # perf script | head -5 gpm 1018 [006] 21327.171436: syscalls:sys_enter_close: fd: 0x00000004 gpm 1018 [006] 21329.171583: syscalls:sys_enter_close: fd: 0x00000004 bash 4882 [002] 21330.785496: syscalls:sys_enter_close: fd: 0x00000004 bash 20672 [001] 21330.785719: syscalls:sys_enter_close: fd: 0x00000004 find 20672 [001] 21330.789082: syscalls:sys_enter_close: fd: 0x00000004 # perf record -e syscalls:sys_enter_close --filter="fd>=4" ^C[ perf record: Woken up 1 times to write data ] # perf script | head -5 gpm 1018 [005] 21401.178501: syscalls:sys_enter_close: fd: 0x00000004 gsd-housekeepin 2287 [006] 21402.225365: syscalls:sys_enter_close: fd: 0x0000000b gsd-housekeepin 2287 [006] 21402.226234: syscalls:sys_enter_close: fd: 0x0000000b gsd-housekeepin 2287 [006] 21402.227255: syscalls:sys_enter_close: fd: 0x0000000b gsd-housekeepin 2287 [006] 21402.228088: syscalls:sys_enter_close: fd: 0x0000000b # Being able to pass something like: # perf record -e syscalls:sys_enter_prctl --filter="option=*TAGGED_ADDR*" Should be easy enough, first using tracepoint filters, then via the augmented_raw_syscalls.c BPF method. This addresses this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/prctl.h' differs from latest version at 'include/uapi/linux/prctl.h' diff -u tools/include/uapi/linux/prctl.h include/uapi/linux/prctl.h Cc: Adrian Hunter Cc: Brendan Gregg Cc: Catalin Marinas Cc: Jiri Olsa Cc: Luis Cláudio Gonçalves Cc: Namhyung Kim Cc: Will Deacon Link: https://lkml.kernel.org/n/tip-y8u8kvflooyo9x0if1g3jska@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/prctl.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 094bb03b9cc2..7da1b37b27aa 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -181,7 +181,7 @@ struct prctl_mm_map { #define PR_GET_THP_DISABLE 42 /* - * Tell the kernel to start/stop helping userspace manage bounds tables. + * No longer implemented, but left here to ensure the numbers stay reserved: */ #define PR_MPX_ENABLE_MANAGEMENT 43 #define PR_MPX_DISABLE_MANAGEMENT 44 @@ -229,4 +229,9 @@ struct prctl_mm_map { # define PR_PAC_APDBKEY (1UL << 3) # define PR_PAC_APGAKEY (1UL << 4) +/* Tagged user address controls for arm64 */ +#define PR_SET_TAGGED_ADDR_CTRL 55 +#define PR_GET_TAGGED_ADDR_CTRL 56 +# define PR_TAGGED_ADDR_ENABLE (1UL << 0) + #endif /* _LINUX_PRCTL_H */ From 9846f1366489e1a30d871a5c3198b14394365be7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 20 Sep 2019 14:52:24 -0300 Subject: [PATCH 0695/2880] tools uapi asm-generic: Sync unistd.h with the kernel sources To pick the change from: 78e05972c5e6 ("ipc: fix semtimedop for generic 32-bit architectures") Which doesn't trigger any change in tooling and silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Cc: Adrian Hunter Cc: Arnd Bergmann Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-hpnjuyjzoudltqe7dvbokqdt@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 1be0e798e362..1fc8faa6e973 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -569,7 +569,7 @@ __SYSCALL(__NR_semget, sys_semget) __SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl) #if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_semtimedop 192 -__SC_COMP(__NR_semtimedop, sys_semtimedop, sys_semtimedop_time32) +__SC_3264(__NR_semtimedop, sys_semtimedop_time32, sys_semtimedop) #endif #define __NR_semop 193 __SYSCALL(__NR_semop, sys_semop) From 761830a03c5c99fc0a7142e16cf0811cb68bd7af Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 20 Sep 2019 14:56:47 -0300 Subject: [PATCH 0696/2880] tools arch x86 uapi: Synch asm/unistd.h with the kernel sources To pick up the change in: 45e29d119e99 ("x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long") That doesn't trigger any changes in tooling and silences this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/unistd.h' differs from latest version at 'arch/x86/include/uapi/asm/unistd.h' diff -u tools/arch/x86/include/uapi/asm/unistd.h arch/x86/include/uapi/asm/unistd.h Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Jiri Olsa Cc: Namhyung Kim Cc: Thomas Gleixner Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/unistd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/arch/x86/include/uapi/asm/unistd.h b/tools/arch/x86/include/uapi/asm/unistd.h index 30d7d04d72d6..196fdd02b8b1 100644 --- a/tools/arch/x86/include/uapi/asm/unistd.h +++ b/tools/arch/x86/include/uapi/asm/unistd.h @@ -3,7 +3,7 @@ #define _UAPI_ASM_X86_UNISTD_H /* x32 syscall flag bit */ -#define __X32_SYSCALL_BIT 0x40000000 +#define __X32_SYSCALL_BIT 0x40000000UL #ifndef __KERNEL__ # ifdef __i386__ From 40f1c039c7c6de913ee04eac585102e2fce7f6f7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 20 Sep 2019 15:00:49 -0300 Subject: [PATCH 0697/2880] tools arch x86: Sync asm/cpufeatures.h with the kernel sources To pick up the changes from: b4dd4f6e3648 ("x86/vmware: Add a header file for hypercall definitions") f36cf386e3fe ("x86/speculation/swapgs: Exclude ATOMs from speculation through SWAPGS") be261ffce6f1 ("x86: Remove X86_FEATURE_MFENCE_RDTSC") 018ebca8bd70 ("x86/cpufeatures: Enable a new AVX512 CPU feature") These don't cause any changes in tooling, just silences this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h To clarify, updating those files cause these bits of tools/perf to rebuild: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o INSTALL GTK UI LD /tmp/build/perf/bench/perf-in.o Those use just: $ grep FEATURE tools/arch/x86/lib/mem*.S tools/arch/x86/lib/memcpy_64.S: ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ tools/arch/x86/lib/memcpy_64.S: "jmp memcpy_erms", X86_FEATURE_ERMS tools/arch/x86/lib/memset_64.S: ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ tools/arch/x86/lib/memset_64.S: "jmp memset_erms", X86_FEATURE_ERMS $ I.e. none of the feature defines added/removed by the patches above. Cc: Adrian Hunter Cc: Borislav Petkov Cc: Gayatri Kammela Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Thomas Hellstrom Link: https://lkml.kernel.org/n/tip-pq63abgknsaeov23p80d8gjv@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 5171b9c7ca3e..0652d3eed9bd 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -231,6 +231,8 @@ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ +#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ +#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -354,6 +356,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ From 0216234c2eed1367a318daeb9f4a97d8217412a0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Sep 2019 12:52:35 +0200 Subject: [PATCH 0698/2880] perf tools: Fix segfault in cpu_cache_level__read() We release wrong pointer on error path in cpu_cache_level__read function, leading to segfault: (gdb) r record ls Starting program: /root/perf/tools/perf/perf record ls ... [ perf record: Woken up 1 times to write data ] double free or corruption (out) Thread 1 "perf" received signal SIGABRT, Aborted. 0x00007ffff7463798 in raise () from /lib64/power9/libc.so.6 (gdb) bt #0 0x00007ffff7463798 in raise () from /lib64/power9/libc.so.6 #1 0x00007ffff7443bac in abort () from /lib64/power9/libc.so.6 #2 0x00007ffff74af8bc in __libc_message () from /lib64/power9/libc.so.6 #3 0x00007ffff74b92b8 in malloc_printerr () from /lib64/power9/libc.so.6 #4 0x00007ffff74bb874 in _int_free () from /lib64/power9/libc.so.6 #5 0x0000000010271260 in __zfree (ptr=0x7fffffffa0b0) at ../../lib/zalloc.. #6 0x0000000010139340 in cpu_cache_level__read (cache=0x7fffffffa090, cac.. #7 0x0000000010143c90 in build_caches (cntp=0x7fffffffa118, size= Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: stable@vger.kernel.org: # v4.6+ Link: http://lore.kernel.org/lkml/20190912105235.10689-1-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 5722ff717777..0167f9697172 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1073,7 +1073,7 @@ static int cpu_cache_level__read(struct cpu_cache_level *cache, u32 cpu, u16 lev scnprintf(file, PATH_MAX, "%s/shared_cpu_list", path); if (sysfs__read_str(file, &cache->map, &len)) { - zfree(&cache->map); + zfree(&cache->size); zfree(&cache->type); return -1; } From 1a375ae7659ab740d4c885ea98c1659b8a6e2071 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 19 Sep 2019 12:41:10 +0900 Subject: [PATCH 0699/2880] perf probe: Skip same probe address for a given line Fix to skip making a same probe address on given line. Since a DWARF line info contains several entries for one line with different column, perf probe will make a different probe on same address if user specifies a probe point by "function:line" or "file:line". e.g. $ perf probe -D kernel_read:8 p:probe/kernel_read_L8 kernel_read+39 p:probe/kernel_read_L8_1 kernel_read+39 This skips such duplicated probe addresses. Committer testing: # uname -a Linux quaco 5.3.0+ #2 SMP Thu Sep 19 16:13:22 -03 2019 x86_64 x86_64 x86_64 GNU/Linux # Before: # perf probe -D kernel_read:8 p:probe/kernel_read _text+3115191 p:probe/kernel_read_1 _text+3115191 # After: # perf probe -D kernel_read:8 p:probe/kernel_read _text+3115191 # Signed-off-by: Masami Hiramatsu Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lore.kernel.org/lkml/156886447061.10772.4261569305869149178.stgit@devnote2 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-finder.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 505905fc21c5..cd9f95e5044e 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1245,6 +1245,17 @@ static int expand_probe_args(Dwarf_Die *sc_die, struct probe_finder *pf, return n; } +static bool trace_event_finder_overlap(struct trace_event_finder *tf) +{ + int i; + + for (i = 0; i < tf->ntevs; i++) { + if (tf->pf.addr == tf->tevs[i].point.address) + return true; + } + return false; +} + /* Add a found probe point into trace event list */ static int add_probe_trace_event(Dwarf_Die *sc_die, struct probe_finder *pf) { @@ -1255,6 +1266,14 @@ static int add_probe_trace_event(Dwarf_Die *sc_die, struct probe_finder *pf) struct perf_probe_arg *args = NULL; int ret, i; + /* + * For some reason (e.g. different column assigned to same address) + * This callback can be called with the address which already passed. + * Ignore it first. + */ + if (trace_event_finder_overlap(tf)) + return 0; + /* Check number of tevs */ if (tf->ntevs == tf->max_tevs) { pr_warning("Too many( > %d) probe point found.\n", From 9e6124d9d635957b56717f85219a88701617253f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 16 Sep 2019 01:44:40 +0900 Subject: [PATCH 0700/2880] perf probe: Fix to clear tev->nargs in clear_probe_trace_event() Since add_probe_trace_event() can reuse tf->tevs[i] after calling clear_probe_trace_event(), this can make perf-probe crash if the 1st attempt of probe event finding fails to find an event argument, and the 2nd attempt fails to find probe point. E.g. $ perf probe -D "task_pid_nr tsk" Failed to find 'tsk' in this function. Failed to get entry address of warn_bad_vsyscall Segmentation fault (core dumped) Committer testing: After the patch: $ perf probe -D "task_pid_nr tsk" Failed to find 'tsk' in this function. Failed to get entry address of warn_bad_vsyscall Failed to get entry address of signal_fault Failed to get entry address of show_signal Failed to get entry address of umip_printk Failed to get entry address of __bad_area_nosemaphore Failed to get entry address of sock_set_timeout Failed to get entry address of tcp_recvmsg Probe point 'task_pid_nr' not found. Error: Failed to add events. $ Fixes: 092b1f0b5f9f ("perf probe: Clear probe_trace_event when add_probe_trace_event() fails") Signed-off-by: Masami Hiramatsu Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lore.kernel.org/lkml/156856587999.25775.5145779959474477595.stgit@devnote2 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index b8e0967c5c21..91cab5f669d2 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2331,6 +2331,7 @@ void clear_probe_trace_event(struct probe_trace_event *tev) } } zfree(&tev->args); + tev->nargs = 0; } struct kprobe_blacklist_node { From 6ef81c55a2b6584cb642917f5fdf3632ef44b670 Mon Sep 17 00:00:00 2001 From: Mamatha Inamdar Date: Thu, 22 Aug 2019 12:50:49 +0530 Subject: [PATCH 0701/2880] perf session: Return error code for perf_session__new() function on failure This patch is to return error code of perf_new_session function on failure instead of NULL. Test Results: Before Fix: $ perf c2c report -input failed to open nput: No such file or directory $ echo $? 0 $ After Fix: $ perf c2c report -input failed to open nput: No such file or directory $ echo $? 254 $ Committer notes: Fix 'perf tests topology' case, where we use that TEST_ASSERT_VAL(..., session), i.e. we need to pass zero in case of failure, which was the case before when NULL was returned by perf_session__new() for failure, but now we need to negate the result of IS_ERR(session) to respect that TEST_ASSERT_VAL) expectation of zero meaning failure. Reported-by: Nageswara R Sastry Signed-off-by: Mamatha Inamdar Tested-by: Arnaldo Carvalho de Melo Tested-by: Nageswara R Sastry Acked-by: Ravi Bangoria Reviewed-by: Jiri Olsa Reviewed-by: Mukesh Ojha Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Greg Kroah-Hartman Cc: Jeremie Galarneau Cc: Kate Stewart Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shawn Landden Cc: Song Liu Cc: Thomas Gleixner Cc: Tzvetomir Stoyanov Link: http://lore.kernel.org/lkml/20190822071223.17892.45782.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 5 +++-- tools/perf/builtin-buildid-cache.c | 5 +++-- tools/perf/builtin-buildid-list.c | 5 +++-- tools/perf/builtin-c2c.c | 6 ++++-- tools/perf/builtin-diff.c | 9 +++++---- tools/perf/builtin-evlist.c | 5 +++-- tools/perf/builtin-inject.c | 5 +++-- tools/perf/builtin-kmem.c | 5 +++-- tools/perf/builtin-kvm.c | 9 +++++---- tools/perf/builtin-lock.c | 5 +++-- tools/perf/builtin-mem.c | 5 +++-- tools/perf/builtin-record.c | 5 +++-- tools/perf/builtin-report.c | 4 ++-- tools/perf/builtin-sched.c | 11 ++++++----- tools/perf/builtin-script.c | 9 +++++---- tools/perf/builtin-stat.c | 11 ++++++----- tools/perf/builtin-timechart.c | 5 +++-- tools/perf/builtin-top.c | 5 +++-- tools/perf/builtin-trace.c | 4 ++-- tools/perf/tests/topology.c | 5 +++-- tools/perf/util/data-convert-bt.c | 5 ++++- tools/perf/util/session.c | 15 +++++++++++---- 22 files changed, 86 insertions(+), 57 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 553c651fa0a2..8db8fc9bddef 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -40,6 +40,7 @@ #include #include #include +#include struct perf_annotate { struct perf_tool tool; @@ -584,8 +585,8 @@ int cmd_annotate(int argc, const char **argv) data.path = input_name; annotate.session = perf_session__new(&data, false, &annotate.tool); - if (annotate.session == NULL) - return -1; + if (IS_ERR(annotate.session)) + return PTR_ERR(annotate.session); annotate.has_br_stack = perf_header__has_feat(&annotate.session->header, HEADER_BRANCH_STACK); diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index 1a69eb565dc0..39efa51d7fb3 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -28,6 +28,7 @@ #include "util/util.h" #include "util/probe-file.h" #include +#include static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid) { @@ -422,8 +423,8 @@ int cmd_buildid_cache(int argc, const char **argv) data.force = force; session = perf_session__new(&data, false, NULL); - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); } if (symbol__init(session ? &session->header.env : NULL) < 0) diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 5a0d8b378cb5..e3ef75583514 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -18,6 +18,7 @@ #include "util/symbol.h" #include "util/data.h" #include +#include static int sysfs__fprintf_build_id(FILE *fp) { @@ -65,8 +66,8 @@ static int perf_session__list_build_ids(bool force, bool with_hits) goto out; session = perf_session__new(&data, false, &build_id__mark_dso_hit_ops); - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); /* * We take all buildids when the file contains AUX area tracing data diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 61aaacc2aedd..3542b6ab9813 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -2781,8 +2782,9 @@ static int perf_c2c__report(int argc, const char **argv) } session = perf_session__new(&data, 0, &c2c.tool); - if (session == NULL) { - pr_debug("No memory for session\n"); + if (IS_ERR(session)) { + err = PTR_ERR(session); + pr_debug("Error creating perf session\n"); goto out; } diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 827e4800d862..c37a78677955 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -23,6 +23,7 @@ #include "util/time-utils.h" #include "util/annotate.h" #include "util/map.h" +#include #include #include #include @@ -1153,9 +1154,9 @@ static int check_file_brstack(void) data__for_each_file(i, d) { d->session = perf_session__new(&d->data, false, &pdiff.tool); - if (!d->session) { + if (IS_ERR(d->session)) { pr_err("Failed to open %s\n", d->data.path); - return -1; + return PTR_ERR(d->session); } has_br_stack = perf_header__has_feat(&d->session->header, @@ -1185,9 +1186,9 @@ static int __cmd_diff(void) data__for_each_file(i, d) { d->session = perf_session__new(&d->data, false, &pdiff.tool); - if (!d->session) { + if (IS_ERR(d->session)) { + ret = PTR_ERR(d->session); pr_err("Failed to open %s\n", d->data.path); - ret = -1; goto out_delete; } diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 294494e60e48..60509ce4dd28 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -15,6 +15,7 @@ #include "util/session.h" #include "util/data.h" #include "util/debug.h" +#include static int __cmd_evlist(const char *file_name, struct perf_attr_details *details) { @@ -28,8 +29,8 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details bool has_tracepoint = false; session = perf_session__new(&data, 0, NULL); - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); evlist__for_each_entry(session->evlist, pos) { perf_evsel__fprintf(pos, details, stdout); diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 23a76cf3846f..372ecb3e2c06 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -23,6 +23,7 @@ #include "util/symbol.h" #include "util/synthetic-events.h" #include "util/thread.h" +#include #include @@ -835,8 +836,8 @@ int cmd_inject(int argc, const char **argv) data.path = inject.input_name; inject.session = perf_session__new(&data, true, &inject.tool); - if (inject.session == NULL) - return -1; + if (IS_ERR(inject.session)) + return PTR_ERR(inject.session); if (zstd_init(&(inject.session->zstd_data), 0) < 0) pr_warning("Decompression initialization failed.\n"); diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index b5682beaad72..1e61e353f579 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -14,6 +14,7 @@ #include "util/tool.h" #include "util/callchain.h" #include "util/time-utils.h" +#include #include #include @@ -1956,8 +1957,8 @@ int cmd_kmem(int argc, const char **argv) data.path = input_name; kmem_session = session = perf_session__new(&data, false, &perf_kmem); - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); ret = -1; diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 6e3e36658900..c4b22e1b0a40 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -1091,9 +1092,9 @@ static int read_events(struct perf_kvm_stat *kvm) kvm->tool = eops; kvm->session = perf_session__new(&file, false, &kvm->tool); - if (!kvm->session) { + if (IS_ERR(kvm->session)) { pr_err("Initializing perf session failed\n"); - return -1; + return PTR_ERR(kvm->session); } symbol__init(&kvm->session->header.env); @@ -1446,8 +1447,8 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, * perf session */ kvm->session = perf_session__new(&data, false, &kvm->tool); - if (kvm->session == NULL) { - err = -1; + if (IS_ERR(kvm->session)) { + err = PTR_ERR(kvm->session); goto out; } kvm->session->evlist = kvm->evlist; diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 4c2b7f437cdf..474dfd59d7eb 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -30,6 +30,7 @@ #include #include #include +#include static struct perf_session *session; @@ -872,9 +873,9 @@ static int __cmd_report(bool display_info) }; session = perf_session__new(&data, false, &eops); - if (!session) { + if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); - return -1; + return PTR_ERR(session); } symbol__init(&session->header.env); diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 27d2bde943a8..a13f5817d6fc 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -17,6 +17,7 @@ #include "util/dso.h" #include "util/map.h" #include "util/symbol.h" +#include #define MEM_OPERATION_LOAD 0x1 #define MEM_OPERATION_STORE 0x2 @@ -249,8 +250,8 @@ static int report_raw_events(struct perf_mem *mem) struct perf_session *session = perf_session__new(&data, false, &mem->tool); - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); if (mem->cpu_list) { ret = perf_session__cpu_bitmap(session, mem->cpu_list, diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 4bd11c918e73..3f66a49a997f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -1354,9 +1355,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) } session = perf_session__new(data, false, tool); - if (session == NULL) { + if (IS_ERR(session)) { pr_err("Perf session creation failed.\n"); - return -1; + return PTR_ERR(session); } fd = perf_data__fd(data); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 3047e5169d9d..aae0e57c60fb 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1269,8 +1269,8 @@ int cmd_report(int argc, const char **argv) repeat: session = perf_session__new(&data, false, &report.tool); - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); ret = evswitch__init(&report.evswitch, session->evlist, stderr); if (ret) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index f0b828c201cc..079e67a36904 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -40,6 +40,7 @@ #include #include #include +#include #include @@ -1797,9 +1798,9 @@ static int perf_sched__read_events(struct perf_sched *sched) int rc = -1; session = perf_session__new(&data, false, &sched->tool); - if (session == NULL) { - pr_debug("No Memory for session\n"); - return -1; + if (IS_ERR(session)) { + pr_debug("Error creating perf session"); + return PTR_ERR(session); } symbol__init(&session->header.env); @@ -2989,8 +2990,8 @@ static int perf_sched__timehist(struct perf_sched *sched) symbol_conf.use_callchain = sched->show_callchain; session = perf_session__new(&data, false, &sched->tool); - if (session == NULL) - return -ENOMEM; + if (IS_ERR(session)) + return PTR_ERR(session); evlist = session->evlist; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index e079b34201f2..a17a9306bdf6 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "util/record.h" #include "util/util.h" #include "perf.h" @@ -3083,8 +3084,8 @@ int find_scripts(char **scripts_array, char **scripts_path_array, int num, int i = 0; session = perf_session__new(&data, false, NULL); - if (!session) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); snprintf(scripts_path, MAXPATHLEN, "%s/scripts", get_argv_exec_path()); @@ -3754,8 +3755,8 @@ int cmd_script(int argc, const char **argv) } session = perf_session__new(&data, false, &script.tool); - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); if (header || header_only) { script.tool.show_feat_hdr = SHOW_FEAT_HEADER; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 60cdd383af81..f7d13326b830 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include @@ -1436,9 +1437,9 @@ static int __cmd_record(int argc, const char **argv) } session = perf_session__new(data, false, NULL); - if (session == NULL) { - pr_err("Perf session creation failed.\n"); - return -1; + if (IS_ERR(session)) { + pr_err("Perf session creation failed\n"); + return PTR_ERR(session); } init_features(session); @@ -1635,8 +1636,8 @@ static int __cmd_report(int argc, const char **argv) perf_stat.data.mode = PERF_DATA_MODE_READ; session = perf_session__new(&perf_stat.data, false, &perf_stat.tool); - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); perf_stat.session = session; stat_config.output = stderr; diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index e0e822695a29..9e84fae9b096 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -35,6 +35,7 @@ #include "util/tool.h" #include "util/data.h" #include "util/debug.h" +#include #ifdef LACKS_OPEN_MEMSTREAM_PROTOTYPE FILE *open_memstream(char **ptr, size_t *sizeloc); @@ -1601,8 +1602,8 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name) &tchart->tool); int ret = -EINVAL; - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); symbol__init(&session->header.env); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index b052470f89b4..8da3c939e6b0 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -77,6 +77,7 @@ #include #include #include +#include #include @@ -1672,8 +1673,8 @@ int cmd_top(int argc, const char **argv) } top.session = perf_session__new(NULL, false, NULL); - if (top.session == NULL) { - status = -1; + if (IS_ERR(top.session)) { + status = PTR_ERR(top.session); goto out_delete_evlist; } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index f0f735093e21..a292658b4232 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3585,8 +3585,8 @@ static int trace__replay(struct trace *trace) trace->multiple_threads = true; session = perf_session__new(&data, false, &trace->tool); - if (session == NULL) - return -1; + if (IS_ERR(session)) + return PTR_ERR(session); if (trace->opts.target.pid) symbol_conf.pid_list_str = strdup(trace->opts.target.pid); diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 7d845d913d7d..4a800499d7c3 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -8,6 +8,7 @@ #include "session.h" #include "evlist.h" #include "debug.h" +#include #define TEMPL "/tmp/perf-test-XXXXXX" #define DATA_SIZE 10 @@ -39,7 +40,7 @@ static int session_write_header(char *path) }; session = perf_session__new(&data, false, NULL); - TEST_ASSERT_VAL("can't get session", session); + TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); session->evlist = perf_evlist__new_default(); TEST_ASSERT_VAL("can't get evlist", session->evlist); @@ -70,7 +71,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) int i; session = perf_session__new(&data, false, NULL); - TEST_ASSERT_VAL("can't get session", session); + TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); /* On platforms with large numbers of CPUs process_cpu_topology() * might issue an error while reading the perf.data file section diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 0c268449959c..dbc772bfb04e 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -30,6 +30,7 @@ #include "machine.h" #include "config.h" #include +#include #define pr_N(n, fmt, ...) \ eprintf(n, debug_data_convert, fmt, ##__VA_ARGS__) @@ -1619,8 +1620,10 @@ int bt_convert__perf2ctf(const char *input, const char *path, err = -1; /* perf.data session */ session = perf_session__new(&data, 0, &c.tool); - if (!session) + if (IS_ERR(session)) { + err = PTR_ERR(session); goto free_writer; + } if (c.queue_size) { ordered_events__set_alloc_size(&session->ordered_events, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 58b5bc34ba12..a621c73bad42 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -34,6 +34,7 @@ #include "../perf.h" #include "arch/common.h" #include +#include #ifdef HAVE_ZSTD_SUPPORT static int perf_session__process_compressed_event(struct perf_session *session, @@ -187,6 +188,7 @@ static int ordered_events__deliver_event(struct ordered_events *oe, struct perf_session *perf_session__new(struct perf_data *data, bool repipe, struct perf_tool *tool) { + int ret = -ENOMEM; struct perf_session *session = zalloc(sizeof(*session)); if (!session) @@ -201,13 +203,15 @@ struct perf_session *perf_session__new(struct perf_data *data, perf_env__init(&session->header.env); if (data) { - if (perf_data__open(data)) + ret = perf_data__open(data); + if (ret < 0) goto out_delete; session->data = data; if (perf_data__is_read(data)) { - if (perf_session__open(session) < 0) + ret = perf_session__open(session); + if (ret < 0) goto out_delete; /* @@ -222,8 +226,11 @@ struct perf_session *perf_session__new(struct perf_data *data, perf_evlist__init_trace_event_sample_raw(session->evlist); /* Open the directory data. */ - if (data->is_dir && perf_data__open_dir(data)) + if (data->is_dir) { + ret = perf_data__open_dir(data); + if (ret) goto out_delete; + } } } else { session->machines.host.env = &perf_env; @@ -256,7 +263,7 @@ struct perf_session *perf_session__new(struct perf_data *data, out_delete: perf_session__delete(session); out: - return NULL; + return ERR_PTR(ret); } static void perf_session__delete_threads(struct perf_session *session) From 9f014e3a66bc936412b6614304a4e6c70c70230e Mon Sep 17 00:00:00 2001 From: Roy Ben Shlomo Date: Fri, 20 Sep 2019 20:12:53 +0300 Subject: [PATCH 0702/2880] perf/core: Fix several typos in comments Fix typos in a few functions' documentation comments. Signed-off-by: Roy Ben Shlomo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: royb@sentinelone.com Link: http://lore.kernel.org/lkml/20190920171254.31373-1-royb@sentinelone.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4f08b17d6426..275eae05af20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2239,7 +2239,7 @@ static void __perf_event_disable(struct perf_event *event, * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through + * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). @@ -6054,7 +6054,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr, * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more - * precisly, but there's no way to get it safely under interrupt, + * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) @@ -6616,7 +6616,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_STACK_USER) { /* - * Either we need PERF_SAMPLE_STACK_USER bit to be allways + * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. From 5f1bc39979d868a0358c683864bec3fc8395440b Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 16 Sep 2019 07:59:37 -0400 Subject: [PATCH 0703/2880] SUNRPC: Fix buffer handling of GSS MIC without slack The GSS Message Integrity Check data for krb5i may lie partially in the XDR reply buffer's pages and tail. If so, we try to copy the entire MIC into free space in the tail. But as the estimations of the slack space required for authentication and verification have improved there may be less free space in the tail to complete this copy -- see commit 2c94b8eca1a2 ("SUNRPC: Use au_rslack when computing reply buffer size"). In fact, there may only be room in the tail for a single copy of the MIC, and not part of the MIC and then another complete copy. The real world failure reported is that `ls` of a directory on NFS may sometimes return -EIO, which can be traced back to xdr_buf_read_netobj() failing to find available free space in the tail to copy the MIC. Fix this by checking for the case of the MIC crossing the boundaries of head, pages, and tail. If so, shift the buffer until the MIC is contained completely within the pages or tail. This allows the remainder of the function to create a sub buffer that directly address the complete MIC. Signed-off-by: Benjamin Coddington Cc: stable@vger.kernel.org # v5.1 Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xdr.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 7ba0ede6b417..24ec2ab5bfa5 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1237,16 +1237,29 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) EXPORT_SYMBOL_GPL(xdr_encode_word); /* If the netobj starting offset bytes from the start of xdr_buf is contained - * entirely in the head or the tail, set object to point to it; otherwise - * try to find space for it at the end of the tail, copy it there, and - * set obj to point to it. */ + * entirely in the head, pages, or tail, set object to point to it; otherwise + * shift the buffer until it is contained entirely within the pages or tail. + */ int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset) { struct xdr_buf subbuf; + unsigned int boundary; if (xdr_decode_word(buf, offset, &obj->len)) return -EFAULT; - if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len)) + offset += 4; + + /* Is the obj partially in the head? */ + boundary = buf->head[0].iov_len; + if (offset < boundary && (offset + obj->len) > boundary) + xdr_shift_buf(buf, boundary - offset); + + /* Is the obj partially in the pages? */ + boundary += buf->page_len; + if (offset < boundary && (offset + obj->len) > boundary) + xdr_shrink_pagelen(buf, boundary - offset); + + if (xdr_buf_subsegment(buf, &subbuf, offset, obj->len)) return -EFAULT; /* Is the obj contained entirely in the head? */ @@ -1258,11 +1271,7 @@ int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned in if (subbuf.tail[0].iov_len == obj->len) return 0; - /* use end of tail as storage for obj: - * (We don't copy to the beginning because then we'd have - * to worry about doing a potentially overlapping copy. - * This assumes the object is at most half the length of the - * tail.) */ + /* Find a contiguous area in @buf to hold all of @obj */ if (obj->len > buf->buflen - buf->len) return -ENOMEM; if (buf->tail[0].iov_len != 0) From f925ab926d1a9c2112d34ecb59fbb050bb58646c Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 16 Sep 2019 07:59:38 -0400 Subject: [PATCH 0704/2880] SUNRPC: Rename xdr_buf_read_netobj to xdr_buf_read_mic Let the name reflect the single use. The function now assumes the GSS MIC is the last object in the buffer. Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/xdr.c | 52 ++++++++++++++++++++-------------- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8a87d8bcb197..f33e5013bdfb 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -186,7 +186,7 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) extern void xdr_shift_buf(struct xdr_buf *, size_t); extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); -extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, unsigned int); +extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int); extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 4ce42c62458e..d75fddca44c9 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1960,7 +1960,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len)) goto unwrap_failed; - if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) + if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 24ec2ab5bfa5..14ba9e72a204 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1236,52 +1236,60 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) } EXPORT_SYMBOL_GPL(xdr_encode_word); -/* If the netobj starting offset bytes from the start of xdr_buf is contained - * entirely in the head, pages, or tail, set object to point to it; otherwise - * shift the buffer until it is contained entirely within the pages or tail. +/** + * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf + * @buf: pointer to buffer containing a mic + * @mic: on success, returns the address of the mic + * @offset: the offset in buf where mic may be found + * + * This function may modify the xdr buf if the mic is found to be straddling + * a boundary between head, pages, and tail. On success the mic can be read + * from the address returned. There is no need to free the mic. + * + * Return: Success returns 0, otherwise an integer error. */ -int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset) +int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset) { struct xdr_buf subbuf; unsigned int boundary; - if (xdr_decode_word(buf, offset, &obj->len)) + if (xdr_decode_word(buf, offset, &mic->len)) return -EFAULT; offset += 4; - /* Is the obj partially in the head? */ + /* Is the mic partially in the head? */ boundary = buf->head[0].iov_len; - if (offset < boundary && (offset + obj->len) > boundary) + if (offset < boundary && (offset + mic->len) > boundary) xdr_shift_buf(buf, boundary - offset); - /* Is the obj partially in the pages? */ + /* Is the mic partially in the pages? */ boundary += buf->page_len; - if (offset < boundary && (offset + obj->len) > boundary) + if (offset < boundary && (offset + mic->len) > boundary) xdr_shrink_pagelen(buf, boundary - offset); - if (xdr_buf_subsegment(buf, &subbuf, offset, obj->len)) + if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len)) return -EFAULT; - /* Is the obj contained entirely in the head? */ - obj->data = subbuf.head[0].iov_base; - if (subbuf.head[0].iov_len == obj->len) + /* Is the mic contained entirely in the head? */ + mic->data = subbuf.head[0].iov_base; + if (subbuf.head[0].iov_len == mic->len) return 0; - /* ..or is the obj contained entirely in the tail? */ - obj->data = subbuf.tail[0].iov_base; - if (subbuf.tail[0].iov_len == obj->len) + /* ..or is the mic contained entirely in the tail? */ + mic->data = subbuf.tail[0].iov_base; + if (subbuf.tail[0].iov_len == mic->len) return 0; - /* Find a contiguous area in @buf to hold all of @obj */ - if (obj->len > buf->buflen - buf->len) + /* Find a contiguous area in @buf to hold all of @mic */ + if (mic->len > buf->buflen - buf->len) return -ENOMEM; if (buf->tail[0].iov_len != 0) - obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len; + mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len; else - obj->data = buf->head[0].iov_base + buf->head[0].iov_len; - __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len); + mic->data = buf->head[0].iov_base + buf->head[0].iov_len; + __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len); return 0; } -EXPORT_SYMBOL_GPL(xdr_buf_read_netobj); +EXPORT_SYMBOL_GPL(xdr_buf_read_mic); /* Returns 0 on success, or else a negative error code. */ static int From 9ba828861c56a21d211d5d10f5643774b1ea330d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 Sep 2019 09:12:19 -0400 Subject: [PATCH 0705/2880] SUNRPC: Don't try to parse incomplete RPC messages If the copy of the RPC reply into our buffers did not complete, and we could end up with a truncated message. In that case, just resend the call. Fixes: a0584ee9aed80 ("SUNRPC: Use struct xdr_stream when decoding...") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 44d5cf318813..8b622ceb1158 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2465,6 +2465,7 @@ call_decode(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_stream xdr; + int err; dprint_status(task); @@ -2487,6 +2488,15 @@ call_decode(struct rpc_task *task) * before it changed req->rq_reply_bytes_recvd. */ smp_rmb(); + + /* + * Did we ever call xprt_complete_rqst()? If not, we should assume + * the message is incomplete. + */ + err = -EAGAIN; + if (!req->rq_reply_bytes_recvd) + goto out; + req->rq_rcv_buf.len = req->rq_private_buf.len; /* Check that the softirq receive buffer is valid */ @@ -2495,7 +2505,9 @@ call_decode(struct rpc_task *task) xdr_init_decode(&xdr, &req->rq_rcv_buf, req->rq_rcv_buf.head[0].iov_base, req); - switch (rpc_decode_header(task, &xdr)) { + err = rpc_decode_header(task, &xdr); +out: + switch (err) { case 0: task->tk_action = rpc_exit_task; task->tk_status = rpcauth_unwrap_resp(task, &xdr); From 8593e010786181df887b001824ff8f3e52e2098f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Sep 2019 16:01:07 -0400 Subject: [PATCH 0706/2880] SUNRPC: Fix congestion window race with disconnect If the congestion window closes just as the transport disconnects, a reconnect is never driven because: 1. The XPRT_CONG_WAIT flag prevents tasks from taking the write lock 2. There's no wake-up of the first task on the xprt->sending queue To address this, clear the congestion wait flag as part of completing a disconnect. Fixes: 75891f502f5f ("SUNRPC: Support for congestion control ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 02d5b2125c07..83ec4edd2f91 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -456,6 +456,12 @@ void xprt_release_rqst_cong(struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); +static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) + __xprt_lock_write_next_cong(xprt); +} + /* * Clear the congestion window wait flag and wake up the next * entry on xprt->sending @@ -671,6 +677,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) spin_lock(&xprt->transport_lock); xprt_clear_connected(xprt); xprt_clear_write_space_locked(xprt); + xprt_clear_congestion_window_wait_locked(xprt); xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock(&xprt->transport_lock); } From 406cd91533dcc5e82ef2373c39e6a531d944131e Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 13 Sep 2019 08:29:02 -0400 Subject: [PATCH 0707/2880] NFS: Refactor nfs_instantiate() for dentry referencing callers Since commit b0c6108ecf64 ("nfs_instantiate(): prevent multiple aliases for directory inode"), nfs_instantiate() may succeed without actually instantiating the dentry that was passed in. That can be problematic for some callers in NFSv3, so this patch breaks things up so we can get the actual dentry obtained. Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 41 +++++++++++++++++++++++++++-------------- include/linux/nfs_fs.h | 3 +++ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8d501093660f..d1bbf2fb6ac7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1669,24 +1669,23 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) #endif /* CONFIG_NFSV4 */ -/* - * Code common to create, mkdir, and mknod. - */ -int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, +struct dentry * +nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); struct inode *inode; - struct dentry *d; - int error = -EACCES; + struct dentry *d = NULL; + int error; d_drop(dentry); /* We may have been initialized further down */ if (d_really_is_positive(dentry)) goto out; + if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) @@ -1702,18 +1701,32 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, } inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); d = d_splice_alias(inode, dentry); - if (IS_ERR(d)) { - error = PTR_ERR(d); - goto out_error; - } - dput(d); out: dput(parent); - return 0; + return d; out_error: nfs_mark_for_revalidate(dir); - dput(parent); - return error; + d = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_add_or_obtain); + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + struct dentry *d; + + d = nfs_add_or_obtain(dentry, fhandle, fattr, label); + if (IS_ERR(d)) + return PTR_ERR(d); + + /* Callers don't care */ + dput(d); + return 0; } EXPORT_SYMBOL_GPL(nfs_instantiate); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 0a11712a80e3..570a60c2f4f4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -490,6 +490,9 @@ extern const struct file_operations nfs_dir_operations; extern const struct dentry_operations nfs_dentry_operations; extern void nfs_force_lookup_revalidate(struct inode *dir); +extern struct dentry *nfs_add_or_obtain(struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr, + struct nfs4_label *label); extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); From 17fd6e457b30e2e025fce4399fe20ae69c7081dd Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 13 Sep 2019 08:29:03 -0400 Subject: [PATCH 0708/2880] NFSv3: use nfs_add_or_obtain() to create and reference inodes Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/nfs3proc.c | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index a3ad2d46fd42..9eb2f1a503ab 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -279,15 +279,17 @@ static struct nfs3_createdata *nfs3_alloc_createdata(void) return data; } -static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +static struct dentry * +nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) { int status; status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); nfs_post_op_update_inode(dir, data->res.dir_attr); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); - return status; + if (status != 0) + return ERR_PTR(status); + + return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL); } static void nfs3_free_createdata(struct nfs3_createdata *data) @@ -304,6 +306,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -330,7 +333,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; for (;;) { - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != -ENOTSUPP) break; @@ -355,6 +359,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { @@ -372,11 +379,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out_release_acls; + goto out_dput; } status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); +out_dput: + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -504,6 +513,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) @@ -522,7 +532,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, data->arg.symlink.pathlen = len; data->arg.symlink.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + + if (status == 0) + dput(d_alias); nfs3_free_createdata(data); out: @@ -535,6 +549,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -553,12 +568,18 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -660,6 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, @@ -698,12 +720,17 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; } - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); From 581057c8346b9da51f1115768fd4189ed5eab19b Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 13 Sep 2019 08:29:04 -0400 Subject: [PATCH 0709/2880] NFS: remove unused check for negative dentry This check has been hanging out since we used to have parallel paths to add dentry in nfs_create(), but that hasn't been the case for some years. Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d1bbf2fb6ac7..dd8b218785be 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1677,15 +1677,11 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); struct inode *inode; - struct dentry *d = NULL; + struct dentry *d; int error; d_drop(dentry); - /* We may have been initialized further down */ - if (d_really_is_positive(dentry)) - goto out; - if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) From 9c47b18cf722184f32148784189fca945a7d0561 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:40 -0400 Subject: [PATCH 0710/2880] pNFS: Ensure we do clear the return-on-close layout stateid on fatal errors IF the server rejected our layout return with a state error such as NFS4ERR_BAD_STATEID, or even a stale inode error, then we do want to clear out all the remaining layout segments and mark that stateid as invalid. Fixes: 1c5bd76d17cca ("pNFS: Enable layoutreturn operation for...") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4525d5acae38..0418b198edd3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1449,10 +1449,15 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; - if (ret == 0) { - arg_stateid = &args->stateid; + switch (ret) { + case -NFS4ERR_NOMATCHING_LAYOUT: + break; + case 0: if (res->lrs_present) res_stateid = &res->stateid; + /* Fallthrough */ + default: + arg_stateid = &args->stateid; } pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, res_stateid); From f4ff4faf894d36b4aa243e241d4d47b4b8ba3c84 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:50 +0530 Subject: [PATCH 0711/2880] PCI: tegra: Add support to configure sideband pins Add support to configure sideband signal pins when the information is present in the respective controller device-tree node. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi [bhelgaas: fold in YueHaibing's fix for build error without CONFIG_PINCTRL; https://lore.kernel.org/r/20190920014807.38288-1-yuehaibing@huawei.com] Signed-off-by: Bjorn Helgaas Reviewed-by: Andrew Murray Acked-by: Thierry Reding --- drivers/pci/controller/dwc/pcie-tegra194.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 6056414c07d4..0b870287f6b6 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1311,8 +1312,13 @@ static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie) if (ret < 0) { dev_err(dev, "Failed to get runtime sync for PCIe dev: %d\n", ret); - pm_runtime_disable(dev); - return ret; + goto fail_pm_get_sync; + } + + ret = pinctrl_pm_select_default_state(dev); + if (ret < 0) { + dev_err(dev, "Failed to configure sideband pins: %d\n", ret); + goto fail_pinctrl; } tegra_pcie_init_controller(pcie); @@ -1339,7 +1345,9 @@ static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie) fail_host_init: tegra_pcie_deinit_controller(pcie); +fail_pinctrl: pm_runtime_put_sync(dev); +fail_pm_get_sync: pm_runtime_disable(dev); return ret; } From 0a901f2130802fdcabcd6f1cfabf4f62fa0d3cde Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:51 +0530 Subject: [PATCH 0712/2880] PCI: tegra: Add support to enable slot regulators Add support to get regulator information of 3.3V and 12V supplies of a PCIe slot from the respective controller's device-tree node and enable those supplies. This is required in platforms like p2972-0000 where the supplies to x16 slot owned by C5 controller need to be enabled before attempting to enumerate the devices. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Acked-by: Thierry Reding --- drivers/pci/controller/dwc/pcie-tegra194.c | 83 ++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 0b870287f6b6..f89f5acee72d 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -278,6 +278,8 @@ struct tegra_pcie_dw { u32 aspm_l0s_enter_lat; struct regulator *pex_ctl_supply; + struct regulator *slot_ctl_3v3; + struct regulator *slot_ctl_12v; unsigned int phy_count; struct phy **phys; @@ -1055,6 +1057,73 @@ static void tegra_pcie_downstream_dev_to_D0(struct tegra_pcie_dw *pcie) } } +static int tegra_pcie_get_slot_regulators(struct tegra_pcie_dw *pcie) +{ + pcie->slot_ctl_3v3 = devm_regulator_get_optional(pcie->dev, "vpcie3v3"); + if (IS_ERR(pcie->slot_ctl_3v3)) { + if (PTR_ERR(pcie->slot_ctl_3v3) != -ENODEV) + return PTR_ERR(pcie->slot_ctl_3v3); + + pcie->slot_ctl_3v3 = NULL; + } + + pcie->slot_ctl_12v = devm_regulator_get_optional(pcie->dev, "vpcie12v"); + if (IS_ERR(pcie->slot_ctl_12v)) { + if (PTR_ERR(pcie->slot_ctl_12v) != -ENODEV) + return PTR_ERR(pcie->slot_ctl_12v); + + pcie->slot_ctl_12v = NULL; + } + + return 0; +} + +static int tegra_pcie_enable_slot_regulators(struct tegra_pcie_dw *pcie) +{ + int ret; + + if (pcie->slot_ctl_3v3) { + ret = regulator_enable(pcie->slot_ctl_3v3); + if (ret < 0) { + dev_err(pcie->dev, + "Failed to enable 3.3V slot supply: %d\n", ret); + return ret; + } + } + + if (pcie->slot_ctl_12v) { + ret = regulator_enable(pcie->slot_ctl_12v); + if (ret < 0) { + dev_err(pcie->dev, + "Failed to enable 12V slot supply: %d\n", ret); + goto fail_12v_enable; + } + } + + /* + * According to PCI Express Card Electromechanical Specification + * Revision 1.1, Table-2.4, T_PVPERL (Power stable to PERST# inactive) + * should be a minimum of 100ms. + */ + if (pcie->slot_ctl_3v3 || pcie->slot_ctl_12v) + msleep(100); + + return 0; + +fail_12v_enable: + if (pcie->slot_ctl_3v3) + regulator_disable(pcie->slot_ctl_3v3); + return ret; +} + +static void tegra_pcie_disable_slot_regulators(struct tegra_pcie_dw *pcie) +{ + if (pcie->slot_ctl_12v) + regulator_disable(pcie->slot_ctl_12v); + if (pcie->slot_ctl_3v3) + regulator_disable(pcie->slot_ctl_3v3); +} + static int tegra_pcie_config_controller(struct tegra_pcie_dw *pcie, bool en_hw_hot_rst) { @@ -1068,6 +1137,10 @@ static int tegra_pcie_config_controller(struct tegra_pcie_dw *pcie, return ret; } + ret = tegra_pcie_enable_slot_regulators(pcie); + if (ret < 0) + goto fail_slot_reg_en; + ret = regulator_enable(pcie->pex_ctl_supply); if (ret < 0) { dev_err(pcie->dev, "Failed to enable regulator: %d\n", ret); @@ -1150,6 +1223,8 @@ fail_core_apb_rst: fail_core_clk: regulator_disable(pcie->pex_ctl_supply); fail_reg_en: + tegra_pcie_disable_slot_regulators(pcie); +fail_slot_reg_en: tegra_pcie_bpmp_set_ctrl_state(pcie, false); return ret; @@ -1182,6 +1257,8 @@ static int __deinit_controller(struct tegra_pcie_dw *pcie) return ret; } + tegra_pcie_disable_slot_regulators(pcie); + ret = tegra_pcie_bpmp_set_ctrl_state(pcie, false); if (ret) { dev_err(pcie->dev, "Failed to disable controller %d: %d\n", @@ -1381,6 +1458,12 @@ static int tegra_pcie_dw_probe(struct platform_device *pdev) return ret; } + ret = tegra_pcie_get_slot_regulators(pcie); + if (ret < 0) { + dev_err(dev, "Failed to get slot regulators: %d\n", ret); + return ret; + } + pcie->pex_ctl_supply = devm_regulator_get(dev, "vddio-pex-ctl"); if (IS_ERR(pcie->pex_ctl_supply)) { ret = PTR_ERR(pcie->pex_ctl_supply); From dbb72e2c305b269652eb97fb3544fcb474096a2a Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:52 +0530 Subject: [PATCH 0713/2880] arm64: tegra: Add configuration for PCIe C5 sideband signals Add support to configure PCIe C5's sideband signals PERST# and CLKREQ# as output and bi-directional signals respectively which unlike other PCIe controllers sideband signals are not configured by default. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- arch/arm64/boot/dts/nvidia/tegra194.dtsi | 38 +++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/nvidia/tegra194.dtsi b/arch/arm64/boot/dts/nvidia/tegra194.dtsi index adebbbf36bd0..3c0cf54f0aab 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194.dtsi @@ -3,8 +3,9 @@ #include #include #include -#include +#include #include +#include #include / { @@ -130,6 +131,38 @@ }; }; + pinmux: pinmux@2430000 { + compatible = "nvidia,tegra194-pinmux"; + reg = <0x2430000 0x17000 + 0xc300000 0x4000>; + + status = "okay"; + + pex_rst_c5_out_state: pex_rst_c5_out { + pex_rst { + nvidia,pins = "pex_l5_rst_n_pgg1"; + nvidia,schmitt = ; + nvidia,lpdr = ; + nvidia,enable-input = ; + nvidia,io-high-voltage = ; + nvidia,tristate = ; + nvidia,pull = ; + }; + }; + + clkreq_c5_bi_dir_state: clkreq_c5_bi_dir { + clkreq { + nvidia,pins = "pex_l5_clkreq_n_pgg0"; + nvidia,schmitt = ; + nvidia,lpdr = ; + nvidia,enable-input = ; + nvidia,io-high-voltage = ; + nvidia,tristate = ; + nvidia,pull = ; + }; + }; + }; + uarta: serial@3100000 { compatible = "nvidia,tegra194-uart", "nvidia,tegra20-uart"; reg = <0x03100000 0x40>; @@ -1365,6 +1398,9 @@ num-viewport = <8>; linux,pci-domain = <5>; + pinctrl-names = "default"; + pinctrl-0 = <&pex_rst_c5_out_state>, <&clkreq_c5_bi_dir_state>; + clocks = <&bpmp TEGRA194_CLK_PEX1_CORE_5>, <&bpmp TEGRA194_CLK_PEX1_CORE_5M>; clock-names = "core", "core_m"; From 09a0774a183d4d9fdfa3d6e471fe09982cac2702 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:53 +0530 Subject: [PATCH 0714/2880] arm64: tegra: Add PCIe slot supply information in p2972-0000 platform Add 3.3V and 12V supplies regulators information of x16 PCIe slot in p2972-0000 platform which is owned by C5 controller and also enable C5 controller. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- .../arm64/boot/dts/nvidia/tegra194-p2888.dtsi | 24 +++++++++++++++++++ .../boot/dts/nvidia/tegra194-p2972-0000.dts | 4 +++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi index 62e07e1197cc..4c38426a6969 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi @@ -289,5 +289,29 @@ gpio = <&gpio TEGRA194_MAIN_GPIO(A, 3) GPIO_ACTIVE_HIGH>; enable-active-high; }; + + vdd_3v3_pcie: regulator@2 { + compatible = "regulator-fixed"; + reg = <2>; + + regulator-name = "PEX_3V3"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + gpio = <&gpio TEGRA194_MAIN_GPIO(Z, 2) GPIO_ACTIVE_HIGH>; + regulator-boot-on; + enable-active-high; + }; + + vdd_12v_pcie: regulator@3 { + compatible = "regulator-fixed"; + reg = <3>; + + regulator-name = "VDD_12V"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + gpio = <&gpio TEGRA194_MAIN_GPIO(A, 1) GPIO_ACTIVE_LOW>; + regulator-boot-on; + enable-active-low; + }; }; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts index 23597d53c9c9..d47cd8c4dd24 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts +++ b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts @@ -93,9 +93,11 @@ }; pcie@141a0000 { - status = "disabled"; + status = "okay"; vddio-pex-ctl-supply = <&vdd_1v8ao>; + vpcie3v3-supply = <&vdd_3v3_pcie>; + vpcie12v-supply = <&vdd_12v_pcie>; phys = <&p2u_nvhs_0>, <&p2u_nvhs_1>, <&p2u_nvhs_2>, <&p2u_nvhs_3>, <&p2u_nvhs_4>, <&p2u_nvhs_5>, From 287a9c558b9b825b3af36731bb09b06621f3e744 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:41 -0400 Subject: [PATCH 0715/2880] NFSv4: Clean up pNFS return-on-close error handling Both close and delegreturn have identical code to handle pNFS return-on-close. This patch refactors that code and places it in pnfs.c Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 66 +++++++---------------------------------------- fs/nfs/pnfs.c | 27 +++++++++++++++++++ fs/nfs/pnfs.h | 13 ++++++++++ 3 files changed, 50 insertions(+), 56 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 00c7a92e3d6b..a7cecac5a4ec 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3363,32 +3363,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); /* Handle Layoutreturn errors */ - if (calldata->arg.lr_args && task->tk_status != 0) { - switch (calldata->res.lr_ret) { - default: - calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&calldata->arg.lr_args->stateid, - &calldata->arg.lr_args->range, - calldata->inode)) - goto lr_restart; - /* Fallthrough */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, calldata->inode, + &calldata->arg.lr_args, + &calldata->res.lr_res, + &calldata->res.lr_ret) == -EAGAIN) + goto out_restart; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors @@ -3435,8 +3414,6 @@ out_release: nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); return; -lr_restart: - calldata->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); @@ -6128,32 +6105,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); /* Handle Layoutreturn errors */ - if (data->args.lr_args && task->tk_status != 0) { - switch(data->res.lr_ret) { - default: - data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&data->args.lr_args->stateid, - &data->args.lr_args->range, - data->inode)) - goto lr_restart; - /* Fallthrough */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, data->inode, + &data->args.lr_args, + &data->res.lr_res, + &data->res.lr_ret) == -EAGAIN) + goto out_restart; switch (task->tk_status) { case 0: @@ -6191,8 +6147,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } data->rpc_status = task->tk_status; return; -lr_restart: - data->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0418b198edd3..8769422a12f5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1440,6 +1440,33 @@ out_noroc: return false; } +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + struct nfs4_layoutreturn_args *arg = *argpp; + int retval = -EAGAIN; + + if (!arg) + return 0; + /* Handle Layoutreturn errors */ + switch (*ret) { + case 0: + retval = 0; + break; + case -NFS4ERR_OLD_STATEID: + if (!nfs4_layoutreturn_refresh_stateid(&arg->stateid, + &arg->range, inode)) + break; + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return -EAGAIN; + } + *argpp = NULL; + *respp = NULL; + return retval; +} + void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f15609c003d8..3ef3756d437c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -282,6 +282,10 @@ bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, const struct cred *cred); +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret); void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret); @@ -701,6 +705,15 @@ pnfs_roc(struct inode *ino, return false; } +static inline int +pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + return 0; +} + static inline void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, From 078a432d1c6afc2e70c6c5a0ce8ff29ce2a2e34a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:42 -0400 Subject: [PATCH 0716/2880] NFSv4: Handle NFS4ERR_DELAY correctly in return-on-close If the server sends a NFS4ERR_DELAY, then allow the caller to retry. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8769422a12f5..6436047dc999 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1455,6 +1455,10 @@ int pnfs_roc_done(struct rpc_task *task, struct inode *inode, case 0: retval = 0; break; + case -NFS4ERR_DELAY: + /* Let the caller handle the retry */ + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return 0; case -NFS4ERR_OLD_STATEID: if (!nfs4_layoutreturn_refresh_stateid(&arg->stateid, &arg->range, inode)) From 6109bcf7130126d24a9ec9d881c665d5d9ab0eb4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:43 -0400 Subject: [PATCH 0717/2880] NFSv4: Handle RPC level errors in LAYOUTRETURN Handle RPC level errors by assuming that the RPC call was successful. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 +++++++++ fs/nfs/pnfs.c | 15 +++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a7cecac5a4ec..c0bff7ad0c9f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9051,6 +9051,15 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; + /* + * Was there an RPC level error? Assume the call succeeded, + * and that we need to release the layout + */ + if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) { + lrp->res.lrs_present = 0; + return; + } + server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { case -NFS4ERR_OLD_STATEID: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6436047dc999..abc7188f1853 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1455,6 +1455,21 @@ int pnfs_roc_done(struct rpc_task *task, struct inode *inode, case 0: retval = 0; break; + case -NFS4ERR_NOMATCHING_LAYOUT: + /* Was there an RPC level error? If not, retry */ + if (task->tk_rpc_status == 0) + break; + /* If the call was not sent, let caller handle it */ + if (!RPC_WAS_SENT(task)) + return 0; + /* + * Otherwise, assume the call succeeded and + * that we need to release the layout + */ + *ret = 0; + (*respp)->lrs_present = 0; + retval = 0; + break; case -NFS4ERR_DELAY: /* Let the caller handle the retry */ *ret = -NFS4ERR_NOMATCHING_LAYOUT; From 922839570920c024baf740ef45dfa98009f0192e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:44 -0400 Subject: [PATCH 0718/2880] NFSv4: Add a helper to increment stateid seqids Add a helper function to increment stateid seqids according to the rules specified in RFC5661 Section 8.2.2. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3564da1ba8a1..e8f74ed98e42 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -574,6 +574,15 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; } +static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1) +{ + u32 seqid = be32_to_cpu(s1->seqid); + + if (++seqid == 0) + ++seqid; + s1->seqid = cpu_to_be32(seqid); +} + static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) { return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; From 30cb3ee299cbf343fe07419e9b0f8dc305979eb6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:45 -0400 Subject: [PATCH 0719/2880] pNFS: Handle NFS4ERR_OLD_STATEID on layoutreturn by bumping the state seqid If a LAYOUTRETURN receives a reply of NFS4ERR_OLD_STATEID then assume we've missed an update, and just bump the stateid. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pnfs.c | 18 ++++++++++++++---- fs/nfs/pnfs.h | 4 ++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c0bff7ad0c9f..27964ea0b3b7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9063,7 +9063,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&lrp->args.stateid, + if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid, &lrp->args.range, lrp->args.inode)) goto out_restart; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index abc7188f1853..bb80034a7661 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -359,9 +359,10 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* - * Update the seqid of a layout stateid + * Update the seqid of a layout stateid after receiving + * NFS4ERR_OLD_STATEID */ -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { @@ -377,7 +378,15 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + if (lo && pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + /* Is our call using the most recent seqid? If so, bump it */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) { + nfs4_stateid_seqid_inc(dst); + ret = true; + goto out; + } + /* Try to update the seqid to the most recent */ err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); if (err != -EBUSY) { dst->seqid = lo->plh_stateid.seqid; @@ -385,6 +394,7 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, ret = true; } } +out: spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); return ret; @@ -1475,7 +1485,7 @@ int pnfs_roc_done(struct rpc_task *task, struct inode *inode, *ret = -NFS4ERR_NOMATCHING_LAYOUT; return 0; case -NFS4ERR_OLD_STATEID: - if (!nfs4_layoutreturn_refresh_stateid(&arg->stateid, + if (!nfs4_layout_refresh_old_stateid(&arg->stateid, &arg->range, inode)) break; *ret = -NFS4ERR_NOMATCHING_LAYOUT; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 3ef3756d437c..f8a38065c7e4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -261,7 +261,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, bool is_recall); int pnfs_destroy_layouts_byclid(struct nfs_client *clp, bool is_recall); -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); @@ -798,7 +798,7 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) { } -static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { From e217e825dca8d2d80c371b43da7257adc499404a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:46 -0400 Subject: [PATCH 0720/2880] NFSv4: Fix OPEN_DOWNGRADE error handling If OPEN_DOWNGRADE returns a state error, then we want to initiate state recovery in addition to marking the stateid as closed. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 27964ea0b3b7..9e283a8e8e93 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3399,7 +3399,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - break; + if (calldata->arg.fmode == 0) + break; + /* Fallthrough */ default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); From 0e0cb35b417f505447694463694aff75fca32889 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:47 -0400 Subject: [PATCH 0721/2880] NFSv4: Handle NFS4ERR_OLD_STATEID in CLOSE/OPEN_DOWNGRADE If a CLOSE or OPEN_DOWNGRADE operation receives a NFS4ERR_OLD_STATEID then bump the seqid before resending. Ensure we only bump the seqid by 1. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 2 -- fs/nfs/nfs4proc.c | 75 ++++++++++++++++++++++++++++++++++++++++++++-- fs/nfs/nfs4state.c | 16 ---------- 3 files changed, 72 insertions(+), 21 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index e8f74ed98e42..16b2e5cc3e94 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -491,8 +491,6 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, const struct nfs_lock_context *, nfs4_stateid *, const struct cred **); -extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst, - struct nfs4_state *state); extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9e283a8e8e93..0949bb535d7c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3313,6 +3313,75 @@ nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task) return pnfs_wait_on_layoutreturn(inode, task); } +/* + * Update the seqid of an open stateid + */ +static void nfs4_sync_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + int seq; + + for (;;) { + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + nfs4_stateid_copy(dst, &state->open_stateid); + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0) + dst->seqid = seqid_open; + break; + } +} + +/* + * Update the seqid of an open stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + bool ret; + int seq; + + for (;;) { + ret = false; + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0) + dst->seqid = cpu_to_be32(dst_seqid + 1); + else + dst->seqid = seqid_open; + ret = true; + break; + } + + return ret; +} + struct nfs4_closedata { struct inode *inode; struct nfs4_state *state; @@ -3387,7 +3456,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_OLD_STATEID: /* Did we race with OPEN? */ - if (nfs4_refresh_open_stateid(&calldata->arg.stateid, + if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid, state)) goto out_restart; goto out_release; @@ -3456,8 +3525,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (!nfs4_valid_open_stateid(state) || - !nfs4_refresh_open_stateid(&calldata->arg.stateid, state)) + nfs4_sync_open_stateid(&calldata->arg.stateid, state); + if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e916aba7a799..0c6d53dc3672 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1015,22 +1015,6 @@ out: return ret; } -bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) -{ - bool ret; - int seq; - - do { - ret = false; - seq = read_seqbegin(&state->seqlock); - if (nfs4_state_match_open_stateid_other(state, dst)) { - dst->seqid = state->open_stateid.seqid; - ret = true; - } - } while (read_seqretry(&state->seqlock, seq)); - return ret; -} - bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { bool ret; From 32c6e7eee39999084448b15e96a0e3c7d2f9021e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:48 -0400 Subject: [PATCH 0722/2880] NFSv4: Handle NFS4ERR_OLD_STATEID in LOCKU If a LOCKU request receives a NFS4ERR_OLD_STATEID, then bump the seqid before resending. Ensure we only bump the seqid by 1. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 53 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0949bb535d7c..11eafcfc490b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6410,6 +6410,42 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +/* + * Update the seqid of a lock stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret = false; + + spin_lock(&state->state_lock); + if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid)) + goto out; + if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst)) + nfs4_stateid_seqid_inc(dst); + else + dst->seqid = lsp->ls_stateid.seqid; + ret = true; +out: + spin_unlock(&state->state_lock); + return ret; +} + +static bool nfs4_sync_lock_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret; + + spin_lock(&state->state_lock); + ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid); + nfs4_stateid_copy(dst, &lsp->ls_stateid); + spin_unlock(&state->state_lock); + return ret; +} + struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -6427,7 +6463,8 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs_seqid *seqid) { struct nfs4_unlockdata *p; - struct inode *inode = lsp->ls_state->inode; + struct nfs4_state *state = lsp->ls_state; + struct inode *inode = state->inode; p = kzalloc(sizeof(*p), GFP_NOFS); if (p == NULL) @@ -6443,6 +6480,9 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); + spin_lock(&state->state_lock); + nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid); + spin_unlock(&state->state_lock); return p; } @@ -6481,10 +6521,14 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fall through */ case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &calldata->lsp->ls_stateid)) + if (nfs4_sync_lock_stateid(&calldata->arg.stateid, + calldata->lsp)) + rpc_restart_call_prepare(task); + break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid, + calldata->lsp)) rpc_restart_call_prepare(task); break; default: @@ -6507,7 +6551,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; - nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ goto out_no_action; From a315614b68993318f6913cc28b9b4e472ebbdf26 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Thu, 18 Jul 2019 09:32:05 +0800 Subject: [PATCH 0723/2880] pwm: mxs: Use devm_platform_ioremap_resource() to simplify code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the new helper devm_platform_ioremap_resource() which wraps the platform_get_resource() and devm_ioremap_resource() together, to simplify the code. Signed-off-by: Anson Huang Reviewed-by: Dong Aisheng Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mxs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-mxs.c b/drivers/pwm/pwm-mxs.c index 04c0f6b95c1a..b14376b47ac8 100644 --- a/drivers/pwm/pwm-mxs.c +++ b/drivers/pwm/pwm-mxs.c @@ -126,15 +126,13 @@ static int mxs_pwm_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct mxs_pwm_chip *mxs; - struct resource *res; int ret; mxs = devm_kzalloc(&pdev->dev, sizeof(*mxs), GFP_KERNEL); if (!mxs) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - mxs->base = devm_ioremap_resource(&pdev->dev, res); + mxs->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(mxs->base)) return PTR_ERR(mxs->base); From f6abac0379b8368519f28016c8c8821b8bd17f5e Mon Sep 17 00:00:00 2001 From: Ding Xiang Date: Thu, 18 Jul 2019 15:51:11 +0800 Subject: [PATCH 0724/2880] pwm: sifive: Remove redundant error message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit devm_ioremap_resource() already outputs an error message, so remove the extra error message on failure. Signed-off-by: Ding Xiang Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sifive.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-sifive.c b/drivers/pwm/pwm-sifive.c index a7c107f19e66..bb4f02ce4f94 100644 --- a/drivers/pwm/pwm-sifive.c +++ b/drivers/pwm/pwm-sifive.c @@ -250,10 +250,8 @@ static int pwm_sifive_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); ddata->regs = devm_ioremap_resource(dev, res); - if (IS_ERR(ddata->regs)) { - dev_err(dev, "Unable to map IO resources\n"); + if (IS_ERR(ddata->regs)) return PTR_ERR(ddata->regs); - } ddata->clk = devm_clk_get(dev, NULL); if (IS_ERR(ddata->clk)) { From a003365cab64b0f7988ac3ccb1da895ce0bece5e Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 4 Sep 2019 00:55:29 -0400 Subject: [PATCH 0725/2880] powerpc/tm: Add tm-poison test Add TM selftest to check if FP or VEC register values from one process can leak into another process when both run on the same CPU. Signed-off-by: Gustavo Romero Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190904045529.23002-3-gromero@linux.vnet.ibm.com --- tools/testing/selftests/powerpc/tm/.gitignore | 1 + tools/testing/selftests/powerpc/tm/Makefile | 2 +- .../testing/selftests/powerpc/tm/tm-poison.c | 179 ++++++++++++++++++ 3 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/tm/tm-poison.c diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index 951fe855f7cd..98f2708d86cc 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -17,3 +17,4 @@ tm-vmx-unavail tm-unavailable tm-trap tm-sigreturn +tm-poison diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index c0734ed0ef56..b15a1a325bd0 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -5,7 +5,7 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \ tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \ $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \ - tm-signal-context-force-tm + tm-signal-context-force-tm tm-poison top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c b/tools/testing/selftests/powerpc/tm/tm-poison.c new file mode 100644 index 000000000000..977558497c16 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-poison.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2019, Gustavo Romero, Michael Neuling, IBM Corp. + * + * This test will spawn two processes. Both will be attached to the same + * CPU (CPU 0). The child will be in a loop writing to FP register f31 and + * VMX/VEC/Altivec register vr31 a known value, called poison, calling + * sched_yield syscall after to allow the parent to switch on the CPU. + * Parent will set f31 and vr31 to 1 and in a loop will check if f31 and + * vr31 remain 1 as expected until a given timeout (2m). If the issue is + * present child's poison will leak into parent's f31 or vr31 registers, + * otherwise, poison will never leak into parent's f31 and vr31 registers. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tm.h" + +int tm_poison_test(void) +{ + int pid; + cpu_set_t cpuset; + uint64_t poison = 0xdeadbeefc0dec0fe; + uint64_t unknown = 0; + bool fail_fp = false; + bool fail_vr = false; + + SKIP_IF(!have_htm()); + + /* Attach both Child and Parent to CPU 0 */ + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); + + pid = fork(); + if (!pid) { + /** + * child + */ + while (1) { + sched_yield(); + asm ( + "mtvsrd 31, %[poison];" // f31 = poison + "mtvsrd 63, %[poison];" // vr31 = poison + + : : [poison] "r" (poison) : ); + } + } + + /** + * parent + */ + asm ( + /* + * Set r3, r4, and f31 to known value 1 before entering + * in transaction. They won't be written after that. + */ + " li 3, 0x1 ;" + " li 4, 0x1 ;" + " mtvsrd 31, 4 ;" + + /* + * The Time Base (TB) is a 64-bit counter register that is + * independent of the CPU clock and which is incremented + * at a frequency of 512000000 Hz, so every 1.953125ns. + * So it's necessary 120s/0.000000001953125s = 61440000000 + * increments to get a 2 minutes timeout. Below we set that + * value in r5 and then use r6 to track initial TB value, + * updating TB values in r7 at every iteration and comparing it + * to r6. When r7 (current) - r6 (initial) > 61440000000 we bail + * out since for sure we spent already 2 minutes in the loop. + * SPR 268 is the TB register. + */ + " lis 5, 14 ;" + " ori 5, 5, 19996 ;" + " sldi 5, 5, 16 ;" // r5 = 61440000000 + + " mfspr 6, 268 ;" // r6 (TB initial) + "1: mfspr 7, 268 ;" // r7 (TB current) + " subf 7, 6, 7 ;" // r7 - r6 > 61440000000 ? + " cmpd 7, 5 ;" + " bgt 3f ;" // yes, exit + + /* + * Main loop to check f31 + */ + " tbegin. ;" // no, try again + " beq 1b ;" // restart if no timeout + " mfvsrd 3, 31 ;" // read f31 + " cmpd 3, 4 ;" // f31 == 1 ? + " bne 2f ;" // broken :-( + " tabort. 3 ;" // try another transaction + "2: tend. ;" // commit transaction + "3: mr %[unknown], 3 ;" // record r3 + + : [unknown] "=r" (unknown) + : + : "cr0", "r3", "r4", "r5", "r6", "r7", "vs31" + + ); + + /* + * On leak 'unknown' will contain 'poison' value from child, + * otherwise (no leak) 'unknown' will contain the same value + * as r3 before entering in transactional mode, i.e. 0x1. + */ + fail_fp = unknown != 0x1; + if (fail_fp) + printf("Unknown value %#"PRIx64" leaked into f31!\n", unknown); + else + printf("Good, no poison or leaked value into FP registers\n"); + + asm ( + /* + * Set r3, r4, and vr31 to known value 1 before entering + * in transaction. They won't be written after that. + */ + " li 3, 0x1 ;" + " li 4, 0x1 ;" + " mtvsrd 63, 4 ;" + + " lis 5, 14 ;" + " ori 5, 5, 19996 ;" + " sldi 5, 5, 16 ;" // r5 = 61440000000 + + " mfspr 6, 268 ;" // r6 (TB initial) + "1: mfspr 7, 268 ;" // r7 (TB current) + " subf 7, 6, 7 ;" // r7 - r6 > 61440000000 ? + " cmpd 7, 5 ;" + " bgt 3f ;" // yes, exit + + /* + * Main loop to check vr31 + */ + " tbegin. ;" // no, try again + " beq 1b ;" // restart if no timeout + " mfvsrd 3, 63 ;" // read vr31 + " cmpd 3, 4 ;" // vr31 == 1 ? + " bne 2f ;" // broken :-( + " tabort. 3 ;" // try another transaction + "2: tend. ;" // commit transaction + "3: mr %[unknown], 3 ;" // record r3 + + : [unknown] "=r" (unknown) + : + : "cr0", "r3", "r4", "r5", "r6", "r7", "vs63" + + ); + + /* + * On leak 'unknown' will contain 'poison' value from child, + * otherwise (no leak) 'unknown' will contain the same value + * as r3 before entering in transactional mode, i.e. 0x1. + */ + fail_vr = unknown != 0x1; + if (fail_vr) + printf("Unknown value %#"PRIx64" leaked into vr31!\n", unknown); + else + printf("Good, no poison or leaked value into VEC registers\n"); + + kill(pid, SIGKILL); + + return (fail_fp | fail_vr); +} + +int main(int argc, char *argv[]) +{ + /* Test completes in about 4m */ + test_harness_set_timeout(250); + return test_harness(tm_poison_test, "tm_poison_test"); +} From 7aec584eaf1cc1a527dcbe7d80f2e44e3bfcfe1d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 18 Sep 2019 19:31:03 +0530 Subject: [PATCH 0726/2880] powerpc/book3s64/radix: Remove WARN_ON in destroy_context() On failed task initialization due to memory allocation failures, we can call into destroy_context() with process_tb entry already populated. This patch forces the process_tb entry to zero in destroy_context(). With this patch, we lose the ability to track if we are destroying a context without flushing the process table entry. WARNING: CPU: 4 PID: 6368 at arch/powerpc/mm/mmu_context_book3s64.c:246 destroy_context+0x58/0x340 NIP [c0000000000875f8] destroy_context+0x58/0x340 LR [c00000000013da18] __mmdrop+0x78/0x270 Call Trace: [c000000f7db77c80] [c00000000013da18] __mmdrop+0x78/0x270 [c000000f7db77cf0] [c0000000004d6a34] __do_execve_file.isra.13+0xbd4/0x1000 [c000000f7db77e00] [c0000000004d7428] sys_execve+0x58/0x70 [c000000f7db77e30] [c00000000000b388] system_call+0x5c/0x70 Reported-by: Priya M.A Signed-off-by: Aneesh Kumar K.V [mpe: Reformat/tweak comment wording] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190918140103.24395-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/mmu_context.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 2d0cb5ba9a47..0ba30b8b935b 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -256,8 +256,21 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif + /* + * For tasks which were successfully initialized we end up calling + * arch_exit_mmap() which clears the process table entry. And + * arch_exit_mmap() is called before the required fullmm TLB flush + * which does a RIC=2 flush. Hence for an initialized task, we do clear + * any cached process table entries. + * + * The condition below handles the error case during task init. We have + * set the process table entry early and if we fail a task + * initialization, we need to ensure the process table entry is zeroed. + * We need not worry about process table entry caches because the task + * never ran with the PID value. + */ if (radix_enabled()) - WARN_ON(process_tb[mm->context.id].prtb0 != 0); + process_tb[mm->context.id].prtb0 = 0; else subpage_prot_free(mm); destroy_contexts(&mm->context); From c6fadabb2868f817299ddb338ac15885e25d12d2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 17 Sep 2019 10:46:04 +1000 Subject: [PATCH 0727/2880] powerpc: Fix definition of PCR bits to work with old binutils Commit 388cc6e133132 ("KVM: PPC: Book3S HV: Support POWER6 compatibility mode on POWER7") introduced new macros defining the PCR bits. When used from assembly files these definitions lead to build errors using older versions of binutils that don't support the 'ul' suffix. This fixes the build errors by updating the definitions to use the __MASK() macro which selects the appropriate suffix. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190917004605.22471-1-alistair@popple.id.au --- arch/powerpc/include/asm/reg.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index ec3714cf0989..931939af95d1 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -475,9 +475,9 @@ #define HMER_DEBUG_TRIG (1ul << (63 - 17)) /* Debug trigger */ #define SPRN_HMEER 0x151 /* Hyp maintenance exception enable reg */ #define SPRN_PCR 0x152 /* Processor compatibility register */ -#define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */ -#define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */ -#define PCR_TM_DIS (1ul << (63-2)) /* Trans. memory disable (POWER8) */ +#define PCR_VEC_DIS (__MASK(63-0)) /* Vec. disable (bit NA since POWER8) */ +#define PCR_VSX_DIS (__MASK(63-1)) /* VSX disable (bit NA since POWER8) */ +#define PCR_TM_DIS (__MASK(63-2)) /* Trans. memory disable (POWER8) */ /* * These bits are used in the function kvmppc_set_arch_compat() to specify and * determine both the compatibility level which we want to emulate and the From 13c7bb3c57dcfe779ea5b4b083f6c47753cc5327 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Tue, 17 Sep 2019 10:46:05 +1000 Subject: [PATCH 0728/2880] powerpc/64s: Set reserved PCR bits Currently the reserved bits of the Processor Compatibility Register (PCR) are cleared as per the Programming Note in Section 1.3.3 of version 3.0B of the Power ISA. This causes all new architecture features to be made available when running on newer processors with new architecture features added to the PCR as bits must be set to disable a given feature. For example to disable new features added as part of Version 2.07 of the ISA the corresponding bit in the PCR needs to be set. As new processor features generally require explicit kernel support they should be disabled until such support is implemented. Therefore kernels should set all unknown/reserved bits in the PCR such that any new architecture features which the kernel does not currently know about get disabled. An update is planned to the ISA to clarify that the PCR is an exception to the Programming Note on reserved bits in Section 1.3.3. Signed-off-by: Alistair Popple Signed-off-by: Jordan Niethe Tested-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190917004605.22471-2-alistair@popple.id.au --- arch/powerpc/include/asm/reg.h | 3 +++ arch/powerpc/kernel/cpu_setup_power.S | 6 ++++++ arch/powerpc/kernel/dt_cpu_ftrs.c | 3 ++- arch/powerpc/kvm/book3s_hv.c | 11 +++++++---- arch/powerpc/kvm/book3s_hv_nested.c | 6 +++--- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 ++++++---- 6 files changed, 27 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 931939af95d1..b3cbb1136bce 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -478,6 +478,7 @@ #define PCR_VEC_DIS (__MASK(63-0)) /* Vec. disable (bit NA since POWER8) */ #define PCR_VSX_DIS (__MASK(63-1)) /* VSX disable (bit NA since POWER8) */ #define PCR_TM_DIS (__MASK(63-2)) /* Trans. memory disable (POWER8) */ +#define PCR_HIGH_BITS (PCR_VEC_DIS | PCR_VSX_DIS | PCR_TM_DIS) /* * These bits are used in the function kvmppc_set_arch_compat() to specify and * determine both the compatibility level which we want to emulate and the @@ -486,6 +487,8 @@ #define PCR_ARCH_207 0x8 /* Architecture 2.07 */ #define PCR_ARCH_206 0x4 /* Architecture 2.06 */ #define PCR_ARCH_205 0x2 /* Architecture 2.05 */ +#define PCR_LOW_BITS (PCR_ARCH_207 | PCR_ARCH_206 | PCR_ARCH_205) +#define PCR_MASK ~(PCR_HIGH_BITS | PCR_LOW_BITS) /* PCR Reserved Bits */ #define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */ #define SPRN_TLBINDEXR 0x154 /* P7 TLB control register */ #define SPRN_TLBVPNR 0x155 /* P7 TLB control register */ diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 3239a9fe6c1c..a460298c7ddb 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -23,6 +23,7 @@ _GLOBAL(__setup_cpu_power7) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR li r4,(LPCR_LPES1 >> LPCR_LPES_SH) @@ -37,6 +38,7 @@ _GLOBAL(__restore_cpu_power7) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR li r4,(LPCR_LPES1 >> LPCR_LPES_SH) @@ -54,6 +56,7 @@ _GLOBAL(__setup_cpu_power8) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR ori r3, r3, LPCR_PECEDH @@ -76,6 +79,7 @@ _GLOBAL(__restore_cpu_power8) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR ori r3, r3, LPCR_PECEDH @@ -98,6 +102,7 @@ _GLOBAL(__setup_cpu_power9) mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 mtspr SPRN_PID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) @@ -123,6 +128,7 @@ _GLOBAL(__restore_cpu_power9) mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 mtspr SPRN_PID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index bd95318d2202..bceee2fde885 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -101,7 +101,7 @@ static void __restore_cpu_cpufeatures(void) if (hv_mode) { mtspr(SPRN_LPID, 0); mtspr(SPRN_HFSCR, system_registers.hfscr); - mtspr(SPRN_PCR, 0); + mtspr(SPRN_PCR, PCR_MASK); } mtspr(SPRN_FSCR, system_registers.fscr); @@ -144,6 +144,7 @@ static void __init cpufeatures_setup_cpu(void) mtspr(SPRN_HFSCR, 0); } mtspr(SPRN_FSCR, 0); + mtspr(SPRN_PCR, PCR_MASK); /* * LPCR does not get cleared, to match behaviour with secondaries diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index efd8f93bc9dc..709cf1fd4cf4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -401,8 +401,11 @@ static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) spin_lock(&vc->lock); vc->arch_compat = arch_compat; - /* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */ - vc->pcr = host_pcr_bit - guest_pcr_bit; + /* + * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit + * Also set all reserved PCR bits + */ + vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK; spin_unlock(&vc->lock); return 0; @@ -3410,7 +3413,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, } if (vc->pcr) - mtspr(SPRN_PCR, vc->pcr); + mtspr(SPRN_PCR, vc->pcr | PCR_MASK); mtspr(SPRN_DPDES, vc->dpdes); mtspr(SPRN_VTB, vc->vtb); @@ -3490,7 +3493,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, vc->vtb = mfspr(SPRN_VTB); mtspr(SPRN_DPDES, 0); if (vc->pcr) - mtspr(SPRN_PCR, 0); + mtspr(SPRN_PCR, PCR_MASK); if (vc->tb_offset_applied) { u64 new_tb = mftb() - vc->tb_offset_applied; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index fff90f2c3de2..cdf30c6eaf54 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -29,7 +29,7 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - hr->pcr = vc->pcr; + hr->pcr = vc->pcr | PCR_MASK; hr->dpdes = vc->dpdes; hr->hfscr = vcpu->arch.hfscr; hr->tb_offset = vc->tb_offset; @@ -65,7 +65,7 @@ static void byteswap_hv_regs(struct hv_guest_state *hr) hr->lpid = swab32(hr->lpid); hr->vcpu_token = swab32(hr->vcpu_token); hr->lpcr = swab64(hr->lpcr); - hr->pcr = swab64(hr->pcr); + hr->pcr = swab64(hr->pcr) | PCR_MASK; hr->amor = swab64(hr->amor); hr->dpdes = swab64(hr->dpdes); hr->hfscr = swab64(hr->hfscr); @@ -148,7 +148,7 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - vc->pcr = hr->pcr; + vc->pcr = hr->pcr | PCR_MASK; vc->dpdes = hr->dpdes; vcpu->arch.hfscr = hr->hfscr; vcpu->arch.dawr = hr->dawr0; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9a05b0d932ef..74a9cfe84aee 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -644,8 +644,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* Load guest PCR value to select appropriate compat mode */ 37: ld r7, VCORE_PCR(r5) - cmpdi r7, 0 + LOAD_REG_IMMEDIATE(r6, PCR_MASK) + cmpld r7, r6 beq 38f + or r7, r7, r6 mtspr SPRN_PCR, r7 38: @@ -1913,10 +1915,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Reset PCR */ ld r0, VCORE_PCR(r5) - cmpdi r0, 0 + LOAD_REG_IMMEDIATE(r6, PCR_MASK) + cmpld r0, r6 beq 18f - li r0, 0 - mtspr SPRN_PCR, r0 + mtspr SPRN_PCR, r6 18: /* Signal secondary CPUs to continue */ stb r0,VCORE_IN_GUEST(r5) From 4c0f5d1eb4072871c34530358df45f05ab80edd6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Aug 2019 10:20:00 +0000 Subject: [PATCH 0729/2880] powerpc/mm: Add a helper to select PAGE_KERNEL_RO or PAGE_READONLY In a couple of places there is a need to select whether read-only protection of shadow pages is performed with PAGE_KERNEL_RO or with PAGE_READONLY. Add a helper to avoid duplicating the choice. Signed-off-by: Christophe Leroy Cc: stable@vger.kernel.org Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/9f33f44b9cd741c4a02b3dce7b8ef9438fe2cd2a.1566382750.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 802387b231ad..e8ab3cc5f6e4 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -12,6 +12,14 @@ #include #include +static pgprot_t kasan_prot_ro(void) +{ + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return PAGE_READONLY; + + return PAGE_KERNEL_RO; +} + static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) { unsigned long va = (unsigned long)kasan_early_shadow_page; @@ -26,6 +34,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l { pmd_t *pmd; unsigned long k_cur, k_next; + pgprot_t prot = kasan_prot_ro(); pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); @@ -43,10 +52,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l if (!new) return -ENOMEM; - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - kasan_populate_pte(new, PAGE_READONLY); - else - kasan_populate_pte(new, PAGE_KERNEL_RO); + kasan_populate_pte(new, prot); smp_wmb(); /* See comment in __pte_alloc */ @@ -103,10 +109,9 @@ static int __ref kasan_init_region(void *start, size_t size) static void __init kasan_remap_early_shadow_ro(void) { - if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - kasan_populate_pte(kasan_early_shadow_pte, PAGE_READONLY); - else - kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO); + pgprot_t prot = kasan_prot_ro(); + + kasan_populate_pte(kasan_early_shadow_pte, prot); flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); } From cbd18991e24fea2c31da3bb117c83e4a3538cd11 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Aug 2019 10:20:11 +0000 Subject: [PATCH 0730/2880] powerpc/mm: Fix an Oops in kasan_mmu_init() Uncompressing Kernel Image ... OK Loading Device Tree to 01ff7000, end 01fff74f ... OK [ 0.000000] printk: bootconsole [udbg0] enabled [ 0.000000] BUG: Unable to handle kernel data access at 0xf818c000 [ 0.000000] Faulting instruction address: 0xc0013c7c [ 0.000000] Thread overran stack, or stack corrupted [ 0.000000] Oops: Kernel access of bad area, sig: 11 [#1] [ 0.000000] BE PAGE_SIZE=16K PREEMPT [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.3.0-rc4-s3k-dev-00743-g5abe4a3e8fd3-dirty #2080 [ 0.000000] NIP: c0013c7c LR: c0013310 CTR: 00000000 [ 0.000000] REGS: c0c5ff38 TRAP: 0300 Not tainted (5.3.0-rc4-s3k-dev-00743-g5abe4a3e8fd3-dirty) [ 0.000000] MSR: 00001032 CR: 99033955 XER: 80002100 [ 0.000000] DAR: f818c000 DSISR: 82000000 [ 0.000000] GPR00: c0013310 c0c5fff0 c0ad6ac0 c0c600c0 f818c031 82000000 00000000 ffffffff [ 0.000000] GPR08: 00000000 f1f1f1f1 c0013c2c c0013304 99033955 00400008 00000000 07ff9598 [ 0.000000] GPR16: 00000000 07ffb94c 00000000 00000000 00000000 00000000 00000000 f818cfb2 [ 0.000000] GPR24: 00000000 00000000 00001000 ffffffff 00000000 c07dbf80 00000000 f818c000 [ 0.000000] NIP [c0013c7c] do_page_fault+0x50/0x904 [ 0.000000] LR [c0013310] handle_page_fault+0xc/0x38 [ 0.000000] Call Trace: [ 0.000000] Instruction dump: [ 0.000000] be010080 91410014 553fe8fe 3d40c001 3d20f1f1 7d800026 394a3c2c 3fffe000 [ 0.000000] 6129f1f1 900100c4 9181007c 91410018 <913f0000> 3d2001f4 6129f4f4 913f0004 Don't map the early shadow page read-only yet when creating the new page tables for the real shadow memory, otherwise the memblock allocations that immediately follows to create the real shadow pages that are about to replace the early shadow page trigger a page fault if they fall into the region being worked on at the moment. Signed-off-by: Christophe Leroy Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support") Cc: stable@vger.kernel.org Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/fe86886fb8db44360417cee0dc515ad47ca6ef72.1566382750.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index e8ab3cc5f6e4..0e6ed4413eea 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -34,7 +34,7 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l { pmd_t *pmd; unsigned long k_cur, k_next; - pgprot_t prot = kasan_prot_ro(); + pgprot_t prot = slab_is_available() ? kasan_prot_ro() : PAGE_KERNEL; pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); @@ -110,9 +110,22 @@ static int __ref kasan_init_region(void *start, size_t size) static void __init kasan_remap_early_shadow_ro(void) { pgprot_t prot = kasan_prot_ro(); + unsigned long k_start = KASAN_SHADOW_START; + unsigned long k_end = KASAN_SHADOW_END; + unsigned long k_cur; + phys_addr_t pa = __pa(kasan_early_shadow_page); kasan_populate_pte(kasan_early_shadow_pte, prot); + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pte_t *ptep = pte_offset_kernel(pmd, k_cur); + + if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) + continue; + + __set_pte_at(&init_mm, k_cur, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); + } flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); } From 3b442c60cf9766c76bc3c1e44e3e387853074a08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 30 Jul 2019 14:32:29 +0200 Subject: [PATCH 0731/2880] pwm: jz4740: Document known limitations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The JZ4740 PWM implementation doesn't fulfill the (up to now insufficiently documented) requirements of the PWM API. At least document them in the driver. Signed-off-by: Uwe Kleine-König Reviewed-by: Paul Cercueil Signed-off-by: Thierry Reding --- drivers/pwm/pwm-jz4740.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/pwm/pwm-jz4740.c b/drivers/pwm/pwm-jz4740.c index f901e8a0d33d..9d444d012f92 100644 --- a/drivers/pwm/pwm-jz4740.c +++ b/drivers/pwm/pwm-jz4740.c @@ -2,6 +2,11 @@ /* * Copyright (C) 2010, Lars-Peter Clausen * JZ4740 platform PWM support + * + * Limitations: + * - The .apply callback doesn't complete the currently running period before + * reconfiguring the hardware. + * - Each period starts with the inactive part. */ #include From f6960976c465179cd1b64dfae2edfb647820af1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 30 Jul 2019 14:45:27 +0200 Subject: [PATCH 0732/2880] pwm: imx: Document known limitations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-imx27.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pwm/pwm-imx27.c b/drivers/pwm/pwm-imx27.c index 434a351fb626..91c23cbbc167 100644 --- a/drivers/pwm/pwm-imx27.c +++ b/drivers/pwm/pwm-imx27.c @@ -3,6 +3,10 @@ * simple driver for PWM (Pulse Width Modulator) controller * * Derived from pxa PWM driver by eric miao + * + * Limitations: + * - When disabled the output is driven to 0 independent of the configured + * polarity. */ #include From fb5a35dbee8d6bd2ff638b7ebaf540b86860ee0a Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 30 Jul 2019 11:15:36 -0700 Subject: [PATCH 0733/2880] pwm: Remove dev_err() usage after platform_get_irq() We don't need dev_err() messages when platform_get_irq() fails now that platform_get_irq() prints an error message itself when something goes wrong. Let's remove these prints with a simple semantic patch. // @@ expression ret; struct platform_device *E; @@ ret = ( platform_get_irq(E, ...) | platform_get_irq_byname(E, ...) ); if ( \( ret < 0 \| ret <= 0 \) ) { ( -if (ret != -EPROBE_DEFER) -{ ... -dev_err(...); -... } | ... -dev_err(...); ) ... } // While we're here, remove braces on if statements that only have one statement (manually). Cc: Thierry Reding Cc: linux-pwm@vger.kernel.org Cc: Greg Kroah-Hartman Signed-off-by: Stephen Boyd Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sti.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c index 20450e34ad57..1508616d794c 100644 --- a/drivers/pwm/pwm-sti.c +++ b/drivers/pwm/pwm-sti.c @@ -564,10 +564,8 @@ static int sti_pwm_probe(struct platform_device *pdev) return PTR_ERR(pc->regmap); irq = platform_get_irq(pdev, 0); - if (irq < 0) { - dev_err(&pdev->dev, "Failed to obtain IRQ\n"); + if (irq < 0) return irq; - } ret = devm_request_irq(&pdev->dev, irq, sti_pwm_interrupt, 0, pdev->name, pc); From 4b046497341c034c29405295e7b37f2547f44f71 Mon Sep 17 00:00:00 2001 From: Fabien Parent Date: Mon, 5 Aug 2019 14:58:47 +0200 Subject: [PATCH 0734/2880] dt-bindings: pwm: mediatek: Add documentation for MT8516 Add the device-tree documentation for the PWM IP on the MediaTek MT8516 SoCs. Signed-off-by: Fabien Parent Reviewed-by: Rob Herring Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/pwm-mediatek.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt index 991728cb46cb..9152bf5afe56 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt @@ -6,6 +6,7 @@ Required properties: - "mediatek,mt7622-pwm": found on mt7622 SoC. - "mediatek,mt7623-pwm": found on mt7623 SoC. - "mediatek,mt7628-pwm": found on mt7628 SoC. + - "mediatek,mt8516-pwm": found on mt8516 SoC. - reg: physical base address and length of the controller's registers. - #pwm-cells: must be 2. See pwm.txt in this directory for a description of the cell format. From 8d190728fd8e272b733e1575e000fc1982b5d9b2 Mon Sep 17 00:00:00 2001 From: Fabien Parent Date: Mon, 5 Aug 2019 14:58:48 +0200 Subject: [PATCH 0735/2880] pwm: mediatek: Add MT8516 SoC support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the compatible and the platform data to support PWM on the MT8516 SoC. Signed-off-by: Fabien Parent Reviewed-by: Matthias Brugger Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index eb6674ce995f..6697e30811e7 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -302,11 +302,18 @@ static const struct mtk_pwm_platform_data mt7628_pwm_data = { .has_clks = false, }; +static const struct mtk_pwm_platform_data mt8516_pwm_data = { + .num_pwms = 5, + .pwm45_fixup = false, + .has_clks = true, +}; + static const struct of_device_id mtk_pwm_of_match[] = { { .compatible = "mediatek,mt2712-pwm", .data = &mt2712_pwm_data }, { .compatible = "mediatek,mt7622-pwm", .data = &mt7622_pwm_data }, { .compatible = "mediatek,mt7623-pwm", .data = &mt7623_pwm_data }, { .compatible = "mediatek,mt7628-pwm", .data = &mt7628_pwm_data }, + { .compatible = "mediatek,mt8516-pwm", .data = &mt8516_pwm_data }, { }, }; MODULE_DEVICE_TABLE(of, mtk_pwm_of_match); From bdaadd5948179f07ca8b508a62a8f8b00ece51ed Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 14 Aug 2019 20:46:10 +0800 Subject: [PATCH 0736/2880] dt-bindings: pwm: sprd: Add Spreadtrum PWM documentation Add Spreadtrum PWM controller documentation. Signed-off-by: Baolin Wang Reviewed-by: Rob Herring Signed-off-by: Thierry Reding --- .../devicetree/bindings/pwm/pwm-sprd.txt | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 Documentation/devicetree/bindings/pwm/pwm-sprd.txt diff --git a/Documentation/devicetree/bindings/pwm/pwm-sprd.txt b/Documentation/devicetree/bindings/pwm/pwm-sprd.txt new file mode 100644 index 000000000000..16fa5a096206 --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/pwm-sprd.txt @@ -0,0 +1,40 @@ +Spreadtrum PWM controller + +Spreadtrum SoCs PWM controller provides 4 PWM channels. + +Required properties: +- compatible : Should be "sprd,ums512-pwm". +- reg: Physical base address and length of the controller's registers. +- clocks: The phandle and specifier referencing the controller's clocks. +- clock-names: Should contain following entries: + "pwmn": used to derive the functional clock for PWM channel n (n range: 0 ~ 3). + "enablen": for PWM channel n enable clock (n range: 0 ~ 3). +- #pwm-cells: Should be 2. See pwm.txt in this directory for a description of + the cells format. + +Optional properties: +- assigned-clocks: Reference to the PWM clock entries. +- assigned-clock-parents: The phandle of the parent clock of PWM clock. + +Example: + pwms: pwm@32260000 { + compatible = "sprd,ums512-pwm"; + reg = <0 0x32260000 0 0x10000>; + clock-names = "pwm0", "enable0", + "pwm1", "enable1", + "pwm2", "enable2", + "pwm3", "enable3"; + clocks = <&aon_clk CLK_PWM0>, <&aonapb_gate CLK_PWM0_EB>, + <&aon_clk CLK_PWM1>, <&aonapb_gate CLK_PWM1_EB>, + <&aon_clk CLK_PWM2>, <&aonapb_gate CLK_PWM2_EB>, + <&aon_clk CLK_PWM3>, <&aonapb_gate CLK_PWM3_EB>; + assigned-clocks = <&aon_clk CLK_PWM0>, + <&aon_clk CLK_PWM1>, + <&aon_clk CLK_PWM2>, + <&aon_clk CLK_PWM3>; + assigned-clock-parents = <&ext_26m>, + <&ext_26m>, + <&ext_26m>, + <&ext_26m>; + #pwm-cells = <2>; + }; From 8aae4b02e8a6dd138afd2b54d9984d17685b0364 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 14 Aug 2019 20:46:11 +0800 Subject: [PATCH 0737/2880] pwm: sprd: Add Spreadtrum PWM support This patch adds the Spreadtrum PWM support, which provides maximum 4 channels. Signed-off-by: Neo Hou Signed-off-by: Baolin Wang Signed-off-by: Thierry Reding --- drivers/pwm/Kconfig | 11 ++ drivers/pwm/Makefile | 1 + drivers/pwm/pwm-sprd.c | 309 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 321 insertions(+) create mode 100644 drivers/pwm/pwm-sprd.c diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index a7e57516959e..31dfc88ef362 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -423,6 +423,17 @@ config PWM_SPEAR To compile this driver as a module, choose M here: the module will be called pwm-spear. +config PWM_SPRD + tristate "Spreadtrum PWM support" + depends on ARCH_SPRD || COMPILE_TEST + depends on HAS_IOMEM + help + Generic PWM framework driver for the PWM controller on + Spreadtrum SoCs. + + To compile this driver as a module, choose M here: the module + will be called pwm-sprd. + config PWM_STI tristate "STiH4xx PWM support" depends on ARCH_STI diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index 76b555b51887..26326adf71d7 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_PWM_ROCKCHIP) += pwm-rockchip.o obj-$(CONFIG_PWM_SAMSUNG) += pwm-samsung.o obj-$(CONFIG_PWM_SIFIVE) += pwm-sifive.o obj-$(CONFIG_PWM_SPEAR) += pwm-spear.o +obj-$(CONFIG_PWM_SPRD) += pwm-sprd.o obj-$(CONFIG_PWM_STI) += pwm-sti.o obj-$(CONFIG_PWM_STM32) += pwm-stm32.o obj-$(CONFIG_PWM_STM32_LP) += pwm-stm32-lp.o diff --git a/drivers/pwm/pwm-sprd.c b/drivers/pwm/pwm-sprd.c new file mode 100644 index 000000000000..68c2d9f0411b --- /dev/null +++ b/drivers/pwm/pwm-sprd.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Spreadtrum Communications Inc. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define SPRD_PWM_PRESCALE 0x0 +#define SPRD_PWM_MOD 0x4 +#define SPRD_PWM_DUTY 0x8 +#define SPRD_PWM_ENABLE 0x18 + +#define SPRD_PWM_MOD_MAX GENMASK(7, 0) +#define SPRD_PWM_DUTY_MSK GENMASK(15, 0) +#define SPRD_PWM_PRESCALE_MSK GENMASK(7, 0) +#define SPRD_PWM_ENABLE_BIT BIT(0) + +#define SPRD_PWM_CHN_NUM 4 +#define SPRD_PWM_REGS_SHIFT 5 +#define SPRD_PWM_CHN_CLKS_NUM 2 +#define SPRD_PWM_CHN_OUTPUT_CLK 1 + +struct sprd_pwm_chn { + struct clk_bulk_data clks[SPRD_PWM_CHN_CLKS_NUM]; + u32 clk_rate; +}; + +struct sprd_pwm_chip { + void __iomem *base; + struct device *dev; + struct pwm_chip chip; + int num_pwms; + struct sprd_pwm_chn chn[SPRD_PWM_CHN_NUM]; +}; + +/* + * The list of clocks required by PWM channels, and each channel has 2 clocks: + * enable clock and pwm clock. + */ +static const char * const sprd_pwm_clks[] = { + "enable0", "pwm0", + "enable1", "pwm1", + "enable2", "pwm2", + "enable3", "pwm3", +}; + +static u32 sprd_pwm_read(struct sprd_pwm_chip *spc, u32 hwid, u32 reg) +{ + u32 offset = reg + (hwid << SPRD_PWM_REGS_SHIFT); + + return readl_relaxed(spc->base + offset); +} + +static void sprd_pwm_write(struct sprd_pwm_chip *spc, u32 hwid, + u32 reg, u32 val) +{ + u32 offset = reg + (hwid << SPRD_PWM_REGS_SHIFT); + + writel_relaxed(val, spc->base + offset); +} + +static void sprd_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct sprd_pwm_chip *spc = + container_of(chip, struct sprd_pwm_chip, chip); + struct sprd_pwm_chn *chn = &spc->chn[pwm->hwpwm]; + u32 val, duty, prescale; + u64 tmp; + int ret; + + /* + * The clocks to PWM channel has to be enabled first before + * reading to the registers. + */ + ret = clk_bulk_prepare_enable(SPRD_PWM_CHN_CLKS_NUM, chn->clks); + if (ret) { + dev_err(spc->dev, "failed to enable pwm%u clocks\n", + pwm->hwpwm); + return; + } + + val = sprd_pwm_read(spc, pwm->hwpwm, SPRD_PWM_ENABLE); + if (val & SPRD_PWM_ENABLE_BIT) + state->enabled = true; + else + state->enabled = false; + + /* + * The hardware provides a counter that is feed by the source clock. + * The period length is (PRESCALE + 1) * MOD counter steps. + * The duty cycle length is (PRESCALE + 1) * DUTY counter steps. + * Thus the period_ns and duty_ns calculation formula should be: + * period_ns = NSEC_PER_SEC * (prescale + 1) * mod / clk_rate + * duty_ns = NSEC_PER_SEC * (prescale + 1) * duty / clk_rate + */ + val = sprd_pwm_read(spc, pwm->hwpwm, SPRD_PWM_PRESCALE); + prescale = val & SPRD_PWM_PRESCALE_MSK; + tmp = (prescale + 1) * NSEC_PER_SEC * SPRD_PWM_MOD_MAX; + state->period = DIV_ROUND_CLOSEST_ULL(tmp, chn->clk_rate); + + val = sprd_pwm_read(spc, pwm->hwpwm, SPRD_PWM_DUTY); + duty = val & SPRD_PWM_DUTY_MSK; + tmp = (prescale + 1) * NSEC_PER_SEC * duty; + state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, chn->clk_rate); + + /* Disable PWM clocks if the PWM channel is not in enable state. */ + if (!state->enabled) + clk_bulk_disable_unprepare(SPRD_PWM_CHN_CLKS_NUM, chn->clks); +} + +static int sprd_pwm_config(struct sprd_pwm_chip *spc, struct pwm_device *pwm, + int duty_ns, int period_ns) +{ + struct sprd_pwm_chn *chn = &spc->chn[pwm->hwpwm]; + u32 prescale, duty; + u64 tmp; + + /* + * The hardware provides a counter that is feed by the source clock. + * The period length is (PRESCALE + 1) * MOD counter steps. + * The duty cycle length is (PRESCALE + 1) * DUTY counter steps. + * + * To keep the maths simple we're always using MOD = SPRD_PWM_MOD_MAX. + * The value for PRESCALE is selected such that the resulting period + * gets the maximal length not bigger than the requested one with the + * given settings (MOD = SPRD_PWM_MOD_MAX and input clock). + */ + duty = duty_ns * SPRD_PWM_MOD_MAX / period_ns; + + tmp = (u64)chn->clk_rate * period_ns; + do_div(tmp, NSEC_PER_SEC); + prescale = DIV_ROUND_CLOSEST_ULL(tmp, SPRD_PWM_MOD_MAX) - 1; + if (prescale > SPRD_PWM_PRESCALE_MSK) + prescale = SPRD_PWM_PRESCALE_MSK; + + /* + * Note: Writing DUTY triggers the hardware to actually apply the + * values written to MOD and DUTY to the output, so must keep writing + * DUTY last. + * + * The hardware can ensures that current running period is completed + * before changing a new configuration to avoid mixed settings. + */ + sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_PRESCALE, prescale); + sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_MOD, SPRD_PWM_MOD_MAX); + sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_DUTY, duty); + + return 0; +} + +static int sprd_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct sprd_pwm_chip *spc = + container_of(chip, struct sprd_pwm_chip, chip); + struct sprd_pwm_chn *chn = &spc->chn[pwm->hwpwm]; + struct pwm_state *cstate = &pwm->state; + int ret; + + if (state->enabled) { + if (!cstate->enabled) { + /* + * The clocks to PWM channel has to be enabled first + * before writing to the registers. + */ + ret = clk_bulk_prepare_enable(SPRD_PWM_CHN_CLKS_NUM, + chn->clks); + if (ret) { + dev_err(spc->dev, + "failed to enable pwm%u clocks\n", + pwm->hwpwm); + return ret; + } + } + + if (state->period != cstate->period || + state->duty_cycle != cstate->duty_cycle) { + ret = sprd_pwm_config(spc, pwm, state->duty_cycle, + state->period); + if (ret) + return ret; + } + + sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_ENABLE, 1); + } else if (cstate->enabled) { + /* + * Note: After setting SPRD_PWM_ENABLE to zero, the controller + * will not wait for current period to be completed, instead it + * will stop the PWM channel immediately. + */ + sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_ENABLE, 0); + + clk_bulk_disable_unprepare(SPRD_PWM_CHN_CLKS_NUM, chn->clks); + } + + return 0; +} + +static const struct pwm_ops sprd_pwm_ops = { + .apply = sprd_pwm_apply, + .get_state = sprd_pwm_get_state, + .owner = THIS_MODULE, +}; + +static int sprd_pwm_clk_init(struct sprd_pwm_chip *spc) +{ + struct clk *clk_pwm; + int ret, i; + + for (i = 0; i < SPRD_PWM_CHN_NUM; i++) { + struct sprd_pwm_chn *chn = &spc->chn[i]; + int j; + + for (j = 0; j < SPRD_PWM_CHN_CLKS_NUM; ++j) + chn->clks[j].id = + sprd_pwm_clks[i * SPRD_PWM_CHN_CLKS_NUM + j]; + + ret = devm_clk_bulk_get(spc->dev, SPRD_PWM_CHN_CLKS_NUM, + chn->clks); + if (ret) { + if (ret == -ENOENT) + break; + + if (ret != -EPROBE_DEFER) + dev_err(spc->dev, + "failed to get channel clocks\n"); + + return ret; + } + + clk_pwm = chn->clks[SPRD_PWM_CHN_OUTPUT_CLK].clk; + chn->clk_rate = clk_get_rate(clk_pwm); + } + + if (!i) { + dev_err(spc->dev, "no available PWM channels\n"); + return -ENODEV; + } + + spc->num_pwms = i; + + return 0; +} + +static int sprd_pwm_probe(struct platform_device *pdev) +{ + struct sprd_pwm_chip *spc; + int ret; + + spc = devm_kzalloc(&pdev->dev, sizeof(*spc), GFP_KERNEL); + if (!spc) + return -ENOMEM; + + spc->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(spc->base)) + return PTR_ERR(spc->base); + + spc->dev = &pdev->dev; + platform_set_drvdata(pdev, spc); + + ret = sprd_pwm_clk_init(spc); + if (ret) + return ret; + + spc->chip.dev = &pdev->dev; + spc->chip.ops = &sprd_pwm_ops; + spc->chip.base = -1; + spc->chip.npwm = spc->num_pwms; + + ret = pwmchip_add(&spc->chip); + if (ret) + dev_err(&pdev->dev, "failed to add PWM chip\n"); + + return ret; +} + +static int sprd_pwm_remove(struct platform_device *pdev) +{ + struct sprd_pwm_chip *spc = platform_get_drvdata(pdev); + + return pwmchip_remove(&spc->chip); +} + +static const struct of_device_id sprd_pwm_of_match[] = { + { .compatible = "sprd,ums512-pwm", }, + { }, +}; +MODULE_DEVICE_TABLE(of, sprd_pwm_of_match); + +static struct platform_driver sprd_pwm_driver = { + .driver = { + .name = "sprd-pwm", + .of_match_table = sprd_pwm_of_match, + }, + .probe = sprd_pwm_probe, + .remove = sprd_pwm_remove, +}; + +module_platform_driver(sprd_pwm_driver); + +MODULE_DESCRIPTION("Spreadtrum PWM Driver"); +MODULE_LICENSE("GPL v2"); From c79468b8955b4924b5aca858247395265201ac42 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 19 Aug 2019 15:20:12 +0900 Subject: [PATCH 0738/2880] pwm: rcar: Remove a redundant condition in rcar_pwm_apply() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the rcar_pwm_apply() has already checked whether state->enabled is set or not, this patch removes a redundant condition. Signed-off-by: Yoshihiro Shimoda Reviewed-by: Geert Uytterhoeven Reviewed-by: Uwe Kleine-König Reviewed-by: Simon Horman Signed-off-by: Thierry Reding --- drivers/pwm/pwm-rcar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-rcar.c b/drivers/pwm/pwm-rcar.c index 5b2b8ecc354c..c8cd43f91efc 100644 --- a/drivers/pwm/pwm-rcar.c +++ b/drivers/pwm/pwm-rcar.c @@ -187,7 +187,7 @@ static int rcar_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, /* The SYNC should be set to 0 even if rcar_pwm_set_counter failed */ rcar_pwm_update(rp, RCAR_PWMCR_SYNC, 0, RCAR_PWMCR); - if (!ret && state->enabled) + if (!ret) ret = rcar_pwm_enable(rp); return ret; From 4537e52a526698f503dcb151234414cfa46d534b Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sat, 24 Aug 2019 16:09:46 +0200 Subject: [PATCH 0739/2880] pwm: bcm2835: Suppress error message for invalid period_ns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PWM config can be triggered via sysfs, so we better suppress the error message in case of an invalid period to avoid kernel log spamming. Signed-off-by: Stefan Wahren Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-bcm2835.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c index f6fe0b922e1e..52763063c100 100644 --- a/drivers/pwm/pwm-bcm2835.c +++ b/drivers/pwm/pwm-bcm2835.c @@ -72,11 +72,8 @@ static int bcm2835_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, scaler = DIV_ROUND_CLOSEST(NSEC_PER_SEC, rate); - if (period_ns <= MIN_PERIOD) { - dev_err(pc->dev, "period %d not supported, minimum %d\n", - period_ns, MIN_PERIOD); + if (period_ns <= MIN_PERIOD) return -EINVAL; - } writel(DIV_ROUND_CLOSEST(duty_ns, scaler), pc->base + DUTY(pwm->hwpwm)); From 7e9713af316187f323b25ebf7eb1a8f03425e7e9 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sat, 24 Aug 2019 16:09:47 +0200 Subject: [PATCH 0740/2880] pwm: bcm2835: Fix period_ns range check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The range check for period_ns was written under assumption of a fixed PWM clock. With clk-bcm2835 driver the PWM clock is a dynamic one. So fix this by doing the range check on the period register value. Signed-off-by: Stefan Wahren Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-bcm2835.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c index 52763063c100..2c82386c0ebf 100644 --- a/drivers/pwm/pwm-bcm2835.c +++ b/drivers/pwm/pwm-bcm2835.c @@ -21,7 +21,7 @@ #define PERIOD(x) (((x) * 0x10) + 0x10) #define DUTY(x) (((x) * 0x10) + 0x14) -#define MIN_PERIOD 108 /* 9.2 MHz max. PWM clock */ +#define PERIOD_MIN 0x2 struct bcm2835_pwm { struct pwm_chip chip; @@ -64,6 +64,7 @@ static int bcm2835_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, struct bcm2835_pwm *pc = to_bcm2835_pwm(chip); unsigned long rate = clk_get_rate(pc->clk); unsigned long scaler; + u32 period; if (!rate) { dev_err(pc->dev, "failed to get clock rate\n"); @@ -71,14 +72,14 @@ static int bcm2835_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, } scaler = DIV_ROUND_CLOSEST(NSEC_PER_SEC, rate); + period = DIV_ROUND_CLOSEST(period_ns, scaler); - if (period_ns <= MIN_PERIOD) + if (period < PERIOD_MIN) return -EINVAL; writel(DIV_ROUND_CLOSEST(duty_ns, scaler), pc->base + DUTY(pwm->hwpwm)); - writel(DIV_ROUND_CLOSEST(period_ns, scaler), - pc->base + PERIOD(pwm->hwpwm)); + writel(period, pc->base + PERIOD(pwm->hwpwm)); return 0; } From 9e3ca01f7e58ef4662c18adcb766ee3422ace0fb Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sat, 24 Aug 2019 16:09:48 +0200 Subject: [PATCH 0741/2880] pwm: bcm2835: Suppress error message during deferred probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This suppresses error messages in case the PWM clock isn't ready yet. Signed-off-by: Stefan Wahren Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-bcm2835.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c index 2c82386c0ebf..91e24f01b54e 100644 --- a/drivers/pwm/pwm-bcm2835.c +++ b/drivers/pwm/pwm-bcm2835.c @@ -153,8 +153,11 @@ static int bcm2835_pwm_probe(struct platform_device *pdev) pc->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(pc->clk)) { - dev_err(&pdev->dev, "clock not found: %ld\n", PTR_ERR(pc->clk)); - return PTR_ERR(pc->clk); + ret = PTR_ERR(pc->clk); + if (ret != -EPROBE_DEFER) + dev_err(&pdev->dev, "clock not found: %d\n", ret); + + return ret; } ret = clk_prepare_enable(pc->clk); From ba73deb16ff5b2d70b1ffc025c84c1126aad7fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 2 Sep 2019 16:39:41 +0200 Subject: [PATCH 0742/2880] pwm: rockchip: Set polarity unconditionally in .get_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't rely on *state being zero initialized and PWM_POLARITY_NORMAL being zero. So always assign .polarity. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-rockchip.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 51b96cb7dd25..8eb2db59741d 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -90,10 +90,10 @@ static void rockchip_pwm_get_state(struct pwm_chip *chip, state->enabled = ((val & enable_conf) == enable_conf) ? true : false; - if (pc->data->supports_polarity) { - if (!(val & PWM_DUTY_POSITIVE)) - state->polarity = PWM_POLARITY_INVERSED; - } + if (pc->data->supports_polarity && !(val & PWM_DUTY_POSITIVE)) + state->polarity = PWM_POLARITY_INVERSED; + else + state->polarity = PWM_POLARITY_NORMAL; clk_disable(pc->pclk); } From fc3c5512e337bdb8b019883ea9078bbccc00c4e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 24 Aug 2019 17:37:02 +0200 Subject: [PATCH 0743/2880] pwm: Introduce local struct pwm_chip in pwm_apply_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pwm->chip is dereferenced several times in the pwm_apply_state() function. Introducing a local variable for it helps keeping some lines a bit shorter. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 8edfac17364e..4ab683a30629 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -454,20 +454,23 @@ EXPORT_SYMBOL_GPL(pwm_free); */ int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state) { + struct pwm_chip *chip; int err; if (!pwm || !state || !state->period || state->duty_cycle > state->period) return -EINVAL; + chip = pwm->chip; + if (state->period == pwm->state.period && state->duty_cycle == pwm->state.duty_cycle && state->polarity == pwm->state.polarity && state->enabled == pwm->state.enabled) return 0; - if (pwm->chip->ops->apply) { - err = pwm->chip->ops->apply(pwm->chip, pwm, state); + if (chip->ops->apply) { + err = chip->ops->apply(chip, pwm, state); if (err) return err; @@ -477,7 +480,7 @@ int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state) * FIXME: restore the initial state in case of error. */ if (state->polarity != pwm->state.polarity) { - if (!pwm->chip->ops->set_polarity) + if (!chip->ops->set_polarity) return -ENOTSUPP; /* @@ -486,12 +489,12 @@ int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state) * ->apply(). */ if (pwm->state.enabled) { - pwm->chip->ops->disable(pwm->chip, pwm); + chip->ops->disable(chip, pwm); pwm->state.enabled = false; } - err = pwm->chip->ops->set_polarity(pwm->chip, pwm, - state->polarity); + err = chip->ops->set_polarity(chip, pwm, + state->polarity); if (err) return err; @@ -500,9 +503,9 @@ int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state) if (state->period != pwm->state.period || state->duty_cycle != pwm->state.duty_cycle) { - err = pwm->chip->ops->config(pwm->chip, pwm, - state->duty_cycle, - state->period); + err = chip->ops->config(pwm->chip, pwm, + state->duty_cycle, + state->period); if (err) return err; @@ -512,11 +515,11 @@ int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state) if (state->enabled != pwm->state.enabled) { if (state->enabled) { - err = pwm->chip->ops->enable(pwm->chip, pwm); + err = chip->ops->enable(chip, pwm); if (err) return err; } else { - pwm->chip->ops->disable(pwm->chip, pwm); + chip->ops->disable(chip, pwm); } pwm->state.enabled = state->enabled; From 01ccf903edd65f6421612321648fa5a7f4b7cb10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 24 Aug 2019 17:37:03 +0200 Subject: [PATCH 0744/2880] pwm: Let pwm_get_state() return the last implemented state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When pwm_apply_state() is called the lowlevel driver usually has to apply some rounding because the hardware doesn't support nanosecond resolution. So let pwm_get_state() return the actually implemented state instead of the last applied one if possible. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 4ab683a30629..449ba161877d 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -474,7 +474,14 @@ int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state) if (err) return err; - pwm->state = *state; + /* + * .apply might have to round some values in *state, if possible + * read the actually implemented value back. + */ + if (chip->ops->get_state) + chip->ops->get_state(chip, pwm, &pwm->state); + else + pwm->state = *state; } else { /* * FIXME: restore the initial state in case of error. From 797a5ebc26daee5552e668ee4622bc3f47c1f743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 24 Aug 2019 17:37:04 +0200 Subject: [PATCH 0745/2880] pwm: rockchip: Don't update the state for the caller of pwm_apply_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pwm-rockchip driver is one of only three PWM drivers which updates the state for the caller of pwm_apply_state(). This might have surprising results if the caller reuses the values expecting them to still represent the same state. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-rockchip.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 8eb2db59741d..83c7627868d8 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -212,12 +212,6 @@ static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, goto out; } - /* - * Update the state with the real hardware, which can differ a bit - * because of period/duty_cycle approximation. - */ - rockchip_pwm_get_state(chip, pwm, state); - out: clk_disable(pc->pclk); From deb9c462f4e539cc7f8389b9855eb7a507c78e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 24 Aug 2019 17:37:05 +0200 Subject: [PATCH 0746/2880] pwm: sun4i: Don't update the state for the caller of pwm_apply_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pwm-sun4i driver is one of only three PWM drivers which updates the state for the caller of pwm_apply_state(). This might have surprising results if the caller reuses the values expecting them to still represent the same state. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sun4i.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index de78c824bbfd..39007a7c0d83 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -192,12 +192,6 @@ static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4i_pwm, *dty = div; *prsclr = prescaler; - div = (u64)pval * NSEC_PER_SEC * *prd; - state->period = DIV_ROUND_CLOSEST_ULL(div, clk_rate); - - div = (u64)pval * NSEC_PER_SEC * *dty; - state->duty_cycle = DIV_ROUND_CLOSEST_ULL(div, clk_rate); - return 0; } From c9675829ba4b0e95c613f6d6d83d2b5cb9c5371c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 24 Aug 2019 17:37:06 +0200 Subject: [PATCH 0747/2880] pwm: fsl-ftm: Don't update the state for the caller of pwm_apply_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pwm-fsl-ftm driver is one of only three PWM drivers which updates the state for the caller of pwm_apply_state(). This might have surprising results if the caller reuses the values expecting them to still represent the same state. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-fsl-ftm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c index 9d31a217111d..3c9738617ceb 100644 --- a/drivers/pwm/pwm-fsl-ftm.c +++ b/drivers/pwm/pwm-fsl-ftm.c @@ -292,10 +292,6 @@ static int fsl_pwm_apply_config(struct fsl_pwm_chip *fpc, regmap_update_bits(fpc->regmap, FTM_POL, BIT(pwm->hwpwm), reg_polarity); - newstate->period = fsl_pwm_ticks_to_ns(fpc, - fpc->period.mod_period + 1); - newstate->duty_cycle = fsl_pwm_ticks_to_ns(fpc, duty); - ftm_set_write_protection(fpc); return 0; From 92974a1d006ad8b30d53047c70974c9e065eb7df Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 17 Sep 2019 11:30:55 +0200 Subject: [PATCH 0748/2880] net/sched: act_sample: don't push mac header on ip6gre ingress current 'sample' action doesn't push the mac header of ingress packets if they are received by a layer 3 tunnel (like gre or sit); but it forgot to check for gre over ipv6, so the following script: # tc q a dev $d clsact # tc f a dev $d ingress protocol ip flower ip_proto icmp action sample \ > group 100 rate 1 # psample -v -g 100 dumps everything, including outer header and mac, when $d is a gre tunnel over ipv6. Fix this adding a missing label for ARPHRD_IP6GRE devices. Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action") Signed-off-by: Davide Caratti Reviewed-by: Yotam Gigi Signed-off-by: Jakub Kicinski --- net/sched/act_sample.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 692c4c9040fd..514456a0b9a8 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -146,6 +146,7 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev) case ARPHRD_TUNNEL6: case ARPHRD_SIT: case ARPHRD_IPGRE: + case ARPHRD_IP6GRE: case ARPHRD_VOID: case ARPHRD_NONE: return false; From 9e5c8d39b88c15f4ec3ab24a73e09c112f209344 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 17 Sep 2019 13:30:52 +0300 Subject: [PATCH 0749/2880] dt-bindings: net: dwmac: fix 'mac-mode' type The 'mac-mode' property is similar to 'phy-mode' and 'phy-connection-type', which are enums of mode strings. The 'dwmac' driver supports almost all modes declared in the 'phy-mode' enum (except for 1 or 2). But in general, there may be a case where 'mac-mode' becomes more generic and is moved as part of phylib or netdev. In any case, the 'mac-mode' field should be made an enum, and it also makes sense to just reference the 'phy-connection-type' from 'ethernet-controller.yaml'. That will also make it more future-proof for new modes. Fixes: 9c15d3597c62 ("dt-bindings: net: dwmac: document 'mac-mode' property") Signed-off-by: Alexandru Ardelean Reviewed-by: Andrew Lunn Reviewed-by: Rob Herring Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/snps,dwmac.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index ebe4537a7cce..4845e29411e4 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -113,7 +113,7 @@ properties: const: stmmaceth mac-mode: - maxItems: 1 + $ref: ethernet-controller.yaml#/properties/phy-connection-type description: The property is identical to 'phy-mode', and assumes that there is mode converter in-between the MAC & PHY (e.g. GMII-to-RGMII). This converter From 0360894a05ed52be268e3c4d40b2df9d94975fa6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Sep 2019 10:30:21 -0700 Subject: [PATCH 0750/2880] selftests: Update fib_tests to handle missing ping6 Some distributions (e.g., debian buster) do not install ping6. Re-use the hook in pmtu.sh to detect this and fallback to ping. Fixes: a0e11da78f48 ("fib_tests: Add tests for metrics on routes") Signed-off-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_tests.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 4465fc2dae14..cba83a12da82 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -17,6 +17,8 @@ PAUSE=no IP="ip -netns ns1" NS_EXEC="ip netns exec ns1" +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + log_test() { local rc=$1 @@ -1086,7 +1088,7 @@ ipv6_route_metrics_test() log_test $rc 0 "Multipath route with mtu metric" $IP -6 ro add 2001:db8:104::/64 via 2001:db8:101::2 mtu 1300 - run_cmd "ip netns exec ns1 ping6 -w1 -c1 -s 1500 2001:db8:104::1" + run_cmd "ip netns exec ns1 ${ping6} -w1 -c1 -s 1500 2001:db8:104::1" log_test $? 0 "Using route with mtu metric" run_cmd "$IP -6 ro add 2001:db8:114::/64 via 2001:db8:101::2 congctl lock foo" From e84622ce24482f6e9c1bf29d3bdd556eb587ff41 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Sep 2019 10:30:35 -0700 Subject: [PATCH 0751/2880] selftests: Update fib_nexthop_multiprefix to handle missing ping6 Some distributions (e.g., debian buster) do not install ping6. Re-use the hook in pmtu.sh to detect this and fallback to ping. Fixes: 735ab2f65dce ("selftests: Add test with multiple prefixes using single nexthop") Signed-off-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthop_multiprefix.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/fib_nexthop_multiprefix.sh b/tools/testing/selftests/net/fib_nexthop_multiprefix.sh index e6828732843e..9dc35a16e415 100755 --- a/tools/testing/selftests/net/fib_nexthop_multiprefix.sh +++ b/tools/testing/selftests/net/fib_nexthop_multiprefix.sh @@ -15,6 +15,8 @@ PAUSE_ON_FAIL=no VERBOSE=0 +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + ################################################################################ # helpers @@ -200,7 +202,7 @@ validate_v6_exception() local rc if [ ${ping_sz} != "0" ]; then - run_cmd ip netns exec h0 ping6 -s ${ping_sz} -c5 -w5 ${dst} + run_cmd ip netns exec h0 ${ping6} -s ${ping_sz} -c5 -w5 ${dst} fi if [ "$VERBOSE" = "1" ]; then @@ -243,7 +245,7 @@ do run_cmd taskset -c ${c} ip netns exec h0 ping -c1 -w1 172.16.10${i}.1 [ $? -ne 0 ] && printf "\nERROR: ping to h${i} failed\n" && ret=1 - run_cmd taskset -c ${c} ip netns exec h0 ping6 -c1 -w1 2001:db8:10${i}::1 + run_cmd taskset -c ${c} ip netns exec h0 ${ping6} -c1 -w1 2001:db8:10${i}::1 [ $? -ne 0 ] && printf "\nERROR: ping6 to h${i} failed\n" && ret=1 [ $ret -ne 0 ] && break From 77d5bc7e6a6cf8bbeca31aab7f0c5449a5eee762 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Sep 2019 10:39:49 -0700 Subject: [PATCH 0752/2880] ipv4: Revert removal of rt_uses_gateway Julian noted that rt_uses_gateway has a more subtle use than 'is gateway set': https://lore.kernel.org/netdev/alpine.LFD.2.21.1909151104060.2546@ja.home.ssi.bg/ Revert that part of the commit referenced in the Fixes tag. Currently, there are no u8 holes in 'struct rtable'. There is a 4-byte hole in the second cacheline which contains the gateway declaration. So move rt_gw_family down to the gateway declarations since they are always used together, and then re-use that u8 for rt_uses_gateway. End result is that rtable size is unchanged. Fixes: 1550c171935d ("ipv4: Prepare rtable for IPv6 gateway") Reported-by: Julian Anastasov Signed-off-by: David Ahern Reviewed-by: Julian Anastasov Signed-off-by: Jakub Kicinski --- drivers/infiniband/core/addr.c | 2 +- include/net/route.h | 3 ++- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/route.c | 34 +++++++++++++++++++-------------- net/ipv4/xfrm4_policy.c | 1 + 7 files changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9b76a8fcdd24..bf539c34ccd3 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -352,7 +352,7 @@ static bool has_gateway(const struct dst_entry *dst, sa_family_t family) if (family == AF_INET) { rt = container_of(dst, struct rtable, dst); - return rt->rt_gw_family == AF_INET; + return rt->rt_uses_gateway; } rt6 = container_of(dst, struct rt6_info, dst); diff --git a/include/net/route.h b/include/net/route.h index dfce19c9fa96..6c516840380d 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -53,10 +53,11 @@ struct rtable { unsigned int rt_flags; __u16 rt_type; __u8 rt_is_input; - u8 rt_gw_family; + __u8 rt_uses_gateway; int rt_iif; + u8 rt_gw_family; /* Info on neighbour */ union { __be32 rt_gw4; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f5c163d4771b..a9183543ca30 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -560,7 +560,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; @@ -598,7 +598,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 06f6f280b9ff..00ec819f949b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_gw_family) + if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; IPCB(skb)->flags |= IPSKB_FORWARDED; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5eb73775c3f7..a77c3a4c24de 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -499,7 +499,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b6a6f18c3dd1..7dcce724c78b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -635,6 +635,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } @@ -1313,7 +1314,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_gw_family && mtu > 576) + if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1569,6 +1570,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { + rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) @@ -1634,6 +1636,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; + rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); @@ -2694,6 +2697,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; + rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; @@ -2778,21 +2782,23 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_gw_family == AF_INET && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { - goto nla_put_failure; - } else if (rt->rt_gw_family == AF_INET6) { - int alen = sizeof(struct in6_addr); - struct nlattr *nla; - struct rtvia *via; - - nla = nla_reserve(skb, RTA_VIA, alen + 2); - if (!nla) + if (rt->rt_uses_gateway) { + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; - via = nla_data(nla); - via->rtvia_family = AF_INET6; - memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } } expires = rt->dst.expires; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index cdef8f9a3b01..35b84b52b702 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -85,6 +85,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) xdst->u.rt.rt_gw4 = rt->rt_gw4; From 71523d1812aca61e32e742e87ec064e3d8c615e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 24 Aug 2019 17:37:07 +0200 Subject: [PATCH 0753/2880] pwm: Ensure pwm_apply_state() doesn't modify the state argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is surprising for a PWM consumer when the variable holding the requested state is modified by pwm_apply_state(). Consider for example a driver doing: #define PERIOD 5000000 #define DUTY_LITTLE 10 ... struct pwm_state state = { .period = PERIOD, .duty_cycle = DUTY_LITTLE, .polarity = PWM_POLARITY_NORMAL, .enabled = true, }; pwm_apply_state(mypwm, &state); ... state.duty_cycle = PERIOD / 2; pwm_apply_state(mypwm, &state); For sure the second call to pwm_apply_state() should still have state.period = PERIOD and not something the hardware driver chose for a reason that doesn't necessarily apply to the second call. So declare the state argument as a pointer to a const type and adapt all drivers' .apply callbacks. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/gpio/gpio-mvebu.c | 2 +- drivers/pwm/core.c | 6 ++---- drivers/pwm/pwm-atmel-hlcdc.c | 2 +- drivers/pwm/pwm-atmel.c | 2 +- drivers/pwm/pwm-bcm-iproc.c | 2 +- drivers/pwm/pwm-cros-ec.c | 2 +- drivers/pwm/pwm-fsl-ftm.c | 4 ++-- drivers/pwm/pwm-hibvt.c | 2 +- drivers/pwm/pwm-imx-tpm.c | 4 ++-- drivers/pwm/pwm-imx27.c | 2 +- drivers/pwm/pwm-jz4740.c | 2 +- drivers/pwm/pwm-lpss.c | 2 +- drivers/pwm/pwm-meson.c | 4 ++-- drivers/pwm/pwm-rcar.c | 2 +- drivers/pwm/pwm-rockchip.c | 4 ++-- drivers/pwm/pwm-sifive.c | 2 +- drivers/pwm/pwm-sprd.c | 2 +- drivers/pwm/pwm-stm32-lp.c | 2 +- drivers/pwm/pwm-stm32.c | 4 ++-- drivers/pwm/pwm-sun4i.c | 4 ++-- drivers/pwm/pwm-zx.c | 2 +- include/linux/pwm.h | 4 ++-- 22 files changed, 30 insertions(+), 32 deletions(-) diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index 869d47f89599..6c0687694341 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -694,7 +694,7 @@ static void mvebu_pwm_get_state(struct pwm_chip *chip, } static int mvebu_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct mvebu_pwm *mvpwm = to_mvebu_pwm(chip); struct mvebu_gpio_chip *mvchip = mvpwm->mvchip; diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 449ba161877d..6ad51aa60c03 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -448,11 +448,9 @@ EXPORT_SYMBOL_GPL(pwm_free); /** * pwm_apply_state() - atomically apply a new state to a PWM device * @pwm: PWM device - * @state: new state to apply. This can be adjusted by the PWM driver - * if the requested config is not achievable, for example, - * ->duty_cycle and ->period might be approximated. + * @state: new state to apply */ -int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state) +int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state) { struct pwm_chip *chip; int err; diff --git a/drivers/pwm/pwm-atmel-hlcdc.c b/drivers/pwm/pwm-atmel-hlcdc.c index d13a83f430ac..dcbc0489dfd4 100644 --- a/drivers/pwm/pwm-atmel-hlcdc.c +++ b/drivers/pwm/pwm-atmel-hlcdc.c @@ -39,7 +39,7 @@ static inline struct atmel_hlcdc_pwm *to_atmel_hlcdc_pwm(struct pwm_chip *chip) } static int atmel_hlcdc_pwm_apply(struct pwm_chip *c, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct atmel_hlcdc_pwm *chip = to_atmel_hlcdc_pwm(c); struct atmel_hlcdc *hlcdc = chip->hlcdc; diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index e5e1eaf372fa..53bc7b9b3581 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -209,7 +209,7 @@ static void atmel_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm, } static int atmel_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct atmel_pwm_chip *atmel_pwm = to_atmel_pwm_chip(chip); struct pwm_state cstate; diff --git a/drivers/pwm/pwm-bcm-iproc.c b/drivers/pwm/pwm-bcm-iproc.c index d961a8207b1c..56c38cfae92c 100644 --- a/drivers/pwm/pwm-bcm-iproc.c +++ b/drivers/pwm/pwm-bcm-iproc.c @@ -115,7 +115,7 @@ static void iproc_pwmc_get_state(struct pwm_chip *chip, struct pwm_device *pwm, } static int iproc_pwmc_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { unsigned long prescale = IPROC_PWM_PRESCALE_MIN; struct iproc_pwmc *ip = to_iproc_pwmc(chip); diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c index 98f6ac6cf6ab..db5faa79c33f 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -93,7 +93,7 @@ static int cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index) } static int cros_ec_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct cros_ec_pwm_device *ec_pwm = pwm_to_cros_ec_pwm(chip); int duty_cycle; diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c index 3c9738617ceb..59272a920479 100644 --- a/drivers/pwm/pwm-fsl-ftm.c +++ b/drivers/pwm/pwm-fsl-ftm.c @@ -227,7 +227,7 @@ static bool fsl_pwm_is_other_pwm_enabled(struct fsl_pwm_chip *fpc, static int fsl_pwm_apply_config(struct fsl_pwm_chip *fpc, struct pwm_device *pwm, - struct pwm_state *newstate) + const struct pwm_state *newstate) { unsigned int duty; u32 reg_polarity; @@ -298,7 +298,7 @@ static int fsl_pwm_apply_config(struct fsl_pwm_chip *fpc, } static int fsl_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *newstate) + const struct pwm_state *newstate) { struct fsl_pwm_chip *fpc = to_fsl_chip(chip); struct pwm_state *oldstate = &pwm->state; diff --git a/drivers/pwm/pwm-hibvt.c b/drivers/pwm/pwm-hibvt.c index 753bd58111e4..ad205fdad372 100644 --- a/drivers/pwm/pwm-hibvt.c +++ b/drivers/pwm/pwm-hibvt.c @@ -149,7 +149,7 @@ static void hibvt_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, } static int hibvt_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct hibvt_pwm_chip *hi_pwm_chip = to_hibvt_pwm_chip(chip); diff --git a/drivers/pwm/pwm-imx-tpm.c b/drivers/pwm/pwm-imx-tpm.c index e8385c1cf342..9145f6160649 100644 --- a/drivers/pwm/pwm-imx-tpm.c +++ b/drivers/pwm/pwm-imx-tpm.c @@ -89,7 +89,7 @@ to_imx_tpm_pwm_chip(struct pwm_chip *chip) static int pwm_imx_tpm_round_state(struct pwm_chip *chip, struct imx_tpm_pwm_param *p, struct pwm_state *real_state, - struct pwm_state *state) + const struct pwm_state *state) { struct imx_tpm_pwm_chip *tpm = to_imx_tpm_pwm_chip(chip); u32 rate, prescale, period_count, clock_unit; @@ -289,7 +289,7 @@ static int pwm_imx_tpm_apply_hw(struct pwm_chip *chip, static int pwm_imx_tpm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct imx_tpm_pwm_chip *tpm = to_imx_tpm_pwm_chip(chip); struct imx_tpm_pwm_param param; diff --git a/drivers/pwm/pwm-imx27.c b/drivers/pwm/pwm-imx27.c index 91c23cbbc167..ae11d8577f18 100644 --- a/drivers/pwm/pwm-imx27.c +++ b/drivers/pwm/pwm-imx27.c @@ -209,7 +209,7 @@ static void pwm_imx27_wait_fifo_slot(struct pwm_chip *chip, } static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { unsigned long period_cycles, duty_cycles, prescale; struct pwm_imx27_chip *imx = to_pwm_imx27_chip(chip); diff --git a/drivers/pwm/pwm-jz4740.c b/drivers/pwm/pwm-jz4740.c index 9d444d012f92..9d78cc21cb12 100644 --- a/drivers/pwm/pwm-jz4740.c +++ b/drivers/pwm/pwm-jz4740.c @@ -88,7 +88,7 @@ static void jz4740_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) } static int jz4740_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct jz4740_pwm_chip *jz4740 = to_jz4740(pwm->chip); unsigned long long tmp; diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c index 4098a4601691..75bbfe5f3bc2 100644 --- a/drivers/pwm/pwm-lpss.c +++ b/drivers/pwm/pwm-lpss.c @@ -122,7 +122,7 @@ static inline void pwm_lpss_cond_enable(struct pwm_device *pwm, bool cond) } static int pwm_lpss_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct pwm_lpss_chip *lpwm = to_lpwm(chip); int ret; diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c index 3cbff5cbb789..6245bbdb6e6c 100644 --- a/drivers/pwm/pwm-meson.c +++ b/drivers/pwm/pwm-meson.c @@ -159,7 +159,7 @@ static void meson_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) } static int meson_pwm_calc(struct meson_pwm *meson, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct meson_pwm_channel *channel = pwm_get_chip_data(pwm); unsigned int duty, period, pre_div, cnt, duty_cnt; @@ -265,7 +265,7 @@ static void meson_pwm_disable(struct meson_pwm *meson, struct pwm_device *pwm) } static int meson_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct meson_pwm_channel *channel = pwm_get_chip_data(pwm); struct meson_pwm *meson = to_meson_pwm(chip); diff --git a/drivers/pwm/pwm-rcar.c b/drivers/pwm/pwm-rcar.c index c8cd43f91efc..852eb2347954 100644 --- a/drivers/pwm/pwm-rcar.c +++ b/drivers/pwm/pwm-rcar.c @@ -158,7 +158,7 @@ static void rcar_pwm_disable(struct rcar_pwm_chip *rp) } static int rcar_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct rcar_pwm_chip *rp = to_rcar_pwm_chip(chip); struct pwm_state cur_state; diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 83c7627868d8..73352e6fbccb 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -99,7 +99,7 @@ static void rockchip_pwm_get_state(struct pwm_chip *chip, } static void rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); unsigned long period, duty; @@ -183,7 +183,7 @@ static int rockchip_pwm_enable(struct pwm_chip *chip, } static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); struct pwm_state curstate; diff --git a/drivers/pwm/pwm-sifive.c b/drivers/pwm/pwm-sifive.c index bb4f02ce4f94..cc63f9baa481 100644 --- a/drivers/pwm/pwm-sifive.c +++ b/drivers/pwm/pwm-sifive.c @@ -147,7 +147,7 @@ static int pwm_sifive_enable(struct pwm_chip *chip, bool enable) } static int pwm_sifive_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct pwm_sifive_ddata *ddata = pwm_sifive_chip_to_ddata(chip); struct pwm_state cur_state; diff --git a/drivers/pwm/pwm-sprd.c b/drivers/pwm/pwm-sprd.c index 68c2d9f0411b..be2394227423 100644 --- a/drivers/pwm/pwm-sprd.c +++ b/drivers/pwm/pwm-sprd.c @@ -156,7 +156,7 @@ static int sprd_pwm_config(struct sprd_pwm_chip *spc, struct pwm_device *pwm, } static int sprd_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct sprd_pwm_chip *spc = container_of(chip, struct sprd_pwm_chip, chip); diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c index 2211a642066d..21cb260dc2c0 100644 --- a/drivers/pwm/pwm-stm32-lp.c +++ b/drivers/pwm/pwm-stm32-lp.c @@ -32,7 +32,7 @@ static inline struct stm32_pwm_lp *to_stm32_pwm_lp(struct pwm_chip *chip) #define STM32_LPTIM_MAX_PRESCALER 128 static int stm32_pwm_lp_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct stm32_pwm_lp *priv = to_stm32_pwm_lp(chip); unsigned long long prd, div, dty; diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 740e2dec8313..359b08596d9e 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -440,7 +440,7 @@ static void stm32_pwm_disable(struct stm32_pwm *priv, int ch) } static int stm32_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { bool enabled; struct stm32_pwm *priv = to_stm32_pwm_dev(chip); @@ -468,7 +468,7 @@ static int stm32_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, } static int stm32_pwm_apply_locked(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct stm32_pwm *priv = to_stm32_pwm_dev(chip); int ret; diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index 39007a7c0d83..6f5840a1a82d 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -145,7 +145,7 @@ static void sun4i_pwm_get_state(struct pwm_chip *chip, } static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4i_pwm, - struct pwm_state *state, + const struct pwm_state *state, u32 *dty, u32 *prd, unsigned int *prsclr) { u64 clk_rate, div = 0; @@ -196,7 +196,7 @@ static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4i_pwm, } static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct sun4i_pwm_chip *sun4i_pwm = to_sun4i_pwm_chip(chip); struct pwm_state cstate; diff --git a/drivers/pwm/pwm-zx.c b/drivers/pwm/pwm-zx.c index e24f4be35316..e2c21cc34a96 100644 --- a/drivers/pwm/pwm-zx.c +++ b/drivers/pwm/pwm-zx.c @@ -148,7 +148,7 @@ static int zx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, } static int zx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) + const struct pwm_state *state) { struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip); struct pwm_state cstate; diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 24632a7a7d11..b2c9c460947d 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -262,7 +262,7 @@ struct pwm_ops { int (*capture)(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_capture *result, unsigned long timeout); int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state); + const struct pwm_state *state); void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_state *state); struct module *owner; @@ -316,7 +316,7 @@ struct pwm_capture { /* PWM user APIs */ struct pwm_device *pwm_request(int pwm_id, const char *label); void pwm_free(struct pwm_device *pwm); -int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state); +int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state); int pwm_adjust_config(struct pwm_device *pwm); /** From c91e3234c6035baf5a79763cb4fcd5d23ce75c2b Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Wed, 18 Sep 2019 16:54:21 +0200 Subject: [PATCH 0754/2880] pwm: stm32-lp: Add check in case requested period cannot be achieved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LPTimer can use a 32KHz clock for counting. It depends on clock tree configuration. In such a case, PWM output frequency range is limited. Although unlikely, nothing prevents user from requesting a PWM frequency above counting clock (32KHz for instance): - This causes (prd - 1) = 0xffff to be written in ARR register later in the apply() routine. This results in badly configured PWM period (and also duty_cycle). Add a check to report an error is such a case. Signed-off-by: Fabrice Gasnier Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-stm32-lp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c index 21cb260dc2c0..67fca62524dc 100644 --- a/drivers/pwm/pwm-stm32-lp.c +++ b/drivers/pwm/pwm-stm32-lp.c @@ -59,6 +59,12 @@ static int stm32_pwm_lp_apply(struct pwm_chip *chip, struct pwm_device *pwm, /* Calculate the period and prescaler value */ div = (unsigned long long)clk_get_rate(priv->clk) * state->period; do_div(div, NSEC_PER_SEC); + if (!div) { + /* Clock is too slow to achieve requested period. */ + dev_dbg(priv->chip.dev, "Can't reach %u ns\n", state->period); + return -EINVAL; + } + prd = div; while (div > STM32_LPTIM_MAX_ARR) { presc++; From 3d4d85741ad3d880cb432d9121f5563e6a57ff2e Mon Sep 17 00:00:00 2001 From: Kamel Bouhara Date: Wed, 18 Sep 2019 16:57:16 +0200 Subject: [PATCH 0755/2880] pwm: atmel: Remove platform_device_id and use only dt bindings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 26202873bb51 ("avr32: remove support for AVR32 architecture") there is no more user of platform_device_id and we should only use dt bindings Signed-off-by: Kamel Bouhara Acked-by: Alexandre Belloni Acked-by: Claudiu Beznea Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/Kconfig | 2 +- drivers/pwm/pwm-atmel.c | 35 +++-------------------------------- 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 31dfc88ef362..08f35e2b167e 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -44,7 +44,7 @@ config PWM_AB8500 config PWM_ATMEL tristate "Atmel PWM support" - depends on ARCH_AT91 + depends on ARCH_AT91 && OF help Generic PWM framework driver for Atmel SoC. diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index 53bc7b9b3581..5b861da3e325 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -318,19 +318,6 @@ static const struct atmel_pwm_data mchp_sam9x60_pwm_data = { }, }; -static const struct platform_device_id atmel_pwm_devtypes[] = { - { - .name = "at91sam9rl-pwm", - .driver_data = (kernel_ulong_t)&atmel_sam9rl_pwm_data, - }, { - .name = "sama5d3-pwm", - .driver_data = (kernel_ulong_t)&atmel_sama5_pwm_data, - }, { - /* sentinel */ - }, -}; -MODULE_DEVICE_TABLE(platform, atmel_pwm_devtypes); - static const struct of_device_id atmel_pwm_dt_ids[] = { { .compatible = "atmel,at91sam9rl-pwm", @@ -350,19 +337,6 @@ static const struct of_device_id atmel_pwm_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, atmel_pwm_dt_ids); -static inline const struct atmel_pwm_data * -atmel_pwm_get_driver_data(struct platform_device *pdev) -{ - const struct platform_device_id *id; - - if (pdev->dev.of_node) - return of_device_get_match_data(&pdev->dev); - - id = platform_get_device_id(pdev); - - return (struct atmel_pwm_data *)id->driver_data; -} - static int atmel_pwm_probe(struct platform_device *pdev) { const struct atmel_pwm_data *data; @@ -370,7 +344,7 @@ static int atmel_pwm_probe(struct platform_device *pdev) struct resource *res; int ret; - data = atmel_pwm_get_driver_data(pdev); + data = of_device_get_match_data(&pdev->dev); if (!data) return -ENODEV; @@ -396,10 +370,8 @@ static int atmel_pwm_probe(struct platform_device *pdev) atmel_pwm->chip.dev = &pdev->dev; atmel_pwm->chip.ops = &atmel_pwm_ops; - if (pdev->dev.of_node) { - atmel_pwm->chip.of_xlate = of_pwm_xlate_with_flags; - atmel_pwm->chip.of_pwm_n_cells = 3; - } + atmel_pwm->chip.of_xlate = of_pwm_xlate_with_flags; + atmel_pwm->chip.of_pwm_n_cells = 3; atmel_pwm->chip.base = -1; atmel_pwm->chip.npwm = 4; @@ -437,7 +409,6 @@ static struct platform_driver atmel_pwm_driver = { .name = "atmel-pwm", .of_match_table = of_match_ptr(atmel_pwm_dt_ids), }, - .id_table = atmel_pwm_devtypes, .probe = atmel_pwm_probe, .remove = atmel_pwm_remove, }; From d85b9ce198e3689141cce965cb840f1c435ac4d5 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Sat, 21 Sep 2019 01:55:48 +0200 Subject: [PATCH 0756/2880] pwm: atmel: Remove unneeded check for match data Since the driver is now exclusively DT, it only binds if it finds a match in the of_device_id table. But in that case the associated data can never be NULL, so drop the unnecessary check. While at it, drop the extra local variable and store the pointer to this per-SoC data in the driver data directly. Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index 5b861da3e325..2e0b3d985d24 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -339,19 +339,16 @@ MODULE_DEVICE_TABLE(of, atmel_pwm_dt_ids); static int atmel_pwm_probe(struct platform_device *pdev) { - const struct atmel_pwm_data *data; struct atmel_pwm_chip *atmel_pwm; struct resource *res; int ret; - data = of_device_get_match_data(&pdev->dev); - if (!data) - return -ENODEV; - atmel_pwm = devm_kzalloc(&pdev->dev, sizeof(*atmel_pwm), GFP_KERNEL); if (!atmel_pwm) return -ENOMEM; + atmel_pwm->data = of_device_get_match_data(&pdev->dev); + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); atmel_pwm->base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(atmel_pwm->base)) @@ -375,7 +372,6 @@ static int atmel_pwm_probe(struct platform_device *pdev) atmel_pwm->chip.base = -1; atmel_pwm->chip.npwm = 4; - atmel_pwm->data = data; atmel_pwm->updated_pwms = 0; mutex_init(&atmel_pwm->isr_lock); From 9193c16e5a9899e742a862a0fec9bb5235008370 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Sat, 21 Sep 2019 01:58:58 +0200 Subject: [PATCH 0757/2880] pwm: atmel: Consolidate driver data initialization This helps readability by separating the driver-specific bits from the PWM framework bits. Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index 2e0b3d985d24..9ba733467e26 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -347,7 +347,9 @@ static int atmel_pwm_probe(struct platform_device *pdev) if (!atmel_pwm) return -ENOMEM; + mutex_init(&atmel_pwm->isr_lock); atmel_pwm->data = of_device_get_match_data(&pdev->dev); + atmel_pwm->updated_pwms = 0; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); atmel_pwm->base = devm_ioremap_resource(&pdev->dev, res); @@ -366,14 +368,10 @@ static int atmel_pwm_probe(struct platform_device *pdev) atmel_pwm->chip.dev = &pdev->dev; atmel_pwm->chip.ops = &atmel_pwm_ops; - atmel_pwm->chip.of_xlate = of_pwm_xlate_with_flags; atmel_pwm->chip.of_pwm_n_cells = 3; - atmel_pwm->chip.base = -1; atmel_pwm->chip.npwm = 4; - atmel_pwm->updated_pwms = 0; - mutex_init(&atmel_pwm->isr_lock); ret = pwmchip_add(&atmel_pwm->chip); if (ret < 0) { From e6c7c258f035ffec9d8a808c1bc34b6a5beae0ef Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Fri, 20 Sep 2019 06:49:02 +0800 Subject: [PATCH 0758/2880] pwm: mediatek: Drop the check for of_device_get_match_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch drop the check for of_device_get_match_data. Due to the only way call driver probe is compatible match. The data pointer which points to the SoC specify data is directly set by driver, and it should not be NULL in our case. We can safety remove the check for the result of of_device_get_match_data(). Signed-off-by: Ryder Lee Signed-off-by: Sam Shih Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 6697e30811e7..459b2ccd0b75 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -226,7 +226,6 @@ static const struct pwm_ops mtk_pwm_ops = { static int mtk_pwm_probe(struct platform_device *pdev) { - const struct mtk_pwm_platform_data *data; struct mtk_pwm_chip *pc; struct resource *res; unsigned int i; @@ -236,17 +235,14 @@ static int mtk_pwm_probe(struct platform_device *pdev) if (!pc) return -ENOMEM; - data = of_device_get_match_data(&pdev->dev); - if (data == NULL) - return -EINVAL; - pc->soc = data; + pc->soc = of_device_get_match_data(&pdev->dev); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); pc->regs = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(pc->regs)) return PTR_ERR(pc->regs); - for (i = 0; i < data->num_pwms + 2 && pc->soc->has_clks; i++) { + for (i = 0; i < pc->soc->num_pwms + 2 && pc->soc->has_clks; i++) { pc->clks[i] = devm_clk_get(&pdev->dev, mtk_pwm_clk_name[i]); if (IS_ERR(pc->clks[i])) { dev_err(&pdev->dev, "clock: %s fail: %ld\n", @@ -260,7 +256,7 @@ static int mtk_pwm_probe(struct platform_device *pdev) pc->chip.dev = &pdev->dev; pc->chip.ops = &mtk_pwm_ops; pc->chip.base = -1; - pc->chip.npwm = data->num_pwms; + pc->chip.npwm = pc->soc->num_pwms; ret = pwmchip_add(&pc->chip); if (ret < 0) { From 432264e9dfd10537e3cb66a11739f76754fc89e6 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Wed, 18 Sep 2019 14:14:47 +0300 Subject: [PATCH 0759/2880] dt-bindings: net: remove un-implemented property The `adi,disable-energy-detect` property was implemented in an initial version of the `adin` driver series, but after a review it was discarded in favor of implementing the ETHTOOL_PHY_EDPD phy-tunable option. With the ETHTOOL_PHY_EDPD control, it's possible to disable/enable Energy-Detect-Power-Down for the `adin` PHY, so this device-tree is not needed. Fixes: 767078132ff9 ("dt-bindings: net: add bindings for ADIN PHY driver") Signed-off-by: Alexandru Ardelean Reviewed-by: Rob Herring Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/adi,adin.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Documentation/devicetree/bindings/net/adi,adin.yaml b/Documentation/devicetree/bindings/net/adi,adin.yaml index 69375cb28e92..d95cc691a65f 100644 --- a/Documentation/devicetree/bindings/net/adi,adin.yaml +++ b/Documentation/devicetree/bindings/net/adi,adin.yaml @@ -36,12 +36,6 @@ properties: enum: [ 4, 8, 12, 16, 20, 24 ] default: 8 - adi,disable-energy-detect: - description: | - Disables Energy Detect Powerdown Mode (default disabled, i.e energy detect - is enabled if this property is unspecified) - type: boolean - examples: - | ethernet { @@ -68,6 +62,5 @@ examples: reg = <1>; adi,fifo-depth-bits = <16>; - adi,disable-energy-detect; }; }; From b41d936b5ecfdb3a4abc525ce6402a6c49cffddc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Sep 2019 08:05:39 -0700 Subject: [PATCH 0760/2880] sch_netem: fix a divide by zero in tabledist() syzbot managed to crash the kernel in tabledist() loading an empty distribution table. t = dist->table[rnd % dist->size]; Simply return an error when such load is attempted. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b17f2ed970e2..f5cb35e550f8 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -777,7 +777,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, struct disttable *d; int i; - if (n > NETEM_DIST_MAX) + if (!n || n > NETEM_DIST_MAX) return -EINVAL; d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); From 7b09c2d052db4b4ad0b27b97918b46a7746966fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Sep 2019 10:12:36 -0700 Subject: [PATCH 0761/2880] ipv6: fix a typo in fib6_rule_lookup() Yi Ren reported an issue discovered by syzkaller, and bisected to the cited commit. Many thanks to Yi, this trivial patch does not reflect the patient work that has been done. Fixes: d64a1f574a29 ("ipv6: honor RT6_LOOKUP_F_DST_NOREF in rule lookup logic") Signed-off-by: Eric Dumazet Acked-by: Wei Wang Bisected-and-reported-by: Yi Ren Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 87f47bc55c5e..6e2af411cd9c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -318,7 +318,7 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } From dc579ca5cfea3b9652db73009b394b9a3f46ae29 Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Mon, 16 Sep 2019 15:03:34 +0800 Subject: [PATCH 0762/2880] rtw88: pci: extract skbs free routine for trx rings These skbs free routines could be used when driver wants to stop PCI bus, because some of the skbs remained in the queue may not have been returned via DMA interrupt. Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtw88/pci.c | 36 +++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c index 3fdb52a5789a..bc3a36402e56 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.c +++ b/drivers/net/wireless/realtek/rtw88/pci.c @@ -90,16 +90,13 @@ static inline void *rtw_pci_get_tx_desc(struct rtw_pci_tx_ring *tx_ring, u8 idx) return tx_ring->r.head + offset; } -static void rtw_pci_free_tx_ring(struct rtw_dev *rtwdev, - struct rtw_pci_tx_ring *tx_ring) +static void rtw_pci_free_tx_ring_skbs(struct rtw_dev *rtwdev, + struct rtw_pci_tx_ring *tx_ring) { struct pci_dev *pdev = to_pci_dev(rtwdev->dev); struct rtw_pci_tx_data *tx_data; struct sk_buff *skb, *tmp; dma_addr_t dma; - u8 *head = tx_ring->r.head; - u32 len = tx_ring->r.len; - int ring_sz = len * tx_ring->r.desc_size; /* free every skb remained in tx list */ skb_queue_walk_safe(&tx_ring->queue, skb, tmp) { @@ -110,21 +107,30 @@ static void rtw_pci_free_tx_ring(struct rtw_dev *rtwdev, pci_unmap_single(pdev, dma, skb->len, PCI_DMA_TODEVICE); dev_kfree_skb_any(skb); } +} + +static void rtw_pci_free_tx_ring(struct rtw_dev *rtwdev, + struct rtw_pci_tx_ring *tx_ring) +{ + struct pci_dev *pdev = to_pci_dev(rtwdev->dev); + u8 *head = tx_ring->r.head; + u32 len = tx_ring->r.len; + int ring_sz = len * tx_ring->r.desc_size; + + rtw_pci_free_tx_ring_skbs(rtwdev, tx_ring); /* free the ring itself */ pci_free_consistent(pdev, ring_sz, head, tx_ring->r.dma); tx_ring->r.head = NULL; } -static void rtw_pci_free_rx_ring(struct rtw_dev *rtwdev, - struct rtw_pci_rx_ring *rx_ring) +static void rtw_pci_free_rx_ring_skbs(struct rtw_dev *rtwdev, + struct rtw_pci_rx_ring *rx_ring) { struct pci_dev *pdev = to_pci_dev(rtwdev->dev); struct sk_buff *skb; - dma_addr_t dma; - u8 *head = rx_ring->r.head; int buf_sz = RTK_PCI_RX_BUF_SIZE; - int ring_sz = rx_ring->r.desc_size * rx_ring->r.len; + dma_addr_t dma; int i; for (i = 0; i < rx_ring->r.len; i++) { @@ -137,6 +143,16 @@ static void rtw_pci_free_rx_ring(struct rtw_dev *rtwdev, dev_kfree_skb(skb); rx_ring->buf[i] = NULL; } +} + +static void rtw_pci_free_rx_ring(struct rtw_dev *rtwdev, + struct rtw_pci_rx_ring *rx_ring) +{ + struct pci_dev *pdev = to_pci_dev(rtwdev->dev); + u8 *head = rx_ring->r.head; + int ring_sz = rx_ring->r.desc_size * rx_ring->r.len; + + rtw_pci_free_rx_ring_skbs(rtwdev, rx_ring); pci_free_consistent(pdev, ring_sz, head, rx_ring->r.dma); } From 0e41edcdfe86435fef709b7de8397e8a5a0e1b2f Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Mon, 16 Sep 2019 15:03:35 +0800 Subject: [PATCH 0763/2880] rtw88: pci: release tx skbs DMAed when stop Interrupt is disabled to stop PCI, which means the skbs queued for each TX ring will not be released via DMA interrupt. To avoid those skbs remained being left in the skb queue until PCI has been removed, driver needs to release skbs by itself. Signed-off-by: Yan-Hsuan Chuang Reviewed-by: Brian Norris Tested-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtw88/pci.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c index bc3a36402e56..d90928be663b 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.c +++ b/drivers/net/wireless/realtek/rtw88/pci.c @@ -500,6 +500,17 @@ static void rtw_pci_dma_reset(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci) rtwpci->rx_tag = 0; } +static void rtw_pci_dma_release(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci) +{ + struct rtw_pci_tx_ring *tx_ring; + u8 queue; + + for (queue = 0; queue < RTK_MAX_TX_QUEUE_NUM; queue++) { + tx_ring = &rtwpci->tx_rings[queue]; + rtw_pci_free_tx_ring_skbs(rtwdev, tx_ring); + } +} + static int rtw_pci_start(struct rtw_dev *rtwdev) { struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv; @@ -521,6 +532,7 @@ static void rtw_pci_stop(struct rtw_dev *rtwdev) spin_lock_irqsave(&rtwpci->irq_lock, flags); rtw_pci_disable_interrupt(rtwdev, rtwpci); + rtw_pci_dma_release(rtwdev, rtwpci); spin_unlock_irqrestore(&rtwpci->irq_lock, flags); } From 6355592e6b55be8c568b62efba5fe5a1c919a2db Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 19 Sep 2019 11:15:32 +0200 Subject: [PATCH 0764/2880] zd1211rw: zd_usb: Use "%zu" to format size_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 32-bit: drivers/net/wireless/zydas/zd1211rw/zd_usb.c: In function ‘check_read_regs’: drivers/net/wireless/zydas/zd1211rw/zd_def.h:18:25: warning: format ‘%ld’ expects argument of type ‘long int’, but argument 6 has type ‘size_t’ {aka ‘unsigned int’} [-Wformat=] dev_printk(level, dev, "%s() " fmt, __func__, ##args) ^~~~~~~ drivers/net/wireless/zydas/zd1211rw/zd_def.h:22:4: note: in expansion of macro ‘dev_printk_f’ dev_printk_f(KERN_DEBUG, dev, fmt, ## args) ^~~~~~~~~~~~ drivers/net/wireless/zydas/zd1211rw/zd_usb.c:1635:3: note: in expansion of macro ‘dev_dbg_f’ dev_dbg_f(zd_usb_dev(usb), ^~~~~~~~~ drivers/net/wireless/zydas/zd1211rw/zd_usb.c:1636:51: note: format string is defined here "error: actual length %d less than expected %ld\n", ~~^ %d Fixes: 84b0b66352470e64 ("zd1211rw: zd_usb: Use struct_size() helper") Signed-off-by: Geert Uytterhoeven Signed-off-by: Kalle Valo --- drivers/net/wireless/zydas/zd1211rw/zd_usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_usb.c b/drivers/net/wireless/zydas/zd1211rw/zd_usb.c index 4e44ea8c652d..7b5c2fe5bd4d 100644 --- a/drivers/net/wireless/zydas/zd1211rw/zd_usb.c +++ b/drivers/net/wireless/zydas/zd1211rw/zd_usb.c @@ -1633,7 +1633,7 @@ static bool check_read_regs(struct zd_usb *usb, struct usb_req_read_regs *req, */ if (rr->length < struct_size(regs, regs, count)) { dev_dbg_f(zd_usb_dev(usb), - "error: actual length %d less than expected %ld\n", + "error: actual length %d less than expected %zu\n", rr->length, struct_size(regs, regs, count)); return false; } From 7e7db86c7e1088e768438fe6c894d748b0c32abe Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 19 Sep 2019 04:00:55 -0500 Subject: [PATCH 0765/2880] smb3: allow decryption keys to be dumped by admin for debugging In order to debug certain problems it is important to be able to decrypt network traces (e.g. wireshark) but to do this we need to be able to dump out the encryption/decryption keys. Dumping them to an ioctl is safer than dumping then to dmesg, (and better than showing all keys in a pseudofile). Restrict this to root (CAP_SYS_ADMIN), and only for a mount that this admin has access to. Sample smbinfo output: SMB3.0 encryption Session Id: 0x82d2ec52 Session Key: a5 6d 81 d0 e c1 ca e1 d8 13 aa 20 e8 f2 cc 71 Server Encryption Key: 1a c3 be ba 3d fc dc 3c e bc 93 9e 50 9e 19 c1 Server Decryption Key: e0 d4 d9 43 1b a2 1b e3 d8 76 77 49 56 f7 20 88 Reviewed-by: Aurelien Aptel Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifs_ioctl.h | 9 +++++++++ fs/cifs/ioctl.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 6c3bd07868d7..0f0dc1c1fe41 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -57,9 +57,18 @@ struct smb_query_info { /* char buffer[]; */ } __packed; +struct smb3_key_debug_info { + __u64 Suid; + __u16 cipher_type; + __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) +#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 76ddd98b6298..1a01e108d75e 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -164,6 +164,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); + struct smb3_key_debug_info pkey_inf; int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifsFileInfo *pSMBFile = filep->private_data; @@ -270,6 +271,34 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = -EOPNOTSUPP; break; + case CIFS_DUMP_KEY: + if (pSMBFile == NULL) + break; + if (!capable(CAP_SYS_ADMIN)) { + rc = -EACCES; + break; + } + + tcon = tlink_tcon(pSMBFile->tlink); + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + break; + } + pkey_inf.cipher_type = + le16_to_cpu(tcon->ses->server->cipher_type); + pkey_inf.Suid = tcon->ses->Suid; + memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response, + 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + memcpy(pkey_inf.smb3decryptionkey, + tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + memcpy(pkey_inf.smb3encryptionkey, + tcon->ses->smb3encryptionkey, SMB3_SIGN_KEY_SIZE); + if (copy_to_user((void __user *)arg, &pkey_inf, + sizeof(struct smb3_key_debug_info))) + rc = -EFAULT; + else + rc = 0; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; From 0d8006ddbe89cbaedef06a8789ddefa1164a3a77 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 4 Sep 2019 22:26:00 +1000 Subject: [PATCH 0766/2880] PCI: Add pci_irq_vector() and other stubs when !CONFIG_PCI Add stub functions pci_alloc_irq_vectors_affinity() and pci_irq_vector() when CONFIG_PCI is off so drivers can use them without resorting to ifdefs. Also move the PCI_IRQ_* macros outside of the ifdefs so they are always available. Fixes: 625f269a5a7a ("crypto: inside-secure - add support for PCI based FPGA development board") Link: https://lore.kernel.org/r/20190904122600.GA28660@gondor.apana.org.au Reported-by: kbuild test robot Reported-by: YueHaibing Signed-off-by: Herbert Xu Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index f5bab312995d..23e4a6bcd08d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -925,6 +925,11 @@ enum { PCI_SCAN_ALL_PCIE_DEVS = 0x00000040, /* Scan all, not just dev 0 */ }; +#define PCI_IRQ_LEGACY (1 << 0) /* Allow legacy interrupts */ +#define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ +#define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ +#define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ + /* These external functions are only available when PCI support is enabled */ #ifdef CONFIG_PCI @@ -1408,11 +1413,6 @@ resource_size_t pcibios_window_alignment(struct pci_bus *bus, int pci_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); -#define PCI_IRQ_LEGACY (1 << 0) /* Allow legacy interrupts */ -#define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ -#define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ -#define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ - /* * Virtual interrupts allow for more interrupts to be allocated * than the device has interrupts for. These are not programmed @@ -1517,14 +1517,6 @@ static inline int pci_irq_get_node(struct pci_dev *pdev, int vec) } #endif -static inline int -pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, - unsigned int max_vecs, unsigned int flags) -{ - return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags, - NULL); -} - /** * pci_irqd_intx_xlate() - Translate PCI INTx value to an IRQ domain hwirq * @d: the INTx IRQ domain @@ -1780,8 +1772,29 @@ static inline const struct pci_device_id *pci_match_id(const struct pci_device_i struct pci_dev *dev) { return NULL; } static inline bool pci_ats_disabled(void) { return true; } + +static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + return -EINVAL; +} + +static inline int +pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags, + struct irq_affinity *aff_desc) +{ + return -ENOSPC; +} #endif /* CONFIG_PCI */ +static inline int +pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ + return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags, + NULL); +} + #ifdef CONFIG_PCI_ATS /* Address Translation Service */ void pci_ats_init(struct pci_dev *dev); From 3fe4b3351301660653a2bc73f2226da0ebd2b95e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 18 Sep 2019 14:01:46 +0200 Subject: [PATCH 0767/2880] cdc_ncm: fix divide-by-zero caused by invalid wMaxPacketSize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Endpoints with zero wMaxPacketSize are not usable for transferring data. Ignore such endpoints when looking for valid in, out and status pipes, to make the driver more robust against invalid and meaningless descriptors. The wMaxPacketSize of the out pipe is used as divisor. So this change fixes a divide-by-zero bug. Reported-by: syzbot+ce366e2b8296e25d84f5@syzkaller.appspotmail.com Signed-off-by: Bjørn Mork Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 50c05d0f44cb..00cab3f43a4c 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -681,8 +681,12 @@ cdc_ncm_find_endpoints(struct usbnet *dev, struct usb_interface *intf) u8 ep; for (ep = 0; ep < intf->cur_altsetting->desc.bNumEndpoints; ep++) { - e = intf->cur_altsetting->endpoint + ep; + + /* ignore endpoints which cannot transfer data */ + if (!usb_endpoint_maxp(&e->desc)) + continue; + switch (e->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) { case USB_ENDPOINT_XFER_INT: if (usb_endpoint_dir_in(&e->desc)) { From 8d3d7c2029c1b360f1a6b0a2fca470b57eb575c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 18 Sep 2019 14:17:38 +0200 Subject: [PATCH 0768/2880] usbnet: ignore endpoints with invalid wMaxPacketSize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Endpoints with zero wMaxPacketSize are not usable for transferring data. Ignore such endpoints when looking for valid in, out and status pipes, to make the drivers more robust against invalid and meaningless descriptors. The wMaxPacketSize of these endpoints are used for memory allocations and as divisors in many usbnet minidrivers. Avoiding zero is therefore critical. Signed-off-by: Bjørn Mork Signed-off-by: Jakub Kicinski --- drivers/net/usb/usbnet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index e44849499b89..dde05e2fdc3e 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -100,6 +100,11 @@ int usbnet_get_endpoints(struct usbnet *dev, struct usb_interface *intf) int intr = 0; e = alt->endpoint + ep; + + /* ignore endpoints which cannot transfer data */ + if (!usb_endpoint_maxp(&e->desc)) + continue; + switch (e->desc.bmAttributes) { case USB_ENDPOINT_XFER_INT: if (!usb_endpoint_dir_in(&e->desc)) From e47488b2df7f9cb405789c7f5d4c27909fc597ae Mon Sep 17 00:00:00 2001 From: Peter Mamonov Date: Wed, 18 Sep 2019 19:27:55 +0300 Subject: [PATCH 0769/2880] net/phy: fix DP83865 10 Mbps HDX loopback disable function According to the DP83865 datasheet "the 10 Mbps HDX loopback can be disabled in the expanded memory register 0x1C0.1". The driver erroneously used bit 0 instead of bit 1. Fixes: 4621bf129856 ("phy: Add file missed in previous commit.") Signed-off-by: Peter Mamonov Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- drivers/net/phy/national.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/national.c b/drivers/net/phy/national.c index a221dd552c3c..a5bf0874c7d8 100644 --- a/drivers/net/phy/national.c +++ b/drivers/net/phy/national.c @@ -105,14 +105,17 @@ static void ns_giga_speed_fallback(struct phy_device *phydev, int mode) static void ns_10_base_t_hdx_loopack(struct phy_device *phydev, int disable) { + u16 lb_dis = BIT(1); + if (disable) - ns_exp_write(phydev, 0x1c0, ns_exp_read(phydev, 0x1c0) | 1); + ns_exp_write(phydev, 0x1c0, + ns_exp_read(phydev, 0x1c0) | lb_dis); else ns_exp_write(phydev, 0x1c0, - ns_exp_read(phydev, 0x1c0) & 0xfffe); + ns_exp_read(phydev, 0x1c0) & ~lb_dis); pr_debug("10BASE-T HDX loopback %s\n", - (ns_exp_read(phydev, 0x1c0) & 0x0001) ? "off" : "on"); + (ns_exp_read(phydev, 0x1c0) & lb_dis) ? "off" : "on"); } static int ns_config_init(struct phy_device *phydev) From 73f0c11d11329a0d6d205d4312b6e5d2512af7c5 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 18 Sep 2019 10:21:17 -0700 Subject: [PATCH 0770/2880] net: qrtr: Stop rx_worker before freeing node As the endpoint is unregistered there might still be work pending to handle incoming messages, which will result in a use after free scenario. The plan is to remove the rx_worker, but until then (and for stable@) ensure that the work is stopped before the node is freed. Fixes: bdabad3e363d ("net: Add Qualcomm IPC router") Cc: stable@vger.kernel.org Signed-off-by: Bjorn Andersson Signed-off-by: Jakub Kicinski --- net/qrtr/qrtr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 6c8b0f6d28f9..88f98f27ad88 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -150,6 +150,7 @@ static void __qrtr_node_release(struct kref *kref) list_del(&node->item); mutex_unlock(&qrtr_node_lock); + cancel_work_sync(&node->work); skb_queue_purge(&node->rx_queue); kfree(node); } From b0e1ee435aba68c4080e3cb67adf6573aa5bcc6d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Sep 2019 22:21:24 +0200 Subject: [PATCH 0771/2880] net: remove netx ethernet driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ARM netx platform got removed in 5.3, so this driver is now useless. Reported-by: Uwe Kleine-König Cc: Sascha Hauer Signed-off-by: Arnd Bergmann Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/Kconfig | 11 - drivers/net/ethernet/Makefile | 1 - drivers/net/ethernet/netx-eth.c | 497 ------------------------- include/linux/platform_data/eth-netx.h | 13 - 4 files changed, 522 deletions(-) delete mode 100644 drivers/net/ethernet/netx-eth.c delete mode 100644 include/linux/platform_data/eth-netx.h diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index 1e2de9d062bf..e8e9c166185d 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -140,17 +140,6 @@ source "drivers/net/ethernet/neterion/Kconfig" source "drivers/net/ethernet/netronome/Kconfig" source "drivers/net/ethernet/ni/Kconfig" source "drivers/net/ethernet/8390/Kconfig" - -config NET_NETX - tristate "NetX Ethernet support" - select MII - depends on ARCH_NETX - ---help--- - This is support for the Hilscher netX builtin Ethernet ports - - To compile this driver as a module, choose M here. The module - will be called netx-eth. - source "drivers/net/ethernet/nvidia/Kconfig" source "drivers/net/ethernet/nxp/Kconfig" source "drivers/net/ethernet/oki-semi/Kconfig" diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile index 77f9838a76c9..05abebc17804 100644 --- a/drivers/net/ethernet/Makefile +++ b/drivers/net/ethernet/Makefile @@ -64,7 +64,6 @@ obj-$(CONFIG_NET_VENDOR_NATSEMI) += natsemi/ obj-$(CONFIG_NET_VENDOR_NETERION) += neterion/ obj-$(CONFIG_NET_VENDOR_NETRONOME) += netronome/ obj-$(CONFIG_NET_VENDOR_NI) += ni/ -obj-$(CONFIG_NET_NETX) += netx-eth.o obj-$(CONFIG_NET_VENDOR_NVIDIA) += nvidia/ obj-$(CONFIG_LPC_ENET) += nxp/ obj-$(CONFIG_NET_VENDOR_OKI) += oki-semi/ diff --git a/drivers/net/ethernet/netx-eth.c b/drivers/net/ethernet/netx-eth.c deleted file mode 100644 index cf6e7eb1b1e1..000000000000 --- a/drivers/net/ethernet/netx-eth.c +++ /dev/null @@ -1,497 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * drivers/net/ethernet/netx-eth.c - * - * Copyright (c) 2005 Sascha Hauer , Pengutronix - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -/* XC Fifo Offsets */ -#define EMPTY_PTR_FIFO(xcno) (0 + ((xcno) << 3)) /* Index of the empty pointer FIFO */ -#define IND_FIFO_PORT_HI(xcno) (1 + ((xcno) << 3)) /* Index of the FIFO where received */ - /* Data packages are indicated by XC */ -#define IND_FIFO_PORT_LO(xcno) (2 + ((xcno) << 3)) /* Index of the FIFO where received */ - /* Data packages are indicated by XC */ -#define REQ_FIFO_PORT_HI(xcno) (3 + ((xcno) << 3)) /* Index of the FIFO where Data packages */ - /* have to be indicated by ARM which */ - /* shall be sent */ -#define REQ_FIFO_PORT_LO(xcno) (4 + ((xcno) << 3)) /* Index of the FIFO where Data packages */ - /* have to be indicated by ARM which shall */ - /* be sent */ -#define CON_FIFO_PORT_HI(xcno) (5 + ((xcno) << 3)) /* Index of the FIFO where sent Data packages */ - /* are confirmed */ -#define CON_FIFO_PORT_LO(xcno) (6 + ((xcno) << 3)) /* Index of the FIFO where sent Data */ - /* packages are confirmed */ -#define PFIFO_MASK(xcno) (0x7f << (xcno*8)) - -#define FIFO_PTR_FRAMELEN_SHIFT 0 -#define FIFO_PTR_FRAMELEN_MASK (0x7ff << 0) -#define FIFO_PTR_FRAMELEN(len) (((len) << 0) & FIFO_PTR_FRAMELEN_MASK) -#define FIFO_PTR_TIMETRIG (1<<11) -#define FIFO_PTR_MULTI_REQ -#define FIFO_PTR_ORIGIN (1<<14) -#define FIFO_PTR_VLAN (1<<15) -#define FIFO_PTR_FRAMENO_SHIFT 16 -#define FIFO_PTR_FRAMENO_MASK (0x3f << 16) -#define FIFO_PTR_FRAMENO(no) (((no) << 16) & FIFO_PTR_FRAMENO_MASK) -#define FIFO_PTR_SEGMENT_SHIFT 22 -#define FIFO_PTR_SEGMENT_MASK (0xf << 22) -#define FIFO_PTR_SEGMENT(seg) (((seg) & 0xf) << 22) -#define FIFO_PTR_ERROR_SHIFT 28 -#define FIFO_PTR_ERROR_MASK (0xf << 28) - -#define ISR_LINK_STATUS_CHANGE (1<<4) -#define ISR_IND_LO (1<<3) -#define ISR_CON_LO (1<<2) -#define ISR_IND_HI (1<<1) -#define ISR_CON_HI (1<<0) - -#define ETH_MAC_LOCAL_CONFIG 0x1560 -#define ETH_MAC_4321 0x1564 -#define ETH_MAC_65 0x1568 - -#define MAC_TRAFFIC_CLASS_ARRANGEMENT_SHIFT 16 -#define MAC_TRAFFIC_CLASS_ARRANGEMENT_MASK (0xf<data; - unsigned int len = skb->len; - - spin_lock_irq(&priv->lock); - memcpy_toio(priv->sram_base + 1560, (void *)buf, len); - if (len < 60) { - memset_io(priv->sram_base + 1560 + len, 0, 60 - len); - len = 60; - } - - pfifo_push(REQ_FIFO_PORT_LO(priv->id), - FIFO_PTR_SEGMENT(priv->id) | - FIFO_PTR_FRAMENO(1) | - FIFO_PTR_FRAMELEN(len)); - - ndev->stats.tx_packets++; - ndev->stats.tx_bytes += skb->len; - - netif_stop_queue(ndev); - spin_unlock_irq(&priv->lock); - dev_kfree_skb(skb); - - return NETDEV_TX_OK; -} - -static void netx_eth_receive(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - unsigned int val, frameno, seg, len; - unsigned char *data; - struct sk_buff *skb; - - val = pfifo_pop(IND_FIFO_PORT_LO(priv->id)); - - frameno = (val & FIFO_PTR_FRAMENO_MASK) >> FIFO_PTR_FRAMENO_SHIFT; - seg = (val & FIFO_PTR_SEGMENT_MASK) >> FIFO_PTR_SEGMENT_SHIFT; - len = (val & FIFO_PTR_FRAMELEN_MASK) >> FIFO_PTR_FRAMELEN_SHIFT; - - skb = netdev_alloc_skb(ndev, len); - if (unlikely(skb == NULL)) { - ndev->stats.rx_dropped++; - return; - } - - data = skb_put(skb, len); - - memcpy_fromio(data, priv->sram_base + frameno * 1560, len); - - pfifo_push(EMPTY_PTR_FIFO(priv->id), - FIFO_PTR_SEGMENT(seg) | FIFO_PTR_FRAMENO(frameno)); - - skb->protocol = eth_type_trans(skb, ndev); - netif_rx(skb); - ndev->stats.rx_packets++; - ndev->stats.rx_bytes += len; -} - -static irqreturn_t -netx_eth_interrupt(int irq, void *dev_id) -{ - struct net_device *ndev = dev_id; - struct netx_eth_priv *priv = netdev_priv(ndev); - int status; - unsigned long flags; - - spin_lock_irqsave(&priv->lock, flags); - - status = readl(NETX_PFIFO_XPEC_ISR(priv->id)); - while (status) { - int fill_level; - writel(status, NETX_PFIFO_XPEC_ISR(priv->id)); - - if ((status & ISR_CON_HI) || (status & ISR_IND_HI)) - printk("%s: unexpected status: 0x%08x\n", - __func__, status); - - fill_level = - readl(NETX_PFIFO_FILL_LEVEL(IND_FIFO_PORT_LO(priv->id))); - while (fill_level--) - netx_eth_receive(ndev); - - if (status & ISR_CON_LO) - netif_wake_queue(ndev); - - if (status & ISR_LINK_STATUS_CHANGE) - mii_check_media(&priv->mii, netif_msg_link(priv), 1); - - status = readl(NETX_PFIFO_XPEC_ISR(priv->id)); - } - spin_unlock_irqrestore(&priv->lock, flags); - return IRQ_HANDLED; -} - -static int netx_eth_open(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - - if (request_irq - (ndev->irq, netx_eth_interrupt, IRQF_SHARED, ndev->name, ndev)) - return -EAGAIN; - - writel(ndev->dev_addr[0] | - ndev->dev_addr[1]<<8 | - ndev->dev_addr[2]<<16 | - ndev->dev_addr[3]<<24, - priv->xpec_base + NETX_XPEC_RAM_START_OFS + ETH_MAC_4321); - writel(ndev->dev_addr[4] | - ndev->dev_addr[5]<<8, - priv->xpec_base + NETX_XPEC_RAM_START_OFS + ETH_MAC_65); - - writel(LOCAL_CONFIG_LINK_STATUS_IRQ_EN | - LOCAL_CONFIG_CON_LO_IRQ_EN | - LOCAL_CONFIG_CON_HI_IRQ_EN | - LOCAL_CONFIG_IND_LO_IRQ_EN | - LOCAL_CONFIG_IND_HI_IRQ_EN, - priv->xpec_base + NETX_XPEC_RAM_START_OFS + - ETH_MAC_LOCAL_CONFIG); - - mii_check_media(&priv->mii, netif_msg_link(priv), 1); - netif_start_queue(ndev); - - return 0; -} - -static int netx_eth_close(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - - netif_stop_queue(ndev); - - writel(0, - priv->xpec_base + NETX_XPEC_RAM_START_OFS + ETH_MAC_LOCAL_CONFIG); - - free_irq(ndev->irq, ndev); - - return 0; -} - -static void netx_eth_timeout(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - int i; - - printk(KERN_ERR "%s: transmit timed out, resetting\n", ndev->name); - - spin_lock_irq(&priv->lock); - - xc_reset(priv->xc); - xc_start(priv->xc); - - for (i=2; i<=18; i++) - pfifo_push(EMPTY_PTR_FIFO(priv->id), - FIFO_PTR_FRAMENO(i) | FIFO_PTR_SEGMENT(priv->id)); - - spin_unlock_irq(&priv->lock); - - netif_wake_queue(ndev); -} - -static int -netx_eth_phy_read(struct net_device *ndev, int phy_id, int reg) -{ - unsigned int val; - - val = MIIMU_SNRDY | MIIMU_PREAMBLE | MIIMU_PHYADDR(phy_id) | - MIIMU_REGADDR(reg) | MIIMU_PHY_NRES; - - writel(val, NETX_MIIMU); - while (readl(NETX_MIIMU) & MIIMU_SNRDY); - - return readl(NETX_MIIMU) >> 16; - -} - -static void -netx_eth_phy_write(struct net_device *ndev, int phy_id, int reg, int value) -{ - unsigned int val; - - val = MIIMU_SNRDY | MIIMU_PREAMBLE | MIIMU_PHYADDR(phy_id) | - MIIMU_REGADDR(reg) | MIIMU_PHY_NRES | MIIMU_OPMODE_WRITE | - MIIMU_DATA(value); - - writel(val, NETX_MIIMU); - while (readl(NETX_MIIMU) & MIIMU_SNRDY); -} - -static const struct net_device_ops netx_eth_netdev_ops = { - .ndo_open = netx_eth_open, - .ndo_stop = netx_eth_close, - .ndo_start_xmit = netx_eth_hard_start_xmit, - .ndo_tx_timeout = netx_eth_timeout, - .ndo_set_rx_mode = netx_eth_set_multicast_list, - .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, -}; - -static int netx_eth_enable(struct net_device *ndev) -{ - struct netx_eth_priv *priv = netdev_priv(ndev); - unsigned int mac4321, mac65; - int running, i, ret; - bool inv_mac_addr = false; - - ndev->netdev_ops = &netx_eth_netdev_ops; - ndev->watchdog_timeo = msecs_to_jiffies(5000); - - priv->msg_enable = NETIF_MSG_LINK; - priv->mii.phy_id_mask = 0x1f; - priv->mii.reg_num_mask = 0x1f; - priv->mii.force_media = 0; - priv->mii.full_duplex = 0; - priv->mii.dev = ndev; - priv->mii.mdio_read = netx_eth_phy_read; - priv->mii.mdio_write = netx_eth_phy_write; - priv->mii.phy_id = INTERNAL_PHY_ADR + priv->id; - - running = xc_running(priv->xc); - xc_stop(priv->xc); - - /* if the xc engine is already running, assume the bootloader has - * loaded the firmware for us - */ - if (running) { - /* get Node Address from hardware */ - mac4321 = readl(priv->xpec_base + - NETX_XPEC_RAM_START_OFS + ETH_MAC_4321); - mac65 = readl(priv->xpec_base + - NETX_XPEC_RAM_START_OFS + ETH_MAC_65); - - ndev->dev_addr[0] = mac4321 & 0xff; - ndev->dev_addr[1] = (mac4321 >> 8) & 0xff; - ndev->dev_addr[2] = (mac4321 >> 16) & 0xff; - ndev->dev_addr[3] = (mac4321 >> 24) & 0xff; - ndev->dev_addr[4] = mac65 & 0xff; - ndev->dev_addr[5] = (mac65 >> 8) & 0xff; - } else { - if (xc_request_firmware(priv->xc)) { - printk(CARDNAME ": requesting firmware failed\n"); - return -ENODEV; - } - } - - xc_reset(priv->xc); - xc_start(priv->xc); - - if (!is_valid_ether_addr(ndev->dev_addr)) - inv_mac_addr = true; - - for (i=2; i<=18; i++) - pfifo_push(EMPTY_PTR_FIFO(priv->id), - FIFO_PTR_FRAMENO(i) | FIFO_PTR_SEGMENT(priv->id)); - - ret = register_netdev(ndev); - if (inv_mac_addr) - printk("%s: Invalid ethernet MAC address. Please set using ip\n", - ndev->name); - - return ret; -} - -static int netx_eth_drv_probe(struct platform_device *pdev) -{ - struct netx_eth_priv *priv; - struct net_device *ndev; - struct netxeth_platform_data *pdata; - int ret; - - ndev = alloc_etherdev(sizeof (struct netx_eth_priv)); - if (!ndev) { - ret = -ENOMEM; - goto exit; - } - SET_NETDEV_DEV(ndev, &pdev->dev); - - platform_set_drvdata(pdev, ndev); - - priv = netdev_priv(ndev); - - pdata = dev_get_platdata(&pdev->dev); - priv->xc = request_xc(pdata->xcno, &pdev->dev); - if (!priv->xc) { - dev_err(&pdev->dev, "unable to request xc engine\n"); - ret = -ENODEV; - goto exit_free_netdev; - } - - ndev->irq = priv->xc->irq; - priv->id = pdev->id; - priv->xpec_base = priv->xc->xpec_base; - priv->xmac_base = priv->xc->xmac_base; - priv->sram_base = priv->xc->sram_base; - - spin_lock_init(&priv->lock); - - ret = pfifo_request(PFIFO_MASK(priv->id)); - if (ret) { - printk("unable to request PFIFO\n"); - goto exit_free_xc; - } - - ret = netx_eth_enable(ndev); - if (ret) - goto exit_free_pfifo; - - return 0; -exit_free_pfifo: - pfifo_free(PFIFO_MASK(priv->id)); -exit_free_xc: - free_xc(priv->xc); -exit_free_netdev: - free_netdev(ndev); -exit: - return ret; -} - -static int netx_eth_drv_remove(struct platform_device *pdev) -{ - struct net_device *ndev = platform_get_drvdata(pdev); - struct netx_eth_priv *priv = netdev_priv(ndev); - - unregister_netdev(ndev); - xc_stop(priv->xc); - free_xc(priv->xc); - free_netdev(ndev); - pfifo_free(PFIFO_MASK(priv->id)); - - return 0; -} - -static int netx_eth_drv_suspend(struct platform_device *pdev, pm_message_t state) -{ - dev_err(&pdev->dev, "suspend not implemented\n"); - return 0; -} - -static int netx_eth_drv_resume(struct platform_device *pdev) -{ - dev_err(&pdev->dev, "resume not implemented\n"); - return 0; -} - -static struct platform_driver netx_eth_driver = { - .probe = netx_eth_drv_probe, - .remove = netx_eth_drv_remove, - .suspend = netx_eth_drv_suspend, - .resume = netx_eth_drv_resume, - .driver = { - .name = CARDNAME, - }, -}; - -static int __init netx_eth_init(void) -{ - unsigned int phy_control, val; - - printk("NetX Ethernet driver\n"); - - phy_control = PHY_CONTROL_PHY_ADDRESS(INTERNAL_PHY_ADR>>1) | - PHY_CONTROL_PHY1_MODE(PHY_MODE_ALL) | - PHY_CONTROL_PHY1_AUTOMDIX | - PHY_CONTROL_PHY1_EN | - PHY_CONTROL_PHY0_MODE(PHY_MODE_ALL) | - PHY_CONTROL_PHY0_AUTOMDIX | - PHY_CONTROL_PHY0_EN | - PHY_CONTROL_CLK_XLATIN; - - val = readl(NETX_SYSTEM_IOC_ACCESS_KEY); - writel(val, NETX_SYSTEM_IOC_ACCESS_KEY); - - writel(phy_control | PHY_CONTROL_RESET, NETX_SYSTEM_PHY_CONTROL); - udelay(100); - - val = readl(NETX_SYSTEM_IOC_ACCESS_KEY); - writel(val, NETX_SYSTEM_IOC_ACCESS_KEY); - - writel(phy_control, NETX_SYSTEM_PHY_CONTROL); - - return platform_driver_register(&netx_eth_driver); -} - -static void __exit netx_eth_cleanup(void) -{ - platform_driver_unregister(&netx_eth_driver); -} - -module_init(netx_eth_init); -module_exit(netx_eth_cleanup); - -MODULE_AUTHOR("Sascha Hauer, Pengutronix"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:" CARDNAME); -MODULE_FIRMWARE("xc0.bin"); -MODULE_FIRMWARE("xc1.bin"); -MODULE_FIRMWARE("xc2.bin"); diff --git a/include/linux/platform_data/eth-netx.h b/include/linux/platform_data/eth-netx.h deleted file mode 100644 index a3a6322668d8..000000000000 --- a/include/linux/platform_data/eth-netx.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2005 Sascha Hauer , Pengutronix - */ - -#ifndef __ETH_NETX_H -#define __ETH_NETX_H - -struct netxeth_platform_data { - unsigned int xcno; /* number of xmac/xpec engine this eth uses */ -}; - -#endif From 62794fc4fbf52f2209dc094ea255eaef760e7d01 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 18 Sep 2019 16:24:12 -0700 Subject: [PATCH 0772/2880] net_sched: add max len check for TCA_KIND The TCA_KIND attribute is of NLA_STRING which does not check the NUL char. KMSAN reported an uninit-value of TCA_KIND which is likely caused by the lack of NUL. Change it to NLA_NUL_STRING and add a max len too. Fixes: 8b4c3cdd9dd8 ("net: sched: Add policy validation for tc attributes") Reported-and-tested-by: syzbot+618aacd49e8c8b8486bd@syzkaller.appspotmail.com Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Reviewed-by: David Ahern Acked-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1047825d9f48..81d58b280612 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1390,7 +1390,8 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) } const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { - [TCA_KIND] = { .type = NLA_STRING }, + [TCA_KIND] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 }, [TCA_RATE] = { .type = NLA_BINARY, .len = sizeof(struct tc_estimator) }, [TCA_STAB] = { .type = NLA_NESTED }, From 199ce850ce112315cfc68d42b694bcaa27b097b7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 18 Sep 2019 18:44:43 -0700 Subject: [PATCH 0773/2880] net_sched: add policy validation for action attributes Similar to commit 8b4c3cdd9dd8 ("net: sched: Add policy validation for tc attributes"), we need to add proper policy validation for TC action attributes too. Cc: David Ahern Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/sched/act_api.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 339712296164..2558f00f6b3e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -831,6 +831,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { + [TCA_ACT_KIND] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 }, + [TCA_ACT_INDEX] = { .type = NLA_U32 }, + [TCA_ACT_COOKIE] = { .type = NLA_BINARY, + .len = TC_COOKIE_MAX_SIZE }, + [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, +}; + struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, @@ -846,8 +855,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; if (name == NULL) { - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, - extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; @@ -856,18 +865,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; } - if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { - NL_SET_ERR_MSG(extack, "TC action name too long"); - goto err_out; - } + nla_strlcpy(act_name, kind, IFNAMSIZ); + if (tb[TCA_ACT_COOKIE]) { - int cklen = nla_len(tb[TCA_ACT_COOKIE]); - - if (cklen > TC_COOKIE_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "TC cookie size above the maximum"); - goto err_out; - } - cookie = nla_memdup_cookie(tb); if (!cookie) { NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); @@ -1098,7 +1098,8 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, int index; int err; - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; @@ -1152,7 +1153,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, b = skb_tail_pointer(skb); - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); if (err < 0) goto err_out; @@ -1440,7 +1442,7 @@ static struct nlattr *find_dump_kind(struct nlattr **nla) if (tb[1] == NULL) return NULL; - if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0) + if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; From a8d570de0cc6acd6dc994515f163ccbc5e54ab60 Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Thu, 19 Sep 2019 14:38:19 +0800 Subject: [PATCH 0774/2880] net: dsa: sja1105: Add dependency for NET_DSA_SJA1105_TAS If CONFIG_NET_DSA_SJA1105_TAS=y and CONFIG_NET_SCH_TAPRIO=n, below error can be found: drivers/net/dsa/sja1105/sja1105_tas.o: In function `sja1105_setup_tc_taprio': sja1105_tas.c:(.text+0x318): undefined reference to `taprio_offload_free' sja1105_tas.c:(.text+0x590): undefined reference to `taprio_offload_get' drivers/net/dsa/sja1105/sja1105_tas.o: In function `sja1105_tas_teardown': sja1105_tas.c:(.text+0x610): undefined reference to `taprio_offload_free' make: *** [vmlinux] Error 1 sja1105_tas needs tc-taprio, so this patch add the dependency for it. Fixes: 317ab5b86c8e ("net: dsa: sja1105: Configure the Time-Aware Scheduler via tc-taprio offload") Signed-off-by: Mao Wenan Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/sja1105/Kconfig b/drivers/net/dsa/sja1105/Kconfig index 55424f39cb0d..f40b248f0b23 100644 --- a/drivers/net/dsa/sja1105/Kconfig +++ b/drivers/net/dsa/sja1105/Kconfig @@ -27,6 +27,7 @@ config NET_DSA_SJA1105_PTP config NET_DSA_SJA1105_TAS bool "Support for the Time-Aware Scheduler on NXP SJA1105" depends on NET_DSA_SJA1105 + depends on NET_SCH_TAPRIO help This enables support for the TTEthernet-based egress scheduling engine in the SJA1105 DSA driver, which is controlled using a From b6b6cc9acd7b99df216c007e9565aebdc2c77469 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Sep 2019 14:33:43 +0200 Subject: [PATCH 0775/2880] net: stmmac: selftest: avoid large stack usage Putting a struct stmmac_rss object on the stack is a bad idea, as it exceeds the warning limit for a stack frame on 32-bit architectures: drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c:1221:12: error: stack frame size of 1208 bytes in function '__stmmac_test_l3filt' [-Werror,-Wframe-larger-than=] drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c:1338:12: error: stack frame size of 1208 bytes in function '__stmmac_test_l4filt' [-Werror,-Wframe-larger-than=] As the object is the trivial empty case, change the called function to accept a NULL pointer to mean the same thing and remove the large variable in the two callers. Fixes: 4647e021193d ("net: stmmac: selftests: Add selftest for L3/L4 Filters") Signed-off-by: Arnd Bergmann Acked-by: Jose Abreu Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 5 ++--- .../net/ethernet/stmicro/stmmac/stmmac_selftests.c | 14 ++++---------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index d5173dd02a71..2b277b2c586b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -523,19 +523,18 @@ static int dwxgmac2_rss_configure(struct mac_device_info *hw, struct stmmac_rss *cfg, u32 num_rxq) { void __iomem *ioaddr = hw->pcsr; - u32 *key = (u32 *)cfg->key; int i, ret; u32 value; value = readl(ioaddr + XGMAC_RSS_CTRL); - if (!cfg->enable) { + if (!cfg || !cfg->enable) { value &= ~XGMAC_RSSE; writel(value, ioaddr + XGMAC_RSS_CTRL); return 0; } for (i = 0; i < (sizeof(cfg->key) / sizeof(u32)); i++) { - ret = dwxgmac2_rss_write_reg(ioaddr, true, i, *key++); + ret = dwxgmac2_rss_write_reg(ioaddr, true, i, cfg->key[i]); if (ret) return ret; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c index c56e89e1ae56..9c8d210b2d6a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c @@ -1233,12 +1233,9 @@ static int __stmmac_test_l3filt(struct stmmac_priv *priv, u32 dst, u32 src, return -EOPNOTSUPP; if (!priv->dma_cap.l3l4fnum) return -EOPNOTSUPP; - if (priv->rss.enable) { - struct stmmac_rss rss = { .enable = false, }; - - stmmac_rss_configure(priv, priv->hw, &rss, + if (priv->rss.enable) + stmmac_rss_configure(priv, priv->hw, NULL, priv->plat->rx_queues_to_use); - } dissector = kzalloc(sizeof(*dissector), GFP_KERNEL); if (!dissector) { @@ -1357,12 +1354,9 @@ static int __stmmac_test_l4filt(struct stmmac_priv *priv, u32 dst, u32 src, return -EOPNOTSUPP; if (!priv->dma_cap.l3l4fnum) return -EOPNOTSUPP; - if (priv->rss.enable) { - struct stmmac_rss rss = { .enable = false, }; - - stmmac_rss_configure(priv, priv->hw, &rss, + if (priv->rss.enable) + stmmac_rss_configure(priv, priv->hw, NULL, priv->plat->rx_queues_to_use); - } dissector = kzalloc(sizeof(*dissector), GFP_KERNEL); if (!dissector) { From 24ccb0ab95bf14e414cf2ba65af5458bc5a2e865 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 20 Sep 2019 06:56:56 +0200 Subject: [PATCH 0776/2880] qede: qede_fp: simplify a bit 'qede_rx_build_skb()' Use 'skb_put_data()' instead of rewritting it. This improves readability. Signed-off-by: Christophe JAILLET Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 0ae28f0d2523..004c0bfec41d 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -779,8 +779,7 @@ qede_rx_build_skb(struct qede_dev *edev, return NULL; skb_reserve(skb, pad); - memcpy(skb_put(skb, len), - page_address(bd->data) + offset, len); + skb_put_data(skb, page_address(bd->data) + offset, len); qede_reuse_page(rxq, bd); goto out; } From eb09b3cc464d2c3bbde9a6648603c8d599ea8582 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 22 Sep 2019 10:01:05 -0600 Subject: [PATCH 0777/2880] pktcdvd: remove warning on attempting to register non-passthrough dev Anatoly reports that he gets the below warning when booting -git on a sparc64 box on debian unstable: ... [ 13.352975] aes_sparc64: Using sparc64 aes opcodes optimized AES implementation [ 13.428002] ------------[ cut here ]------------ [ 13.428081] WARNING: CPU: 21 PID: 586 at drivers/block/pktcdvd.c:2597 pkt_setup_dev+0x2e4/0x5a0 [pktcdvd] [ 13.428147] Attempt to register a non-SCSI queue [ 13.428184] Modules linked in: pktcdvd libdes cdrom aes_sparc64 n2_rng md5_sparc64 sha512_sparc64 rng_core sha256_sparc64 flash sha1_sparc64 ip_tables x_tables ipv6 crc_ccitt nf_defrag_ipv6 autofs4 ext4 crc16 mbcache jbd2 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear md_mod crc32c_sparc64 [ 13.428452] CPU: 21 PID: 586 Comm: pktsetup Not tainted 5.3.0-10169-g574cc4539762 #1234 [ 13.428507] Call Trace: [ 13.428542] [00000000004635c0] __warn+0xc0/0x100 [ 13.428582] [0000000000463634] warn_slowpath_fmt+0x34/0x60 [ 13.428626] [000000001045b244] pkt_setup_dev+0x2e4/0x5a0 [pktcdvd] [ 13.428674] [000000001045ccf4] pkt_ctl_ioctl+0x94/0x220 [pktcdvd] [ 13.428724] [00000000006b95c8] do_vfs_ioctl+0x628/0x6e0 [ 13.428764] [00000000006b96c8] ksys_ioctl+0x48/0x80 [ 13.428803] [00000000006b9714] sys_ioctl+0x14/0x40 [ 13.428847] [0000000000406294] linux_sparc_syscall+0x34/0x44 [ 13.428890] irq event stamp: 4181 [ 13.428924] hardirqs last enabled at (4189): [<00000000004e0a74>] console_unlock+0x634/0x6c0 [ 13.428984] hardirqs last disabled at (4196): [<00000000004e0540>] console_unlock+0x100/0x6c0 [ 13.429048] softirqs last enabled at (3978): [<0000000000b2e2d8>] __do_softirq+0x498/0x520 [ 13.429110] softirqs last disabled at (3967): [<000000000042cfb4>] do_softirq_own_stack+0x34/0x60 [ 13.429172] ---[ end trace 2220ca468f32967d ]--- [ 13.430018] pktcdvd: setup of pktcdvd device failed [ 13.455589] des_sparc64: Using sparc64 des opcodes optimized DES implementation [ 13.515334] camellia_sparc64: Using sparc64 camellia opcodes optimized CAMELLIA implementation [ 13.522856] pktcdvd: setup of pktcdvd device failed [ 13.529327] pktcdvd: setup of pktcdvd device failed [ 13.532932] pktcdvd: setup of pktcdvd device failed [ 13.536165] pktcdvd: setup of pktcdvd device failed [ 13.539372] pktcdvd: setup of pktcdvd device failed [ 13.542834] pktcdvd: setup of pktcdvd device failed [ 13.546536] pktcdvd: setup of pktcdvd device failed [ 15.431071] XFS (dm-0): Mounting V5 Filesystem ... Apparently debian auto-attaches any cdrom like device to pktcdvd, which can lead to the above warning. There's really no reason to warn for this situation, kill it. Reported-by: Anatoly Pugachev Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 024060165afa..76457003f140 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2594,7 +2594,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (ret) return ret; if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { - WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); return -EINVAL; } From dd89d82e751473d13da0daea1bfb15d5634071c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 15 May 2019 12:34:21 +0300 Subject: [PATCH 0778/2880] thermal: thermal_mmio: remove some dead code The platform_get_resource() function doesn't return error pointers, it returns NULL. The way this is normally done, is that we pass the NULL resource to devm_ioremap_resource() and then check for errors from that. See the comment in front of devm_ioremap_resource() for more details. Signed-off-by: Dan Carpenter Acked-by: Talel Shenhar Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_mmio.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/thermal/thermal_mmio.c b/drivers/thermal/thermal_mmio.c index de3cceea23bc..40524fa13533 100644 --- a/drivers/thermal/thermal_mmio.c +++ b/drivers/thermal/thermal_mmio.c @@ -53,13 +53,6 @@ static int thermal_mmio_probe(struct platform_device *pdev) return -ENOMEM; resource = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (IS_ERR(resource)) { - dev_err(&pdev->dev, - "fail to get platform memory resource (%ld)\n", - PTR_ERR(resource)); - return PTR_ERR(resource); - } - sensor->mmio_base = devm_ioremap_resource(&pdev->dev, resource); if (IS_ERR(sensor->mmio_base)) { dev_err(&pdev->dev, "failed to ioremap memory (%ld)\n", From ff04cfbaa23644562f369eeca0b44ef66e185c9e Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Sun, 22 Sep 2019 13:38:08 +0800 Subject: [PATCH 0779/2880] net: ena: Select DIMLIB for ENA_ETHERNET If CONFIG_ENA_ETHERNET=y and CONFIG_DIMLIB=n, below erros can be found: drivers/net/ethernet/amazon/ena/ena_netdev.o: In function `ena_dim_work': ena_netdev.c:(.text+0x21cc): undefined reference to `net_dim_get_rx_moderation' ena_netdev.c:(.text+0x21cc): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `net_dim_get_rx_moderation' drivers/net/ethernet/amazon/ena/ena_netdev.o: In function `ena_io_poll': ena_netdev.c:(.text+0x7bd4): undefined reference to `net_dim' ena_netdev.c:(.text+0x7bd4): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `net_dim' After commit 282faf61a053 ("net: ena: switch to dim algorithm for rx adaptive interrupt moderation"), it introduces dim algorithm, which configured by CONFIG_DIMLIB. So, this patch is to select DIMLIB for ENA_ETHERNET. Fixes: 282faf61a053 ("net: ena: switch to dim algorithm for rx adaptive interrupt moderation") Signed-off-by: Mao Wenan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/amazon/Kconfig b/drivers/net/ethernet/amazon/Kconfig index 69ca99d8ac26..cca72a75f551 100644 --- a/drivers/net/ethernet/amazon/Kconfig +++ b/drivers/net/ethernet/amazon/Kconfig @@ -19,6 +19,7 @@ if NET_VENDOR_AMAZON config ENA_ETHERNET tristate "Elastic Network Adapter (ENA) support" depends on PCI_MSI && !CPU_BIG_ENDIAN + select DIMLIB ---help--- This driver supports Elastic Network Adapter (ENA)" From 73a63ee9955494e18a6f7d9ba396f5e78e3ce307 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 21 Sep 2019 08:59:26 +0300 Subject: [PATCH 0780/2880] ionic: Fix an error code in ionic_lif_alloc() We need to set the error code on this path. Otherwise it probably results in a NULL dereference down the line. Fixes: aa3198819bea ("ionic: Add RSS support") Signed-off-by: Dan Carpenter Acked-by: Shannon Nelson Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index db7c82742828..72107a0627a9 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1704,6 +1704,7 @@ static struct ionic_lif *ionic_lif_alloc(struct ionic *ionic, unsigned int index GFP_KERNEL); if (!lif->rss_ind_tbl) { + err = -ENOMEM; dev_err(dev, "Failed to allocate rss indirection table, aborting\n"); goto err_out_free_qcqs; } From 938e4d49c26eee7b6cb39f2d516126ac39391511 Mon Sep 17 00:00:00 2001 From: Nishad Kamdar Date: Sat, 21 Sep 2019 19:00:16 +0530 Subject: [PATCH 0781/2880] net: dsa: b53: Use the correct style for SPDX License Identifier This patch corrects the SPDX License Identifier style in header file for Broadcom BCM53xx managed switch driver. For C header files Documentation/process/license-rules.rst mandates C-like comments (opposed to C source files where C++ style should be used) Changes made by using a script provided by Joe Perches here: https://lkml.org/lkml/2019/2/7/46. Suggested-by: Joe Perches Signed-off-by: Nishad Kamdar Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_serdes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/b53/b53_serdes.h b/drivers/net/dsa/b53/b53_serdes.h index 3bb4f91aec9e..55d280fe38e4 100644 --- a/drivers/net/dsa/b53/b53_serdes.h +++ b/drivers/net/dsa/b53/b53_serdes.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause - * +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* * Northstar Plus switch SerDes/SGMII PHY definitions * * Copyright (C) 2018 Florian Fainelli From 34b4688425d9841a19a15fa6ae2bfc12a372650f Mon Sep 17 00:00:00 2001 From: Nishad Kamdar Date: Sat, 21 Sep 2019 19:15:25 +0530 Subject: [PATCH 0782/2880] net: dsa: Use the correct style for SPDX License Identifier This patch corrects the SPDX License Identifier style in header file for Distributed Switch Architecture drivers. For C header files Documentation/process/license-rules.rst mandates C-like comments (opposed to C source files where C++ style should be used) Changes made by using a script provided by Joe Perches here: https://lkml.org/lkml/2019/2/7/46. Suggested-by: Joe Perches Signed-off-by: Nishad Kamdar Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/lantiq_pce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/lantiq_pce.h b/drivers/net/dsa/lantiq_pce.h index 180663138e75..e2be31f3672a 100644 --- a/drivers/net/dsa/lantiq_pce.h +++ b/drivers/net/dsa/lantiq_pce.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * PCE microcode extracted from UGW 7.1.1 switch api * From d4f4de5e5ef8efde85febb6876cd3c8ab1631999 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Sep 2019 12:12:39 -0400 Subject: [PATCH 0783/2880] Fix the locking in dcache_readdir() and friends There are two problems in dcache_readdir() - one is that lockless traversal of the list needs non-trivial cooperation of d_alloc() (at least a switch to list_add_rcu(), and probably more than just that) and another is that it assumes that no removal will happen without the directory locked exclusive. Said assumption had always been there, never had been stated explicitly and is violated by several places in the kernel (devpts and selinuxfs). * replacement of next_positive() with different calling conventions: it returns struct list_head * instead of struct dentry *; the latter is passed in and out by reference, grabbing the result and dropping the original value. * scan is under ->d_lock. If we run out of timeslice, cursor is moved after the last position we'd reached and we reschedule; then the scan continues from that place. To avoid livelocks between multiple lseek() (with cursors getting moved past each other, never reaching the real entries) we always skip the cursors, need_resched() or not. * returned list_head * is either ->d_child of dentry we'd found or ->d_subdirs of parent (if we got to the end of the list). * dcache_readdir() and dcache_dir_lseek() switched to new helper. dcache_readdir() always holds a reference to dentry passed to dir_emit() now. Cursor is moved to just before the entry where dir_emit() has failed or into the very end of the list, if we'd run out. * move_cursor() eliminated - it had sucky calling conventions and after fixing that it became simply list_move() (in lseek and scan_positives) or list_move_tail() (in readdir). All operations with the list are under ->d_lock now, and we do not depend upon having all file removals done with parent locked exclusive anymore. Cc: stable@vger.kernel.org Reported-by: "zhengbin (A)" Signed-off-by: Al Viro --- fs/libfs.c | 134 +++++++++++++++++++++++++++-------------------------- 1 file changed, 69 insertions(+), 65 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index c9b2850c0f7c..8e023b08a240 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,58 +89,47 @@ int dcache_dir_close(struct inode *inode, struct file *file) EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ -static struct dentry *next_positive(struct dentry *parent, - struct list_head *from, - int count) +/* + * Returns an element of siblings' list. + * We are looking for th positive after

; if + * found, dentry is grabbed and passed to caller via *. + * If no such element exists, the anchor of list is returned + * and * is set to NULL. + */ +static struct list_head *scan_positives(struct dentry *cursor, + struct list_head *p, + loff_t count, + struct dentry **res) { - unsigned *seq = &parent->d_inode->i_dir_seq, n; - struct dentry *res; - struct list_head *p; - bool skipped; - int i; + struct dentry *dentry = cursor->d_parent, *found = NULL; -retry: - i = count; - skipped = false; - n = smp_load_acquire(seq) & ~1; - res = NULL; - rcu_read_lock(); - for (p = from->next; p != &parent->d_subdirs; p = p->next) { + spin_lock(&dentry->d_lock); + while ((p = p->next) != &dentry->d_subdirs) { struct dentry *d = list_entry(p, struct dentry, d_child); - if (!simple_positive(d)) { - skipped = true; - } else if (!--i) { - res = d; - break; + // we must at least skip cursors, to avoid livelocks + if (d->d_flags & DCACHE_DENTRY_CURSOR) + continue; + if (simple_positive(d) && !--count) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + found = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(found)) + break; + count = 1; + } + if (need_resched()) { + list_move(&cursor->d_child, p); + p = &cursor->d_child; + spin_unlock(&dentry->d_lock); + cond_resched(); + spin_lock(&dentry->d_lock); } } - rcu_read_unlock(); - if (skipped) { - smp_rmb(); - if (unlikely(*seq != n)) - goto retry; - } - return res; -} - -static void move_cursor(struct dentry *cursor, struct list_head *after) -{ - struct dentry *parent = cursor->d_parent; - unsigned n, *seq = &parent->d_inode->i_dir_seq; - spin_lock(&parent->d_lock); - for (;;) { - n = *seq; - if (!(n & 1) && cmpxchg(seq, n, n + 1) == n) - break; - cpu_relax(); - } - __list_del(cursor->d_child.prev, cursor->d_child.next); - if (after) - list_add(&cursor->d_child, after); - else - list_add_tail(&cursor->d_child, &parent->d_subdirs); - smp_store_release(seq, n + 2); - spin_unlock(&parent->d_lock); + spin_unlock(&dentry->d_lock); + dput(*res); + *res = found; + return p; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) @@ -158,17 +147,28 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) return -EINVAL; } if (offset != file->f_pos) { - file->f_pos = offset; - if (file->f_pos >= 2) { - struct dentry *cursor = file->private_data; - struct dentry *to; - loff_t n = file->f_pos - 2; + struct dentry *cursor = file->private_data; + struct dentry *to = NULL; + struct list_head *p; - inode_lock_shared(dentry->d_inode); - to = next_positive(dentry, &dentry->d_subdirs, n); - move_cursor(cursor, to ? &to->d_child : NULL); - inode_unlock_shared(dentry->d_inode); + file->f_pos = offset; + inode_lock_shared(dentry->d_inode); + + if (file->f_pos > 2) { + p = scan_positives(cursor, &dentry->d_subdirs, + file->f_pos - 2, &to); + spin_lock(&dentry->d_lock); + list_move(&cursor->d_child, p); + spin_unlock(&dentry->d_lock); + } else { + spin_lock(&dentry->d_lock); + list_del_init(&cursor->d_child); + spin_unlock(&dentry->d_lock); } + + dput(to); + + inode_unlock_shared(dentry->d_inode); } return offset; } @@ -190,25 +190,29 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *p = &cursor->d_child; - struct dentry *next; - bool moved = false; + struct list_head *anchor = &dentry->d_subdirs; + struct dentry *next = NULL; + struct list_head *p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) - p = &dentry->d_subdirs; - while ((next = next_positive(dentry, p, 1)) != NULL) { + p = anchor; + else + p = &cursor->d_child; + + while ((p = scan_positives(cursor, p, 1, &next)) != anchor) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) break; - moved = true; - p = &next->d_child; ctx->pos++; } - if (moved) - move_cursor(cursor, p); + spin_lock(&dentry->d_lock); + list_move_tail(&cursor->d_child, p); + spin_unlock(&dentry->d_lock); + dput(next); + return 0; } EXPORT_SYMBOL(dcache_readdir); From 8581d51055a08cc6eb061c8856062290e8582ce4 Mon Sep 17 00:00:00 2001 From: "Lowry Li (Arm Technology China)" Date: Wed, 31 Jul 2019 11:04:38 +0000 Subject: [PATCH 0784/2880] drm: Free the writeback_job when it with an empty fb Adds the check if the writeback_job with an empty fb, then it should be freed in atomic_check phase. With this change, the driver users will not check empty fb case any more. So refined accordingly. Signed-off-by: Lowry Li (Arm Technology China) Reviewed-by: Liviu Dudau Reviewed-by: James Qian Wang (Arm Technology China) Signed-off-by: james qian wang (Arm Technology China) Link: https://patchwork.freedesktop.org/patch/msgid/1564571048-15029-2-git-send-email-lowry.li@arm.com --- .../drm/arm/display/komeda/komeda_wb_connector.c | 3 +-- drivers/gpu/drm/arm/malidp_mw.c | 4 ++-- drivers/gpu/drm/drm_atomic.c | 13 +++++++++---- drivers/gpu/drm/rcar-du/rcar_du_writeback.c | 4 ++-- drivers/gpu/drm/vc4/vc4_txp.c | 5 ++--- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c index 2851cac94d86..23fbee268119 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c @@ -43,9 +43,8 @@ komeda_wb_encoder_atomic_check(struct drm_encoder *encoder, struct komeda_data_flow_cfg dflow; int err; - if (!writeback_job || !writeback_job->fb) { + if (!writeback_job) return 0; - } if (!crtc_st->active) { DRM_DEBUG_ATOMIC("Cannot write the composition result out on a inactive CRTC.\n"); diff --git a/drivers/gpu/drm/arm/malidp_mw.c b/drivers/gpu/drm/arm/malidp_mw.c index 2e812525025d..a59227b2cdb5 100644 --- a/drivers/gpu/drm/arm/malidp_mw.c +++ b/drivers/gpu/drm/arm/malidp_mw.c @@ -130,7 +130,7 @@ malidp_mw_encoder_atomic_check(struct drm_encoder *encoder, struct drm_framebuffer *fb; int i, n_planes; - if (!conn_state->writeback_job || !conn_state->writeback_job->fb) + if (!conn_state->writeback_job) return 0; fb = conn_state->writeback_job->fb; @@ -247,7 +247,7 @@ void malidp_mw_atomic_commit(struct drm_device *drm, mw_state = to_mw_state(conn_state); - if (conn_state->writeback_job && conn_state->writeback_job->fb) { + if (conn_state->writeback_job) { struct drm_framebuffer *fb = conn_state->writeback_job->fb; DRM_DEV_DEBUG_DRIVER(drm->dev, diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 419381abbdd1..14aeaf736321 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -430,10 +430,15 @@ static int drm_atomic_connector_check(struct drm_connector *connector, return -EINVAL; } - if (writeback_job->out_fence && !writeback_job->fb) { - DRM_DEBUG_ATOMIC("[CONNECTOR:%d:%s] requesting out-fence without framebuffer\n", - connector->base.id, connector->name); - return -EINVAL; + if (!writeback_job->fb) { + if (writeback_job->out_fence) { + DRM_DEBUG_ATOMIC("[CONNECTOR:%d:%s] requesting out-fence without framebuffer\n", + connector->base.id, connector->name); + return -EINVAL; + } + + drm_writeback_cleanup_job(writeback_job); + state->writeback_job = NULL; } return 0; diff --git a/drivers/gpu/drm/rcar-du/rcar_du_writeback.c b/drivers/gpu/drm/rcar-du/rcar_du_writeback.c index ae07290bba6a..04efa78d70b6 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_writeback.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_writeback.c @@ -147,7 +147,7 @@ static int rcar_du_wb_enc_atomic_check(struct drm_encoder *encoder, struct drm_device *dev = encoder->dev; struct drm_framebuffer *fb; - if (!conn_state->writeback_job || !conn_state->writeback_job->fb) + if (!conn_state->writeback_job) return 0; fb = conn_state->writeback_job->fb; @@ -221,7 +221,7 @@ void rcar_du_writeback_setup(struct rcar_du_crtc *rcrtc, unsigned int i; state = rcrtc->writeback.base.state; - if (!state || !state->writeback_job || !state->writeback_job->fb) + if (!state || !state->writeback_job) return; fb = state->writeback_job->fb; diff --git a/drivers/gpu/drm/vc4/vc4_txp.c b/drivers/gpu/drm/vc4/vc4_txp.c index 96f91c1b4b6e..e92fa1275034 100644 --- a/drivers/gpu/drm/vc4/vc4_txp.c +++ b/drivers/gpu/drm/vc4/vc4_txp.c @@ -229,7 +229,7 @@ static int vc4_txp_connector_atomic_check(struct drm_connector *conn, int i; conn_state = drm_atomic_get_new_connector_state(state, conn); - if (!conn_state->writeback_job || !conn_state->writeback_job->fb) + if (!conn_state->writeback_job) return 0; crtc_state = drm_atomic_get_new_crtc_state(state, conn_state->crtc); @@ -269,8 +269,7 @@ static void vc4_txp_connector_atomic_commit(struct drm_connector *conn, u32 ctrl; int i; - if (WARN_ON(!conn_state->writeback_job || - !conn_state->writeback_job->fb)) + if (WARN_ON(!conn_state->writeback_job)) return; mode = &conn_state->crtc->state->adjusted_mode; From b1066a123538044117f0a78ba8c6a50cf5a04c86 Mon Sep 17 00:00:00 2001 From: "Lowry Li (Arm Technology China)" Date: Wed, 31 Jul 2019 11:04:45 +0000 Subject: [PATCH 0785/2880] drm: Clear the fence pointer when writeback job signaled During it signals the completion of a writeback job, after releasing the out_fence, we'd clear the pointer. Check if fence left over in drm_writeback_cleanup_job(), release it. Signed-off-by: Lowry Li (Arm Technology China) Reviewed-by: Brian Starkey Reviewed-by: James Qian Wang (Arm Technology China) Signed-off-by: james qian wang (Arm Technology China) Link: https://patchwork.freedesktop.org/patch/msgid/1564571048-15029-3-git-send-email-lowry.li@arm.com --- drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c index ff138b6ec48b..43d9e3bb3a94 100644 --- a/drivers/gpu/drm/drm_writeback.c +++ b/drivers/gpu/drm/drm_writeback.c @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job) if (job->fb) drm_framebuffer_put(job->fb); + if (job->out_fence) + dma_fence_put(job->out_fence); + kfree(job); } EXPORT_SYMBOL(drm_writeback_cleanup_job); @@ -366,25 +369,29 @@ drm_writeback_signal_completion(struct drm_writeback_connector *wb_connector, { unsigned long flags; struct drm_writeback_job *job; + struct dma_fence *out_fence; spin_lock_irqsave(&wb_connector->job_lock, flags); job = list_first_entry_or_null(&wb_connector->job_queue, struct drm_writeback_job, list_entry); - if (job) { + if (job) list_del(&job->list_entry); - if (job->out_fence) { - if (status) - dma_fence_set_error(job->out_fence, status); - dma_fence_signal(job->out_fence); - dma_fence_put(job->out_fence); - } - } + spin_unlock_irqrestore(&wb_connector->job_lock, flags); if (WARN_ON(!job)) return; + out_fence = job->out_fence; + if (out_fence) { + if (status) + dma_fence_set_error(out_fence, status); + dma_fence_signal(out_fence); + dma_fence_put(out_fence); + job->out_fence = NULL; + } + INIT_WORK(&job->cleanup_work, cleanup_work); queue_work(system_long_wq, &job->cleanup_work); } From be21683e48f26032199504fb60b9a27eeff05fc3 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 22 Sep 2019 12:46:55 +0300 Subject: [PATCH 0786/2880] block: t10-pi: fix -Wswitch warning Changing the switch() statement to symbolic constants made the compiler (at least clang-9, did not check gcc) notice that there is one enum value that is not handled here: block/t10-pi.c:62:11: error: enumeration value 'T10_PI_TYPE0_PROTECTION' not handled in switch [-Werror,-Wswitch] Add a BUG_ON statement if we ever get to t10_pi_verify function with TYPE0 and replace the switch() statement with if/else clause for the valid types. Fixes: 9b2061b1a262 ("block: use symbolic constants for t10_pi type") Cc: Arnd Bergmann Signed-off-by: Max Gurtovoy Signed-off-by: Jens Axboe --- block/t10-pi.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/block/t10-pi.c b/block/t10-pi.c index 0c0120a672f9..9803c7e0376e 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -55,13 +55,14 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, { unsigned int i; + BUG_ON(type == T10_PI_TYPE0_PROTECTION); + for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf; __be16 csum; - switch (type) { - case T10_PI_TYPE1_PROTECTION: - case T10_PI_TYPE2_PROTECTION: + if (type == T10_PI_TYPE1_PROTECTION || + type == T10_PI_TYPE2_PROTECTION) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -73,12 +74,10 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, iter->seed, be32_to_cpu(pi->ref_tag)); return BLK_STS_PROTECTION; } - break; - case T10_PI_TYPE3_PROTECTION: + } else if (type == T10_PI_TYPE3_PROTECTION) { if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; - break; } csum = fn(iter->data_buf, iter->interval); From 4ec8d984895fef43abfc896c8a3346aca7a66649 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 20 Sep 2019 16:03:56 -0700 Subject: [PATCH 0787/2880] perf record: Fix priv level with branch sampling for paranoid=2 Now that the default perf_events paranoid level is set to 2, a regular user cannot monitor kernel level activity anymore. As such, with the following cmdline: $ perf record -e cycles date The perf tool first tries cycles:uk but then falls back to cycles:u as can be seen in the perf report --header-only output: cmdline : /export/hda3/tmp/perf.tip record -e cycles ls event : name = cycles:u, , id = { 436186, ... } This is okay as long as there is way to learn the priv level was changed internally by the tool. But consider a similar example: $ perf record -b -e cycles date Error: You may not have permission to collect stats. Consider tweaking /proc/sys/kernel/perf_event_paranoid, which controls use of the performance events system by unprivileged users (without CAP_SYS_ADMIN). ... Why is that treated differently given that the branch sampling inherits the priv level of the first event in this case, i.e., cycles:u? It turns out that the branch sampling code is more picky and also checks exclude_hv. In the fallback path, perf record is setting exclude_kernel = 1, but it does not change exclude_hv. This does not seem to match the restriction imposed by paranoid = 2. This patch fixes the problem by forcing exclude_hv = 1 in the fallback for paranoid=2. With this in place: $ perf record -b -e cycles date cmdline : /export/hda3/tmp/perf.tip record -b -e cycles ls event : name = cycles:u, , id = { 436847, ... } And the command succeeds as expected. V2 fix a white space. Committer testing: After aplying the patch we get: [acme@quaco ~]$ perf record -b -e cycles date WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted, check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid. Samples in kernel functions may not be resolved if a suitable vmlinux file is not found in the buildid cache or in the vmlinux path. Samples in kernel modules won't be resolved at all. If some relocation was applied (e.g. kexec) symbols may be misresolved even with a suitable vmlinux or kallsyms file. Mon 23 Sep 2019 11:00:59 AM -03 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.005 MB perf.data (14 samples) ] [acme@quaco ~]$ perf evlist -v cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: ANY [acme@quaco ~]$ That warning about restricted kernel maps will be suppressed in a follow up patch, as perf_event_attr.exclude_kernel is set, i.e. no samples for the kernel will be taken and thus no need for those maps. Signed-off-by: Stephane Eranian Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190920230356.41420-1-eranian@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 8e335d168503..502bc3d50e0d 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2535,9 +2535,11 @@ bool perf_evsel__fallback(struct evsel *evsel, int err, if (evsel->name) free(evsel->name); evsel->name = new_name; - scnprintf(msg, msgsize, -"kernel.perf_event_paranoid=%d, trying to fall back to excluding kernel samples", paranoid); + scnprintf(msg, msgsize, "kernel.perf_event_paranoid=%d, trying " + "to fall back to excluding kernel and hypervisor " + " samples", paranoid); evsel->core.attr.exclude_kernel = 1; + evsel->core.attr.exclude_hv = 1; return true; } From c8b567c8a96a35acff746b1eef6f859e2a9fc195 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 23 Sep 2019 11:07:29 -0300 Subject: [PATCH 0788/2880] perf record: Move restricted maps check to after a possible fallback to not collect kernel samples Before: [acme@quaco ~]$ perf record -b -e cycles date WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted, check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid. Samples in kernel functions may not be resolved if a suitable vmlinux file is not found in the buildid cache or in the vmlinux path. Samples in kernel modules won't be resolved at all. If some relocation was applied (e.g. kexec) symbols may be misresolved even with a suitable vmlinux or kallsyms file. Mon 23 Sep 2019 11:00:59 AM -03 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.005 MB perf.data (14 samples) ] [acme@quaco ~]$ But we did a fallback and exclude_kernel was set, so no need for resolving kernel symbols: $ perf evlist -v cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: ANY $ After: [acme@quaco ~]$ perf record -b -e cycles date Mon 23 Sep 2019 11:07:18 AM -03 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.007 MB perf.data (16 samples) ] [acme@quaco ~]$ perf evlist -v cycles:u: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: ANY [acme@quaco ~]$ No needless warning is emitted. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: https://lkml.kernel.org/n/tip-5yqnr8xcqwhr15xktj2097ac@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 3f66a49a997f..1e1f97139f16 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -788,6 +788,17 @@ try_again: pos->supported = true; } + if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) { + pr_warning( +"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n" +"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n" +"Samples in kernel functions may not be resolved if a suitable vmlinux\n" +"file is not found in the buildid cache or in the vmlinux path.\n\n" +"Samples in kernel modules won't be resolved at all.\n\n" +"If some relocation was applied (e.g. kexec) symbols may be misresolved\n" +"even with a suitable vmlinux or kallsyms file.\n\n"); + } + if (perf_evlist__apply_filters(evlist, &pos)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", pos->filter, perf_evsel__name(pos), errno, @@ -2364,16 +2375,6 @@ int cmd_record(int argc, const char **argv) err = -ENOMEM; - if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist)) - pr_warning( -"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n" -"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n" -"Samples in kernel functions may not be resolved if a suitable vmlinux\n" -"file is not found in the buildid cache or in the vmlinux path.\n\n" -"Samples in kernel modules won't be resolved at all.\n\n" -"If some relocation was applied (e.g. kexec) symbols may be misresolved\n" -"even with a suitable vmlinux or kallsyms file.\n\n"); - if (rec->no_buildid_cache || rec->no_buildid) { disable_buildid_cache(); } else if (rec->switch_output.enabled) { From 88282297fff00796e81f5e67734a6afdfb31fbc4 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Mon, 26 Aug 2019 08:43:02 -0600 Subject: [PATCH 0789/2880] selftests/seccomp: fix build on older kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The seccomp selftest goes to some length to build against older kernel headers, viz. all the #ifdefs at the beginning of the file. Commit 201766a20e30 ("ptrace: add PTRACE_GET_SYSCALL_INFO request") introduces some additional macros, but doesn't do the #ifdef dance. Let's add that dance here to avoid: gcc -Wl,-no-as-needed -Wall seccomp_bpf.c -lpthread -o seccomp_bpf In file included from seccomp_bpf.c:51: seccomp_bpf.c: In function ‘tracer_ptrace’: seccomp_bpf.c:1787:20: error: ‘PTRACE_EVENTMSG_SYSCALL_ENTRY’ undeclared (first use in this function); did you mean ‘PTRACE_EVENT_CLONE’? EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../kselftest_harness.h:608:13: note: in definition of macro ‘__EXPECT’ __typeof__(_expected) __exp = (_expected); \ ^~~~~~~~~ seccomp_bpf.c:1787:2: note: in expansion of macro ‘EXPECT_EQ’ EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY ^~~~~~~~~ seccomp_bpf.c:1787:20: note: each undeclared identifier is reported only once for each function it appears in EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../kselftest_harness.h:608:13: note: in definition of macro ‘__EXPECT’ __typeof__(_expected) __exp = (_expected); \ ^~~~~~~~~ seccomp_bpf.c:1787:2: note: in expansion of macro ‘EXPECT_EQ’ EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY ^~~~~~~~~ seccomp_bpf.c:1788:6: error: ‘PTRACE_EVENTMSG_SYSCALL_EXIT’ undeclared (first use in this function); did you mean ‘PTRACE_EVENT_EXIT’? : PTRACE_EVENTMSG_SYSCALL_EXIT, msg); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../kselftest_harness.h:608:13: note: in definition of macro ‘__EXPECT’ __typeof__(_expected) __exp = (_expected); \ ^~~~~~~~~ seccomp_bpf.c:1787:2: note: in expansion of macro ‘EXPECT_EQ’ EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY ^~~~~~~~~ make: *** [Makefile:12: seccomp_bpf] Error 1 [skhan@linuxfoundation.org: Fix checkpatch error in commit log] Signed-off-by: Tycho Andersen Fixes: 201766a20e30 ("ptrace: add PTRACE_GET_SYSCALL_INFO request") Acked-by: Kees Cook Signed-off-by: Shuah Khan --- tools/testing/selftests/seccomp/seccomp_bpf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 6ef7f16c4cf5..7f8b5c8982e3 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -199,6 +199,11 @@ struct seccomp_notif_sizes { }; #endif +#ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY +#define PTRACE_EVENTMSG_SYSCALL_ENTRY 1 +#define PTRACE_EVENTMSG_SYSCALL_EXIT 2 +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { From a4864a33f56caa6592acc69a6d265f3f3dca5eae Mon Sep 17 00:00:00 2001 From: "George G. Davis" Date: Fri, 30 Aug 2019 15:32:23 -0400 Subject: [PATCH 0790/2880] selftests: watchdog: Add optional file argument Some systems have multiple watchdog devices where the first device registered is assigned to the /dev/watchdog device file. In order to test other watchdog devices, add an optional file argument for selecting non-default watchdog devices for testing. Tested-by: Eugeniu Rosca Signed-off-by: George G. Davis Signed-off-by: Shuah Khan --- .../selftests/watchdog/watchdog-test.c | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/watchdog/watchdog-test.c b/tools/testing/selftests/watchdog/watchdog-test.c index c2333c78cf04..6a68b486dd61 100644 --- a/tools/testing/selftests/watchdog/watchdog-test.c +++ b/tools/testing/selftests/watchdog/watchdog-test.c @@ -19,7 +19,7 @@ int fd; const char v = 'V'; -static const char sopts[] = "bdehp:t:Tn:NL"; +static const char sopts[] = "bdehp:t:Tn:NLf:"; static const struct option lopts[] = { {"bootstatus", no_argument, NULL, 'b'}, {"disable", no_argument, NULL, 'd'}, @@ -31,6 +31,7 @@ static const struct option lopts[] = { {"pretimeout", required_argument, NULL, 'n'}, {"getpretimeout", no_argument, NULL, 'N'}, {"gettimeleft", no_argument, NULL, 'L'}, + {"file", required_argument, NULL, 'f'}, {NULL, no_argument, NULL, 0x0} }; @@ -69,6 +70,8 @@ static void term(int sig) static void usage(char *progname) { printf("Usage: %s [options]\n", progname); + printf(" -f, --file Open watchdog device file\n"); + printf(" Default is /dev/watchdog\n"); printf(" -b, --bootstatus Get last boot status (Watchdog/POR)\n"); printf(" -d, --disable Turn off the watchdog timer\n"); printf(" -e, --enable Turn on the watchdog timer\n"); @@ -92,14 +95,20 @@ int main(int argc, char *argv[]) int ret; int c; int oneshot = 0; + char *file = "/dev/watchdog"; setbuf(stdout, NULL); - fd = open("/dev/watchdog", O_WRONLY); + while ((c = getopt_long(argc, argv, sopts, lopts, NULL)) != -1) { + if (c == 'f') + file = optarg; + } + + fd = open(file, O_WRONLY); if (fd == -1) { if (errno == ENOENT) - printf("Watchdog device not enabled.\n"); + printf("Watchdog device (%s) not found.\n", file); else if (errno == EACCES) printf("Run watchdog as root.\n"); else @@ -108,6 +117,8 @@ int main(int argc, char *argv[]) exit(-1); } + optind = 0; + while ((c = getopt_long(argc, argv, sopts, lopts, NULL)) != -1) { switch (c) { case 'b': @@ -190,6 +201,9 @@ int main(int argc, char *argv[]) else printf("WDIOC_GETTIMELEFT error '%s'\n", strerror(errno)); break; + case 'f': + /* Handled above */ + break; default: usage(argv[0]); From 861f47b07b0523f00ffb8159684e7c840390b96e Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sat, 3 Aug 2019 09:01:26 +0900 Subject: [PATCH 0791/2880] selftest/ftrace: Fix typo in trigger-snapshot.tc This patch fixes a spelling typo in trigger-snapshot.tc [skhan@linuxfoundation.org: Fix typo in commit log] Signed-off-by: Masanari Iida Acked-by: Steven Rostedt (VMware) Signed-off-by: Shuah Khan --- .../testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc index 7717c0a09686..ac738500d17f 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc @@ -28,7 +28,7 @@ if [ -z "$FEATURE" ]; then exit_unsupported fi -echo "Test snapshot tigger" +echo "Test snapshot trigger" echo 0 > snapshot echo 1 > events/sched/sched_process_fork/enable ( echo "forked") From a54344ace273cd7fa182c287719b109cfd9e8613 Mon Sep 17 00:00:00 2001 From: "George G. Davis" Date: Fri, 30 Aug 2019 18:31:33 -0400 Subject: [PATCH 0792/2880] selftests: watchdog: cleanup whitespace in usage options Convert hard spaces to tabs in usage options. Suggested-by: Shuah Khan Signed-off-by: George G. Davis Signed-off-by: Shuah Khan --- .../selftests/watchdog/watchdog-test.c | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/watchdog/watchdog-test.c b/tools/testing/selftests/watchdog/watchdog-test.c index 6a68b486dd61..afff120c7be6 100644 --- a/tools/testing/selftests/watchdog/watchdog-test.c +++ b/tools/testing/selftests/watchdog/watchdog-test.c @@ -70,18 +70,19 @@ static void term(int sig) static void usage(char *progname) { printf("Usage: %s [options]\n", progname); - printf(" -f, --file Open watchdog device file\n"); - printf(" Default is /dev/watchdog\n"); - printf(" -b, --bootstatus Get last boot status (Watchdog/POR)\n"); - printf(" -d, --disable Turn off the watchdog timer\n"); - printf(" -e, --enable Turn on the watchdog timer\n"); - printf(" -h, --help Print the help message\n"); - printf(" -p, --pingrate=P Set ping rate to P seconds (default %d)\n", DEFAULT_PING_RATE); - printf(" -t, --timeout=T Set timeout to T seconds\n"); - printf(" -T, --gettimeout Get the timeout\n"); - printf(" -n, --pretimeout=T Set the pretimeout to T seconds\n"); - printf(" -N, --getpretimeout Get the pretimeout\n"); - printf(" -L, --gettimeleft Get the time left until timer expires\n"); + printf(" -f, --file\t\tOpen watchdog device file\n"); + printf("\t\t\tDefault is /dev/watchdog\n"); + printf(" -b, --bootstatus\tGet last boot status (Watchdog/POR)\n"); + printf(" -d, --disable\t\tTurn off the watchdog timer\n"); + printf(" -e, --enable\t\tTurn on the watchdog timer\n"); + printf(" -h, --help\t\tPrint the help message\n"); + printf(" -p, --pingrate=P\tSet ping rate to P seconds (default %d)\n", + DEFAULT_PING_RATE); + printf(" -t, --timeout=T\tSet timeout to T seconds\n"); + printf(" -T, --gettimeout\tGet the timeout\n"); + printf(" -n, --pretimeout=T\tSet the pretimeout to T seconds\n"); + printf(" -N, --getpretimeout\tGet the pretimeout\n"); + printf(" -L, --gettimeleft\tGet the time left until timer expires\n"); printf("\n"); printf("Parameters are parsed left-to-right in real-time.\n"); printf("Example: %s -d -t 10 -p 5 -e\n", progname); From 955a0f3310081731a1756a1d7bd742600ee69ffc Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Wed, 14 Aug 2019 13:16:51 +0200 Subject: [PATCH 0793/2880] selftests: livepatch: add missing fragments to config When generating config with 'make defconfig kselftest-merge' fragment CONFIG_TEST_LIVEPATCH=m isn't set. Rework to enable CONFIG_LIVEPATCH and CONFIG_DYNAMIC_DEBUG as well. Signed-off-by: Anders Roxell Signed-off-by: Shuah Khan --- tools/testing/selftests/livepatch/config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/livepatch/config b/tools/testing/selftests/livepatch/config index 0dd7700464a8..ad23100cb27c 100644 --- a/tools/testing/selftests/livepatch/config +++ b/tools/testing/selftests/livepatch/config @@ -1 +1,3 @@ +CONFIG_LIVEPATCH=y +CONFIG_DYNAMIC_DEBUG=y CONFIG_TEST_LIVEPATCH=m From 721cb3c8bc8890e824b7be53bf951960ff7811f9 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Wed, 14 Aug 2019 13:14:35 +0200 Subject: [PATCH 0794/2880] selftests: tpm2: install python files Both test_smoke.sh and test_space.sh require tpm2.py and tpm2_test.py. Rework so that tpm2.py and tpm2_test.py gets installed, added them to the variable TEST_PROGS_EXTENDED. Signed-off-by: Anders Roxell Signed-off-by: Shuah Khan --- tools/testing/selftests/tpm2/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/tpm2/Makefile b/tools/testing/selftests/tpm2/Makefile index 9dd848427a7b..1a5db1eb8ed5 100644 --- a/tools/testing/selftests/tpm2/Makefile +++ b/tools/testing/selftests/tpm2/Makefile @@ -2,3 +2,4 @@ include ../lib.mk TEST_PROGS := test_smoke.sh test_space.sh +TEST_PROGS_EXTENDED := tpm2.py tpm2_tests.py From 65643f4c8217edb1f40f7fb05f996c3a4798442a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 13:58:59 +0800 Subject: [PATCH 0795/2880] nfsd: Make nfsd_reset_boot_verifier_locked static Fix sparse warning: fs/nfsd/nfssvc.c:364:6: warning: symbol 'nfsd_reset_boot_verifier_locked' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 3caaf5675259..fdf7ed4bd5dd 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -361,7 +361,7 @@ void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) done_seqretry(&nn->boot_lock, seq); } -void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) { ktime_get_real_ts64(&nn->nfssvc_boot); } From 32960613b7c3352ddf38c42596e28a16ae36335e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Sep 2019 11:05:34 -0600 Subject: [PATCH 0796/2880] io_uring: correctly handle non ->{read,write}_iter() file_operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we just -EINVAL a read or write to an fd that isn't backed by ->read_iter() or ->write_iter(). But we can handle them just fine, as long as we punt fo async context first. Implement a simple loop function for doing ->read() or ->write() instead, and ensure we call it appropriately. Reported-by: 李通洲 Signed-off-by: Jens Axboe --- fs/io_uring.c | 60 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d8e703bc851..ca7570aca430 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1298,6 +1298,51 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) } } +/* + * For files that don't have ->read_iter() and ->write_iter(), handle them + * by looping over ->read() or ->write() manually. + */ +static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, + struct iov_iter *iter) +{ + ssize_t ret = 0; + + /* + * Don't support polled IO through this interface, and we can't + * support non-blocking either. For the latter, this just causes + * the kiocb to be handled from an async context. + */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EOPNOTSUPP; + if (kiocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + while (iov_iter_count(iter)) { + struct iovec iovec = iov_iter_iovec(iter); + ssize_t nr; + + if (rw == READ) { + nr = file->f_op->read(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } else { + nr = file->f_op->write(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != iovec.iov_len) + break; + iov_iter_advance(iter, nr); + } + + return ret; +} + static int io_read(struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock) { @@ -1315,8 +1360,6 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - if (unlikely(!file->f_op->read_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); if (ret < 0) @@ -1331,7 +1374,11 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (!ret) { ssize_t ret2; - ret2 = call_read_iter(file, kiocb, &iter); + if (file->f_op->read_iter) + ret2 = call_read_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(READ, file, kiocb, &iter); + /* * In case of a short read, punt to async. This can happen * if we have data partially cached. Alternatively we can @@ -1376,8 +1423,6 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, file = kiocb->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; - if (unlikely(!file->f_op->write_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); if (ret < 0) @@ -1415,7 +1460,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, } kiocb->ki_flags |= IOCB_WRITE; - ret2 = call_write_iter(file, kiocb, &iter); + if (file->f_op->write_iter) + ret2 = call_write_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { io_rw_done(kiocb, ret2); } else { From d46fe2cb2dce7f5038473b5859e03f5e16b7428e Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 23 Sep 2019 14:02:02 +0000 Subject: [PATCH 0797/2880] block: drop device references in bsg_queue_rq() Make sure that bsg_queue_rq() calls put_device() if an error is encountered after get_device() was successful. Fixes: cd2f076f1d7a ("bsg: convert to use blk-mq") Signed-off-by: Martin Wilck Signed-off-by: Jens Axboe --- block/bsg-lib.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 785dd58947f1..347dda16c2f4 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -266,6 +266,7 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct bsg_set *bset = container_of(q->tag_set, struct bsg_set, tag_set); + int sts = BLK_STS_IOERR; int ret; blk_mq_start_request(req); @@ -274,14 +275,15 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_IOERR; if (!bsg_prepare_job(dev, req)) - return BLK_STS_IOERR; + goto out; ret = bset->job_fn(blk_mq_rq_to_pdu(req)); - if (ret) - return BLK_STS_IOERR; + if (!ret) + sts = BLK_STS_OK; +out: put_device(dev); - return BLK_STS_OK; + return sts; } /* called right after the request is allocated for the request_queue */ From dd8882a255388ba66175098b1560d4f81c100d30 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 23 Sep 2019 10:32:37 -0700 Subject: [PATCH 0798/2880] clk: ti: dra7: Fix mcasp8 clock bits There's a typo for dra7 mcasp clkctrl bit, it should be 22 like the other macasp instances, and not 24. And in dra7xx_clks[] we have the bits wrong way around. Fixes: dffa9051d546 ("clk: ti: dra7: add new clkctrl data") Cc: linux-clk@vger.kernel.org Cc: Michael Turquette Cc: Stephen Boyd Cc: Suman Anna Cc: Tero Kristo Acked-by: Stephen Boyd Signed-off-by: Tony Lindgren --- drivers/clk/ti/clk-7xx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/clk/ti/clk-7xx.c b/drivers/clk/ti/clk-7xx.c index b57fe09b428b..9dd6185a4b4e 100644 --- a/drivers/clk/ti/clk-7xx.c +++ b/drivers/clk/ti/clk-7xx.c @@ -683,7 +683,7 @@ static const struct omap_clkctrl_reg_data dra7_l4per2_clkctrl_regs[] __initconst { DRA7_L4PER2_MCASP2_CLKCTRL, dra7_mcasp2_bit_data, CLKF_SW_SUP, "l4per2-clkctrl:0154:22" }, { DRA7_L4PER2_MCASP3_CLKCTRL, dra7_mcasp3_bit_data, CLKF_SW_SUP, "l4per2-clkctrl:015c:22" }, { DRA7_L4PER2_MCASP5_CLKCTRL, dra7_mcasp5_bit_data, CLKF_SW_SUP, "l4per2-clkctrl:016c:22" }, - { DRA7_L4PER2_MCASP8_CLKCTRL, dra7_mcasp8_bit_data, CLKF_SW_SUP, "l4per2-clkctrl:0184:24" }, + { DRA7_L4PER2_MCASP8_CLKCTRL, dra7_mcasp8_bit_data, CLKF_SW_SUP, "l4per2-clkctrl:0184:22" }, { DRA7_L4PER2_MCASP4_CLKCTRL, dra7_mcasp4_bit_data, CLKF_SW_SUP, "l4per2-clkctrl:018c:22" }, { DRA7_L4PER2_UART7_CLKCTRL, dra7_uart7_bit_data, CLKF_SW_SUP, "l4per2-clkctrl:01c4:24" }, { DRA7_L4PER2_UART8_CLKCTRL, dra7_uart8_bit_data, CLKF_SW_SUP, "l4per2-clkctrl:01d4:24" }, @@ -828,8 +828,8 @@ static struct ti_dt_clk dra7xx_clks[] = { DT_CLK(NULL, "mcasp6_aux_gfclk_mux", "l4per2-clkctrl:01f8:22"), DT_CLK(NULL, "mcasp7_ahclkx_mux", "l4per2-clkctrl:01fc:24"), DT_CLK(NULL, "mcasp7_aux_gfclk_mux", "l4per2-clkctrl:01fc:22"), - DT_CLK(NULL, "mcasp8_ahclkx_mux", "l4per2-clkctrl:0184:22"), - DT_CLK(NULL, "mcasp8_aux_gfclk_mux", "l4per2-clkctrl:0184:24"), + DT_CLK(NULL, "mcasp8_ahclkx_mux", "l4per2-clkctrl:0184:24"), + DT_CLK(NULL, "mcasp8_aux_gfclk_mux", "l4per2-clkctrl:0184:22"), DT_CLK(NULL, "mmc1_clk32k", "l3init-clkctrl:0008:8"), DT_CLK(NULL, "mmc1_fclk_div", "l3init-clkctrl:0008:25"), DT_CLK(NULL, "mmc1_fclk_mux", "l3init-clkctrl:0008:24"), From 2d3c8ba3cffa00f76bedb713c8c2126c82d8cd13 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 23 Sep 2019 10:32:38 -0700 Subject: [PATCH 0799/2880] ARM: dts: Fix wrong clocks for dra7 mcasp The ahclkr clkctrl clock bit 28 only exists for mcasp 1 and 2 on dra7. This causes the following warning on beagle-x15: ti-sysc 48468000.target-module: could not add child clock ahclkr: -19 Also the mcasp clkctrl clock bits are wrong: For mcasp1 and 2 we have four clocks at bits 28, 24, 22 and 0: bit 28 is ahclkr bit 24 is ahclkx bit 22 is auxclk bit 0 is fck For mcasp3 to 8 we have three clocks at bits 24, 22 and 0. bit 24 is ahclkx bit 22 is auxclk bit 0 is fck We do not have currently mapped auxclk at bit 22 for the drivers, that can be added if needed. Fixes: 5241ccbf2819 ("ARM: dts: Add missing ranges for dra7 mcasp l3 ports") Cc: Suman Anna Cc: Tero Kristo Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/dra7-l4.dtsi | 48 +++++++++++++++------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/arch/arm/boot/dts/dra7-l4.dtsi b/arch/arm/boot/dts/dra7-l4.dtsi index 21e5914fdd62..099d6fe2a57a 100644 --- a/arch/arm/boot/dts/dra7-l4.dtsi +++ b/arch/arm/boot/dts/dra7-l4.dtsi @@ -2762,7 +2762,7 @@ interrupt-names = "tx", "rx"; dmas = <&edma_xbar 129 1>, <&edma_xbar 128 1>; dma-names = "tx", "rx"; - clocks = <&ipu_clkctrl DRA7_IPU_MCASP1_CLKCTRL 22>, + clocks = <&ipu_clkctrl DRA7_IPU_MCASP1_CLKCTRL 0>, <&ipu_clkctrl DRA7_IPU_MCASP1_CLKCTRL 24>, <&ipu_clkctrl DRA7_IPU_MCASP1_CLKCTRL 28>; clock-names = "fck", "ahclkx", "ahclkr"; @@ -2799,8 +2799,8 @@ interrupt-names = "tx", "rx"; dmas = <&edma_xbar 131 1>, <&edma_xbar 130 1>; dma-names = "tx", "rx"; - clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP2_CLKCTRL 22>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP2_CLKCTRL 24>, + clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP2_CLKCTRL 0>, + <&ipu_clkctrl DRA7_IPU_MCASP1_CLKCTRL 24>, <&l4per2_clkctrl DRA7_L4PER2_MCASP2_CLKCTRL 28>; clock-names = "fck", "ahclkx", "ahclkr"; status = "disabled"; @@ -2818,9 +2818,8 @@ ; /* Domains (P, C): l4per_pwrdm, l4per2_clkdm */ clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP3_CLKCTRL 0>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP3_CLKCTRL 24>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP3_CLKCTRL 28>; - clock-names = "fck", "ahclkx", "ahclkr"; + <&l4per2_clkctrl DRA7_L4PER2_MCASP3_CLKCTRL 24>; + clock-names = "fck", "ahclkx"; #address-cells = <1>; #size-cells = <1>; ranges = <0x0 0x68000 0x2000>, @@ -2836,7 +2835,7 @@ interrupt-names = "tx", "rx"; dmas = <&edma_xbar 133 1>, <&edma_xbar 132 1>; dma-names = "tx", "rx"; - clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP3_CLKCTRL 22>, + clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP3_CLKCTRL 0>, <&l4per2_clkctrl DRA7_L4PER2_MCASP3_CLKCTRL 24>; clock-names = "fck", "ahclkx"; status = "disabled"; @@ -2854,9 +2853,8 @@ ; /* Domains (P, C): l4per_pwrdm, l4per2_clkdm */ clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP4_CLKCTRL 0>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP4_CLKCTRL 24>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP4_CLKCTRL 28>; - clock-names = "fck", "ahclkx", "ahclkr"; + <&l4per2_clkctrl DRA7_L4PER2_MCASP4_CLKCTRL 24>; + clock-names = "fck", "ahclkx"; #address-cells = <1>; #size-cells = <1>; ranges = <0x0 0x6c000 0x2000>, @@ -2872,7 +2870,7 @@ interrupt-names = "tx", "rx"; dmas = <&edma_xbar 135 1>, <&edma_xbar 134 1>; dma-names = "tx", "rx"; - clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP4_CLKCTRL 22>, + clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP4_CLKCTRL 0>, <&l4per2_clkctrl DRA7_L4PER2_MCASP4_CLKCTRL 24>; clock-names = "fck", "ahclkx"; status = "disabled"; @@ -2890,9 +2888,8 @@ ; /* Domains (P, C): l4per_pwrdm, l4per2_clkdm */ clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP5_CLKCTRL 0>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP5_CLKCTRL 24>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP5_CLKCTRL 28>; - clock-names = "fck", "ahclkx", "ahclkr"; + <&l4per2_clkctrl DRA7_L4PER2_MCASP5_CLKCTRL 24>; + clock-names = "fck", "ahclkx"; #address-cells = <1>; #size-cells = <1>; ranges = <0x0 0x70000 0x2000>, @@ -2908,7 +2905,7 @@ interrupt-names = "tx", "rx"; dmas = <&edma_xbar 137 1>, <&edma_xbar 136 1>; dma-names = "tx", "rx"; - clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP5_CLKCTRL 22>, + clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP5_CLKCTRL 0>, <&l4per2_clkctrl DRA7_L4PER2_MCASP5_CLKCTRL 24>; clock-names = "fck", "ahclkx"; status = "disabled"; @@ -2926,9 +2923,8 @@ ; /* Domains (P, C): l4per_pwrdm, l4per2_clkdm */ clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP6_CLKCTRL 0>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP6_CLKCTRL 24>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP6_CLKCTRL 28>; - clock-names = "fck", "ahclkx", "ahclkr"; + <&l4per2_clkctrl DRA7_L4PER2_MCASP6_CLKCTRL 24>; + clock-names = "fck", "ahclkx"; #address-cells = <1>; #size-cells = <1>; ranges = <0x0 0x74000 0x2000>, @@ -2944,7 +2940,7 @@ interrupt-names = "tx", "rx"; dmas = <&edma_xbar 139 1>, <&edma_xbar 138 1>; dma-names = "tx", "rx"; - clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP6_CLKCTRL 22>, + clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP6_CLKCTRL 0>, <&l4per2_clkctrl DRA7_L4PER2_MCASP6_CLKCTRL 24>; clock-names = "fck", "ahclkx"; status = "disabled"; @@ -2962,9 +2958,8 @@ ; /* Domains (P, C): l4per_pwrdm, l4per2_clkdm */ clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP7_CLKCTRL 0>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP7_CLKCTRL 24>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP7_CLKCTRL 28>; - clock-names = "fck", "ahclkx", "ahclkr"; + <&l4per2_clkctrl DRA7_L4PER2_MCASP7_CLKCTRL 24>; + clock-names = "fck", "ahclkx"; #address-cells = <1>; #size-cells = <1>; ranges = <0x0 0x78000 0x2000>, @@ -2980,7 +2975,7 @@ interrupt-names = "tx", "rx"; dmas = <&edma_xbar 141 1>, <&edma_xbar 140 1>; dma-names = "tx", "rx"; - clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP7_CLKCTRL 22>, + clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP7_CLKCTRL 0>, <&l4per2_clkctrl DRA7_L4PER2_MCASP7_CLKCTRL 24>; clock-names = "fck", "ahclkx"; status = "disabled"; @@ -2998,9 +2993,8 @@ ; /* Domains (P, C): l4per_pwrdm, l4per2_clkdm */ clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP8_CLKCTRL 0>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP8_CLKCTRL 24>, - <&l4per2_clkctrl DRA7_L4PER2_MCASP8_CLKCTRL 28>; - clock-names = "fck", "ahclkx", "ahclkr"; + <&l4per2_clkctrl DRA7_L4PER2_MCASP8_CLKCTRL 24>; + clock-names = "fck", "ahclkx"; #address-cells = <1>; #size-cells = <1>; ranges = <0x0 0x7c000 0x2000>, @@ -3016,7 +3010,7 @@ interrupt-names = "tx", "rx"; dmas = <&edma_xbar 143 1>, <&edma_xbar 142 1>; dma-names = "tx", "rx"; - clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP8_CLKCTRL 22>, + clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP8_CLKCTRL 0>, <&l4per2_clkctrl DRA7_L4PER2_MCASP8_CLKCTRL 24>; clock-names = "fck", "ahclkx"; status = "disabled"; From f1f028ff89cb0d37db299d48e7b2ce19be040d52 Mon Sep 17 00:00:00 2001 From: "H. Nikolaus Schaller" Date: Fri, 20 Sep 2019 18:11:15 +0200 Subject: [PATCH 0800/2880] DTS: ARM: gta04: introduce legacy spi-cs-high to make display work again commit 6953c57ab172 "gpio: of: Handle SPI chipselect legacy bindings" did introduce logic to centrally handle the legacy spi-cs-high property in combination with cs-gpios. This assumes that the polarity of the CS has to be inverted if spi-cs-high is missing, even and especially if non-legacy GPIO_ACTIVE_HIGH is specified. The DTS for the GTA04 was orginally introduced under the assumption that there is no need for spi-cs-high if the gpio is defined with proper polarity GPIO_ACTIVE_HIGH. This was not a problem until gpiolib changed the interpretation of GPIO_ACTIVE_HIGH and missing spi-cs-high. The effect is that the missing spi-cs-high is now interpreted as CS being low (despite GPIO_ACTIVE_HIGH) which turns off the SPI interface when the panel is to be programmed by the panel driver. Therefore, we have to add the redundant and legacy spi-cs-high property to properly activate CS. Cc: stable@vger.kernel.org Signed-off-by: H. Nikolaus Schaller Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/omap3-gta04.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/omap3-gta04.dtsi b/arch/arm/boot/dts/omap3-gta04.dtsi index b295f6fad2a5..954c216140ad 100644 --- a/arch/arm/boot/dts/omap3-gta04.dtsi +++ b/arch/arm/boot/dts/omap3-gta04.dtsi @@ -120,6 +120,7 @@ spi-max-frequency = <100000>; spi-cpol; spi-cpha; + spi-cs-high; backlight= <&backlight>; label = "lcd"; From ca14c996afe7228ff9b480cf225211cc17212688 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 23 Sep 2019 13:17:54 -0400 Subject: [PATCH 0801/2880] x86/purgatory: Disable the stackleak GCC plugin for the purgatory Since commit: b059f801a937 ("x86/purgatory: Use CFLAGS_REMOVE rather than reset KBUILD_CFLAGS") kexec breaks if GCC_PLUGIN_STACKLEAK=y is enabled, as the purgatory contains undefined references to stackleak_track_stack. Attempting to load a kexec kernel results in this failure: kexec: Undefined symbol: stackleak_track_stack kexec-bzImage64: Loading purgatory failed Fix this by disabling the stackleak plugin for the purgatory. Signed-off-by: Arvind Sankar Reviewed-by: Nick Desaulniers Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: b059f801a937 ("x86/purgatory: Use CFLAGS_REMOVE rather than reset KBUILD_CFLAGS") Link: https://lkml.kernel.org/r/20190923171753.GA2252517@rani.riverdale.lan Signed-off-by: Ingo Molnar --- arch/x86/purgatory/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 10fb42da0007..b81b5172cf99 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -23,6 +23,7 @@ KCOV_INSTRUMENT := n PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss +PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That # in turn leaves some undefined symbols like __fentry__ in purgatory and not From 04e0e1777a792223897b8c21eb4e0a5b2624d9df Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Mon, 23 Sep 2019 11:25:31 -0700 Subject: [PATCH 0802/2880] ARM: omap2plus_defconfig: Enable DRM_TI_TFP410 The TFP410 driver was removed but the replacement driver was never enabled. This patch enableds the DRM_TI_TFP410 Fixes: be3143d8b27f ("drm/omap: Remove TFP410 and DVI connector drivers") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/configs/omap2plus_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 64eb896907bf..ab046eb41c7a 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -364,6 +364,7 @@ CONFIG_DRM_OMAP_PANEL_TPO_TD043MTEA1=m CONFIG_DRM_OMAP_PANEL_NEC_NL8048HL11=m CONFIG_DRM_TILCDC=m CONFIG_DRM_PANEL_SIMPLE=m +CONFIG_DRM_TI_TFP410=m CONFIG_FB=y CONFIG_FIRMWARE_EDID=y CONFIG_FB_MODE_HELPERS=y From 1d70ded8567ccf70353ff97e3b13f267c889f934 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Sun, 22 Sep 2019 10:57:40 -0700 Subject: [PATCH 0803/2880] ARM: omap2plus_defconfig: Enable more droid4 devices as loadable modules Droid4 needs USB option serial driver for modem, and lm3532 for the LCD backlight. Note that the LCD backlight does not yet get enabled automatically, but needs to be done manually with: # echo 50 > /sys/class/leds/lm3532::backlight/brightness Signed-off-by: Tony Lindgren --- arch/arm/configs/omap2plus_defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index ab046eb41c7a..ae326572c62a 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -424,6 +424,7 @@ CONFIG_USB_SERIAL_GENERIC=y CONFIG_USB_SERIAL_SIMPLE=m CONFIG_USB_SERIAL_FTDI_SIO=m CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OPTION=m CONFIG_USB_TEST=m CONFIG_NOP_USB_XCEIV=m CONFIG_AM335X_PHY_USB=m @@ -461,6 +462,7 @@ CONFIG_MMC_SDHCI_OMAP=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=m CONFIG_LEDS_CPCAP=m +CONFIG_LEDS_LM3532=m CONFIG_LEDS_GPIO=m CONFIG_LEDS_PCA963X=m CONFIG_LEDS_PWM=m From e20e174ca1bd98241b42d5ccfa228d8c6522e4e7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 18 Sep 2019 17:50:45 -0700 Subject: [PATCH 0804/2880] xfs: convert inode to extent format after extent merge due to shift The collapse range operation can merge extents if two newly adjacent extents are physically contiguous. If the extent count is reduced on a btree format inode, a change to extent format might be necessary. This format change currently occurs as a side effect of the file size update after extents have been shifted for the collapse. This codepath ultimately calls xfs_bunmapi(), which happens to check for and execute the format conversion even if there were no blocks removed from the mapping. While this ultimately puts the inode into the correct state, the fact the format conversion occurs in a separate transaction from the change that called for it is a problem. If an extent shift transaction commits and the filesystem happens to crash before the format conversion, the inode fork is left in a corrupted state after log recovery. The inode fork verifier fails and xfs_repair ultimately nukes the inode. This problem was originally reproduced by generic/388. Similar to how the insert range extent split code handles extent to btree conversion, update the collapse range extent merge code to handle btree to extent format conversion in the same transaction that merges the extents. This ensures that the inode fork format remains consistent if the filesystem happens to crash in the middle of a collapse range operation that changes the inode fork format. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 054b4ce30033..eaf2d4250a26 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5621,6 +5621,11 @@ xfs_bmse_merge( if (error) return error; + /* change to extent format if required after extent removal */ + error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork); + if (error) + return error; + done: xfs_iext_remove(ip, icur, 0); xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); From 583e4eff98fab8d4d3a44114b44408b6a4ad0737 Mon Sep 17 00:00:00 2001 From: Aliasgar Surti Date: Mon, 23 Sep 2019 13:00:56 -0700 Subject: [PATCH 0805/2880] xfs: removed unneeded variable Returned value directly instead of using variable as it wasn't updated. Signed-off-by: Aliasgar Surti Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index a43d1813c4ff..5533e48e605d 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -97,7 +97,6 @@ xchk_allocbt_rec( xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t bno; xfs_extlen_t len; - int error = 0; bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); @@ -109,7 +108,7 @@ xchk_allocbt_rec( xchk_allocbt_xref(bs->sc, bno, len); - return error; + return 0; } /* Scrub the freespace btrees for some AG. */ From ce840429260a98bcfe4aaf487bb07fa346d86c41 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 23 Sep 2019 13:02:41 -0700 Subject: [PATCH 0806/2880] xfs: revert 1baa2800e62d ("xfs: remove the unused XFS_ALLOC_USERDATA flag") Revert this commit, as it caused periodic regressions in xfs/173 w/ 1k blocks. [1] https://lore.kernel.org/lkml/20190919014602.GN15734@shao2-debian/ Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_alloc.h | 7 ++++--- fs/xfs/libxfs/xfs_bmap.c | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 58fa85cec325..d6ed5d2c07c2 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -81,9 +81,10 @@ typedef struct xfs_alloc_arg { /* * Defines for datatype */ -#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */ -#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */ -#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ +#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ +#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ static inline bool xfs_alloc_is_userdata(int datatype) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index eaf2d4250a26..4edc25a2ba80 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4042,8 +4042,12 @@ xfs_bmapi_allocate( */ if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK && bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + if (whichfork == XFS_DATA_FORK) { + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + else + bma->datatype |= XFS_ALLOC_USERDATA; + } if (bma->flags & XFS_BMAPI_ZERO) bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } From 83a63072c815e8a042c60fa964dcbde2a6df0e87 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Aug 2019 13:03:11 -0400 Subject: [PATCH 0807/2880] nfsd: fix nfs read eof detection Currently, the knfsd server assumes that a short read indicates an end of file. That assumption is incorrect. The short read means that either we've hit the end of file, or we've hit a read error. In the case of a read error, the client may want to retry (as per the implementation recommendations in RFC1813 and RFC7530), but currently it is being told that it hit an eof. Move the code to detect eof from version specific code into the generic nfsd read. Report eof only in the two following cases: 1) read() returns a zero length short read with no error. 2) the offset+length of the read is >= the file size. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 9 ++------- fs/nfsd/nfs4xdr.c | 11 +++-------- fs/nfsd/nfsproc.c | 4 +++- fs/nfsd/vfs.c | 37 ++++++++++++++++++++++++++----------- fs/nfsd/vfs.h | 28 ++++++---------------------- fs/nfsd/xdr3.h | 2 +- 6 files changed, 41 insertions(+), 50 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9bc32af4e2da..cea68d8411ac 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -172,13 +172,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); - if (nfserr == 0) { - struct inode *inode = d_inode(resp->fh.fh_dentry); - resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset, - inode->i_size); - } - + &resp->count, + &resp->eof); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c1fc2641e3e7..533d0fc3c96b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3472,7 +3472,7 @@ static __be32 nfsd4_encode_splice_read( len = maxcount; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, - file, read->rd_offset, &maxcount); + file, read->rd_offset, &maxcount, &eof); read->rd_length = maxcount; if (nfserr) { /* @@ -3484,9 +3484,6 @@ static __be32 nfsd4_encode_splice_read( return nfserr; } - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - *(p++) = htonl(eof); *(p++) = htonl(maxcount); @@ -3557,15 +3554,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, len = maxcount; nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, &maxcount); + resp->rqstp->rq_vec, read->rd_vlen, &maxcount, + &eof); read->rd_length = maxcount; if (nfserr) return nfserr; xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); tmp = htonl(maxcount); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0d20fd161225..c83ddac22f38 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -172,6 +172,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; __be32 nfserr; + u32 eof; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -195,7 +196,8 @@ nfsd_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); + &resp->count, + &eof); if (nfserr) return nfserr; return fh_getattr(&resp->fh, &resp->stat); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0867d5319fdb..bd0a385df3fc 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -834,12 +834,23 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } +static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len, + size_t expected) +{ + if (expected != 0 && len == 0) + return 1; + if (offset+len >= i_size_read(file_inode(file))) + return 1; + return 0; +} + static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count, int host_err) + unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { nfsdstats.io_read += host_err; + *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); trace_nfsd_read_io_done(rqstp, fhp, offset, *count); @@ -851,7 +862,8 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, unsigned long *count) + struct file *file, loff_t offset, unsigned long *count, + u32 *eof) { struct splice_desc sd = { .len = 0, @@ -859,25 +871,27 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, .pos = offset, .u.data = rqstp, }; - int host_err; + ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *count) + struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { struct iov_iter iter; - int host_err; + loff_t ppos = offset; + ssize_t host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); iov_iter_kvec(&iter, READ, vec, vlen, *count); - host_err = vfs_iter_read(file, &iter, &offset, 0); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + host_err = vfs_iter_read(file, &iter, &ppos, 0); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } /* @@ -984,7 +998,8 @@ out_nfserr: * N.B. After this call fhp needs an fh_put */ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) + loff_t offset, struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { struct nfsd_file *nf; struct file *file; @@ -997,9 +1012,9 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, file = nf->nf_file; if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - err = nfsd_splice_read(rqstp, fhp, file, offset, count); + err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); + err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof); nfsd_file_put(nf); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e0f7792165a6..a13fd9d7e1f5 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -80,13 +80,16 @@ __be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, struct kvec *, int, unsigned long *); + loff_t, struct kvec *, int, unsigned long *, + u32 *eof); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *, int); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -149,23 +152,4 @@ static inline int nfsd_create_is_exclusive(int createmode) || createmode == NFS4_CREATE_EXCLUSIVE4_1; } -static inline bool nfsd_eof_on_read(long requested, long read, - loff_t offset, loff_t size) -{ - /* We assume a short read means eof: */ - if (requested > read) - return true; - /* - * A non-short read might also reach end of file. The spec - * still requires us to set eof in that case. - * - * Further operations may have modified the file size since - * the read, so the following check is not atomic with the read. - * We've only seen that cause a problem for a client in the case - * where the read returned a count of 0 without setting eof. - * That case was fixed by the addition of the above check. - */ - return (offset + read >= size); -} - #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 2cb29e961a76..99ff9f403ff1 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -151,7 +151,7 @@ struct nfsd3_readres { __be32 status; struct svc_fh fh; unsigned long count; - int eof; + __u32 eof; }; struct nfsd3_writeres { From 0ec64895b05269c1814ea5befcadcd1184a12915 Mon Sep 17 00:00:00 2001 From: John Pittman Date: Tue, 17 Sep 2019 14:52:00 -0400 Subject: [PATCH 0808/2880] nvmet: change ppl to lpp In nvmet_bdev_set_limits() the number of logical blocks per physical block is calculated, but the opposite is mentioned in the associated comment and reflected in the variable name. Correct the comment and adjust the variable name to reflect the calculation done. Signed-off-by: John Pittman Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/target/io-cmd-bdev.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index de0bff70ebb6..32008d85172b 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -11,10 +11,10 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) { const struct queue_limits *ql = &bdev_get_queue(bdev)->limits; - /* Number of physical blocks per logical block. */ - const u32 ppl = ql->physical_block_size / ql->logical_block_size; - /* Physical blocks per logical block, 0's based. */ - const __le16 ppl0b = to0based(ppl); + /* Number of logical blocks per physical block. */ + const u32 lpp = ql->physical_block_size / ql->logical_block_size; + /* Logical blocks per physical block, 0's based. */ + const __le16 lpp0b = to0based(lpp); /* * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, @@ -25,9 +25,9 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) * field from the identify controller data structure should be used. */ id->nsfeat |= 1 << 1; - id->nawun = ppl0b; - id->nawupf = ppl0b; - id->nacwu = ppl0b; + id->nawun = lpp0b; + id->nawupf = lpp0b; + id->nacwu = lpp0b; /* * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and @@ -36,7 +36,7 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) */ id->nsfeat |= 1 << 4; /* NPWG = Namespace Preferred Write Granularity. 0's based */ - id->npwg = ppl0b; + id->npwg = lpp0b; /* NPWA = Namespace Preferred Write Alignment. 0's based */ id->npwa = id->npwg; /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ From b224726de5e496dbf78147a66755c3d81e28bdd2 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 18 Sep 2019 00:27:20 +0000 Subject: [PATCH 0809/2880] nvme-pci: Fix a race in controller removal User space programs like udevd may try to read to partitions at the same time the driver detects a namespace is unusable, and may deadlock if revalidate_disk() is called while such a process is waiting to enter the frozen queue. On detecting a dead namespace, move the disk revalidate after unblocking dispatchers that may be holding bd_butex. changelog Suggested-by: Keith Busch Signed-off-by: Balbir Singh Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 108f60b46804..0c385b1994fe 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,10 +102,13 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) */ if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; - revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); /* Forcibly unquiesce queues to avoid blocking dispatch */ blk_mq_unquiesce_queue(ns->queue); + /* + * Revalidate after unblocking dispatchers that may be holding bd_butex + */ + revalidate_disk(ns->disk); } static void nvme_queue_scan(struct nvme_ctrl *ctrl) From fb629fa2587d0c150792d87e3053664bfc8dc78c Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Fri, 20 Sep 2019 15:02:11 +0200 Subject: [PATCH 0810/2880] ASoC: samsung: arndale: Add missing OF node dereferencing Ensure there is no OF node references kept when the driver is removed/unbound. Reviewed-by: Charles Keepax Signed-off-by: Sylwester Nawrocki Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20190920130218.32690-3-s.nawrocki@samsung.com Signed-off-by: Mark Brown --- sound/soc/samsung/arndale_rt5631.c | 34 ++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/sound/soc/samsung/arndale_rt5631.c b/sound/soc/samsung/arndale_rt5631.c index c213913eb984..fd8c6642fb0d 100644 --- a/sound/soc/samsung/arndale_rt5631.c +++ b/sound/soc/samsung/arndale_rt5631.c @@ -5,6 +5,7 @@ // Author: Claude #include +#include #include #include @@ -74,6 +75,17 @@ static struct snd_soc_card arndale_rt5631 = { .num_links = ARRAY_SIZE(arndale_rt5631_dai), }; +static void arndale_put_of_nodes(struct snd_soc_card *card) +{ + struct snd_soc_dai_link *dai_link; + int i; + + for_each_card_prelinks(card, i, dai_link) { + of_node_put(dai_link->cpus->of_node); + of_node_put(dai_link->codecs->of_node); + } +} + static int arndale_audio_probe(struct platform_device *pdev) { int n, ret; @@ -103,18 +115,31 @@ static int arndale_audio_probe(struct platform_device *pdev) if (!arndale_rt5631_dai[0].codecs->of_node) { dev_err(&pdev->dev, "Property 'samsung,audio-codec' missing or invalid\n"); - return -EINVAL; + ret = -EINVAL; + goto err_put_of_nodes; } } ret = devm_snd_soc_register_card(card->dev, card); + if (ret) { + dev_err(&pdev->dev, "snd_soc_register_card() failed: %d\n", ret); + goto err_put_of_nodes; + } + return 0; - if (ret) - dev_err(&pdev->dev, "snd_soc_register_card() failed:%d\n", ret); - +err_put_of_nodes: + arndale_put_of_nodes(card); return ret; } +static int arndale_audio_remove(struct platform_device *pdev) +{ + struct snd_soc_card *card = platform_get_drvdata(pdev); + + arndale_put_of_nodes(card); + return 0; +} + static const struct of_device_id samsung_arndale_rt5631_of_match[] __maybe_unused = { { .compatible = "samsung,arndale-rt5631", }, { .compatible = "samsung,arndale-alc5631", }, @@ -129,6 +154,7 @@ static struct platform_driver arndale_audio_driver = { .of_match_table = of_match_ptr(samsung_arndale_rt5631_of_match), }, .probe = arndale_audio_probe, + .remove = arndale_audio_remove, }; module_platform_driver(arndale_audio_driver); From ca2347190adb5e4eece73a2b16e96e651c46246b Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Fri, 20 Sep 2019 15:02:10 +0200 Subject: [PATCH 0811/2880] ASoC: wm8994: Do not register inapplicable controls for WM1811 In case of WM1811 device there are currently being registered controls referring to registers not existing on that device. It has been noticed when getting values of "AIF1ADC2 Volume", "AIF1DAC2 Volume" controls was failing during ALSA state restoring at boot time: "amixer: Mixer hw:0 load error: Device or resource busy" Reading some registers through I2C was failing with EBUSY error and indeed these registers were not available according to the datasheet. To fix this controls not available on WM1811 are moved to a separate array and registered only for WM8994 and WM8958. There are some further differences between WM8994 and WM1811, e.g. registers 603h, 604h, 605h, which are not covered in this patch. Acked-by: Charles Keepax Acked-by: Krzysztof Kozlowski Signed-off-by: Sylwester Nawrocki Link: https://lore.kernel.org/r/20190920130218.32690-2-s.nawrocki@samsung.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm8994.c | 43 +++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index c3d06e8bc54f..d5fb7f5dd551 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -533,13 +533,10 @@ static SOC_ENUM_SINGLE_DECL(dac_osr, static SOC_ENUM_SINGLE_DECL(adc_osr, WM8994_OVERSAMPLING, 1, osr_text); -static const struct snd_kcontrol_new wm8994_snd_controls[] = { +static const struct snd_kcontrol_new wm8994_common_snd_controls[] = { SOC_DOUBLE_R_TLV("AIF1ADC1 Volume", WM8994_AIF1_ADC1_LEFT_VOLUME, WM8994_AIF1_ADC1_RIGHT_VOLUME, 1, 119, 0, digital_tlv), -SOC_DOUBLE_R_TLV("AIF1ADC2 Volume", WM8994_AIF1_ADC2_LEFT_VOLUME, - WM8994_AIF1_ADC2_RIGHT_VOLUME, - 1, 119, 0, digital_tlv), SOC_DOUBLE_R_TLV("AIF2ADC Volume", WM8994_AIF2_ADC_LEFT_VOLUME, WM8994_AIF2_ADC_RIGHT_VOLUME, 1, 119, 0, digital_tlv), @@ -556,8 +553,6 @@ SOC_ENUM("AIF2DACR Source", aif2dacr_src), SOC_DOUBLE_R_TLV("AIF1DAC1 Volume", WM8994_AIF1_DAC1_LEFT_VOLUME, WM8994_AIF1_DAC1_RIGHT_VOLUME, 1, 96, 0, digital_tlv), -SOC_DOUBLE_R_TLV("AIF1DAC2 Volume", WM8994_AIF1_DAC2_LEFT_VOLUME, - WM8994_AIF1_DAC2_RIGHT_VOLUME, 1, 96, 0, digital_tlv), SOC_DOUBLE_R_TLV("AIF2DAC Volume", WM8994_AIF2_DAC_LEFT_VOLUME, WM8994_AIF2_DAC_RIGHT_VOLUME, 1, 96, 0, digital_tlv), @@ -565,17 +560,12 @@ SOC_SINGLE_TLV("AIF1 Boost Volume", WM8994_AIF1_CONTROL_2, 10, 3, 0, aif_tlv), SOC_SINGLE_TLV("AIF2 Boost Volume", WM8994_AIF2_CONTROL_2, 10, 3, 0, aif_tlv), SOC_SINGLE("AIF1DAC1 EQ Switch", WM8994_AIF1_DAC1_EQ_GAINS_1, 0, 1, 0), -SOC_SINGLE("AIF1DAC2 EQ Switch", WM8994_AIF1_DAC2_EQ_GAINS_1, 0, 1, 0), SOC_SINGLE("AIF2 EQ Switch", WM8994_AIF2_EQ_GAINS_1, 0, 1, 0), WM8994_DRC_SWITCH("AIF1DAC1 DRC Switch", WM8994_AIF1_DRC1_1, 2), WM8994_DRC_SWITCH("AIF1ADC1L DRC Switch", WM8994_AIF1_DRC1_1, 1), WM8994_DRC_SWITCH("AIF1ADC1R DRC Switch", WM8994_AIF1_DRC1_1, 0), -WM8994_DRC_SWITCH("AIF1DAC2 DRC Switch", WM8994_AIF1_DRC2_1, 2), -WM8994_DRC_SWITCH("AIF1ADC2L DRC Switch", WM8994_AIF1_DRC2_1, 1), -WM8994_DRC_SWITCH("AIF1ADC2R DRC Switch", WM8994_AIF1_DRC2_1, 0), - WM8994_DRC_SWITCH("AIF2DAC DRC Switch", WM8994_AIF2_DRC_1, 2), WM8994_DRC_SWITCH("AIF2ADCL DRC Switch", WM8994_AIF2_DRC_1, 1), WM8994_DRC_SWITCH("AIF2ADCR DRC Switch", WM8994_AIF2_DRC_1, 0), @@ -594,9 +584,6 @@ SOC_SINGLE("Sidetone HPF Switch", WM8994_SIDETONE, 6, 1, 0), SOC_ENUM("AIF1ADC1 HPF Mode", aif1adc1_hpf), SOC_DOUBLE("AIF1ADC1 HPF Switch", WM8994_AIF1_ADC1_FILTERS, 12, 11, 1, 0), -SOC_ENUM("AIF1ADC2 HPF Mode", aif1adc2_hpf), -SOC_DOUBLE("AIF1ADC2 HPF Switch", WM8994_AIF1_ADC2_FILTERS, 12, 11, 1, 0), - SOC_ENUM("AIF2ADC HPF Mode", aif2adc_hpf), SOC_DOUBLE("AIF2ADC HPF Switch", WM8994_AIF2_ADC_FILTERS, 12, 11, 1, 0), @@ -637,6 +624,24 @@ SOC_SINGLE("AIF2DAC 3D Stereo Switch", WM8994_AIF2_DAC_FILTERS_2, 8, 1, 0), }; +/* Controls not available on WM1811 */ +static const struct snd_kcontrol_new wm8994_snd_controls[] = { +SOC_DOUBLE_R_TLV("AIF1ADC2 Volume", WM8994_AIF1_ADC2_LEFT_VOLUME, + WM8994_AIF1_ADC2_RIGHT_VOLUME, + 1, 119, 0, digital_tlv), +SOC_DOUBLE_R_TLV("AIF1DAC2 Volume", WM8994_AIF1_DAC2_LEFT_VOLUME, + WM8994_AIF1_DAC2_RIGHT_VOLUME, 1, 96, 0, digital_tlv), + +SOC_SINGLE("AIF1DAC2 EQ Switch", WM8994_AIF1_DAC2_EQ_GAINS_1, 0, 1, 0), + +WM8994_DRC_SWITCH("AIF1DAC2 DRC Switch", WM8994_AIF1_DRC2_1, 2), +WM8994_DRC_SWITCH("AIF1ADC2L DRC Switch", WM8994_AIF1_DRC2_1, 1), +WM8994_DRC_SWITCH("AIF1ADC2R DRC Switch", WM8994_AIF1_DRC2_1, 0), + +SOC_ENUM("AIF1ADC2 HPF Mode", aif1adc2_hpf), +SOC_DOUBLE("AIF1ADC2 HPF Switch", WM8994_AIF1_ADC2_FILTERS, 12, 11, 1, 0), +}; + static const struct snd_kcontrol_new wm8994_eq_controls[] = { SOC_SINGLE_TLV("AIF1DAC1 EQ1 Volume", WM8994_AIF1_DAC1_EQ_GAINS_1, 11, 31, 0, eq_tlv), @@ -4258,13 +4263,15 @@ static int wm8994_component_probe(struct snd_soc_component *component) wm8994_handle_pdata(wm8994); wm_hubs_add_analogue_controls(component); - snd_soc_add_component_controls(component, wm8994_snd_controls, - ARRAY_SIZE(wm8994_snd_controls)); + snd_soc_add_component_controls(component, wm8994_common_snd_controls, + ARRAY_SIZE(wm8994_common_snd_controls)); snd_soc_dapm_new_controls(dapm, wm8994_dapm_widgets, ARRAY_SIZE(wm8994_dapm_widgets)); switch (control->type) { case WM8994: + snd_soc_add_component_controls(component, wm8994_snd_controls, + ARRAY_SIZE(wm8994_snd_controls)); snd_soc_dapm_new_controls(dapm, wm8994_specific_dapm_widgets, ARRAY_SIZE(wm8994_specific_dapm_widgets)); if (control->revision < 4) { @@ -4284,8 +4291,10 @@ static int wm8994_component_probe(struct snd_soc_component *component) } break; case WM8958: + snd_soc_add_component_controls(component, wm8994_snd_controls, + ARRAY_SIZE(wm8994_snd_controls)); snd_soc_add_component_controls(component, wm8958_snd_controls, - ARRAY_SIZE(wm8958_snd_controls)); + ARRAY_SIZE(wm8958_snd_controls)); snd_soc_dapm_new_controls(dapm, wm8958_dapm_widgets, ARRAY_SIZE(wm8958_dapm_widgets)); if (control->revision < 1) { From 901e822b2e365dac4727e0ddffb444a2554b0a89 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 23 Sep 2019 17:22:57 +0300 Subject: [PATCH 0812/2880] ASoC: soc-component: fix a couple missing error assignments There were a couple places where the return value wasn't assigned so the error handling wouldn't trigger. Fixes: 5c0769af4caf ("ASoC: soc-dai: add snd_soc_dai_bespoke_trigger()") Fixes: 95aef3553384 ("ASoC: soc-dai: add snd_soc_dai_trigger()") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20190923142257.GB31251@mwanda Signed-off-by: Mark Brown --- sound/soc/soc-pcm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index e163dde5eab1..a1b99ac57d9e 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -1070,7 +1070,7 @@ static int soc_pcm_trigger(struct snd_pcm_substream *substream, int cmd) return ret; } - snd_soc_dai_trigger(cpu_dai, substream, cmd); + ret = snd_soc_dai_trigger(cpu_dai, substream, cmd); if (ret < 0) return ret; @@ -1097,7 +1097,7 @@ static int soc_pcm_bespoke_trigger(struct snd_pcm_substream *substream, return ret; } - snd_soc_dai_bespoke_trigger(cpu_dai, substream, cmd); + ret = snd_soc_dai_bespoke_trigger(cpu_dai, substream, cmd); if (ret < 0) return ret; From 1d6db22ff7d67a17c571543c69c63b1d261249b0 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Sun, 22 Sep 2019 10:29:28 +0800 Subject: [PATCH 0813/2880] regulator: fixed: Prevent NULL pointer dereference when !CONFIG_OF Use of_device_get_match_data which has NULL test for match before dereference match->data. Add NULL test for drvtype so it still works for fixed_voltage_ops when !CONFIG_OF. Signed-off-by: Axel Lin Reviewed-by: Philippe Schenker Link: https://lore.kernel.org/r/20190922022928.28355-1-axel.lin@ingics.com Signed-off-by: Mark Brown --- drivers/regulator/fixed.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c index d90a6fd8cbc7..f81533070058 100644 --- a/drivers/regulator/fixed.c +++ b/drivers/regulator/fixed.c @@ -144,8 +144,7 @@ static int reg_fixed_voltage_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct fixed_voltage_config *config; struct fixed_voltage_data *drvdata; - const struct fixed_dev_type *drvtype = - of_match_device(dev->driver->of_match_table, dev)->data; + const struct fixed_dev_type *drvtype = of_device_get_match_data(dev); struct regulator_config cfg = { }; enum gpiod_flags gflags; int ret; @@ -177,7 +176,7 @@ static int reg_fixed_voltage_probe(struct platform_device *pdev) drvdata->desc.type = REGULATOR_VOLTAGE; drvdata->desc.owner = THIS_MODULE; - if (drvtype->has_enable_clock) { + if (drvtype && drvtype->has_enable_clock) { drvdata->desc.ops = &fixed_voltage_clkenabled_ops; drvdata->enable_clock = devm_clk_get(dev, NULL); From ae89339b08f3fe02457ec9edd512ddc3d246d0f8 Mon Sep 17 00:00:00 2001 From: Sanjay R Mehta Date: Fri, 29 Mar 2019 11:32:50 +0000 Subject: [PATCH 0814/2880] ntb: point to right memory window index second parameter of ntb_peer_mw_get_addr is pointing to wrong memory window index by passing "peer gidx" instead of "local gidx". For ex, "local gidx" value is '0' and "peer gidx" value is '1', then on peer side ntb_mw_set_trans() api is used as below with gidx pointing to local side gidx which is '0', so memroy window '0' is chosen and XLAT '0' will be programmed by peer side. ntb_mw_set_trans(perf->ntb, peer->pidx, peer->gidx, peer->inbuf_xlat, peer->inbuf_size); Now, on local side ntb_peer_mw_get_addr() is been used as below with gidx pointing to "peer gidx" which is '1', so pointing to memory window '1' instead of memory window '0'. ntb_peer_mw_get_addr(perf->ntb, peer->gidx, &phys_addr, &peer->outbuf_size); So this patch pass "local gidx" as parameter to ntb_peer_mw_get_addr(). Signed-off-by: Sanjay R Mehta Signed-off-by: Jon Mason --- drivers/ntb/test/ntb_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index d028331558ea..e9b7c2dfc730 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -1378,7 +1378,7 @@ static int perf_setup_peer_mw(struct perf_peer *peer) int ret; /* Get outbound MW parameters and map it */ - ret = ntb_peer_mw_get_addr(perf->ntb, peer->gidx, &phys_addr, + ret = ntb_peer_mw_get_addr(perf->ntb, perf->gidx, &phys_addr, &peer->outbuf_size); if (ret) return ret; From 58283636a5a03e64ad5d03bd282e3b66dcfa2c49 Mon Sep 17 00:00:00 2001 From: Philippe Schenker Date: Mon, 23 Sep 2019 08:18:49 +0000 Subject: [PATCH 0815/2880] dt-bindings: fixed-regulator: fix compatible enum Remove 'const:' in the compatible enum. This was breaking make dt_binding_check since it has more than one compatible string. Fixes: 9c86d003d620 ("dt-bindings: regulator: add regulator-fixed-clock binding") Signed-off-by: Philippe Schenker Link: https://lore.kernel.org/r/20190923081840.23391-1-philippe.schenker@toradex.com Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/fixed-regulator.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml index a78150c47aa2..f32416968197 100644 --- a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml @@ -30,8 +30,8 @@ if: properties: compatible: enum: - - const: regulator-fixed - - const: regulator-fixed-clock + - regulator-fixed + - regulator-fixed-clock regulator-name: true From c16c6655605f52cf2107a7c8dc0c798645351976 Mon Sep 17 00:00:00 2001 From: Alexander Fomichev Date: Tue, 16 Jul 2019 20:34:48 +0300 Subject: [PATCH 0816/2880] ntb_hw_switchtec: make ntb_mw_set_trans() work when addr == 0 On switchtec_ntb_mw_set_trans() call, when (only) address == 0, it acts as ntb_mw_clear_trans(). Fix this, since address == 0 and size != 0 is valid combination for setting translation. Signed-off-by: Alexander Fomichev Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason --- drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c index f4959458d909..86ffa716eaf2 100644 --- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c +++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c @@ -306,7 +306,7 @@ static int switchtec_ntb_mw_set_trans(struct ntb_dev *ntb, int pidx, int widx, if (rc) return rc; - if (addr == 0 || size == 0) { + if (size == 0) { if (widx < nr_direct_mw) switchtec_ntb_mw_clr_direct(sndev, widx); else From 5e2cbf13d0ec4fe2b6aabbeb6fe44830e8788dd0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 18 Aug 2019 19:53:49 +0100 Subject: [PATCH 0817/2880] NTB: ntb_transport: remove redundant assignment to rc Variable rc is initialized to a value that is never read and it is re-assigned later. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Jon Mason --- drivers/ntb/ntb_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index 40c90ca10729..00a5d5764993 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -292,7 +292,7 @@ static int ntb_transport_bus_match(struct device *dev, static int ntb_transport_bus_probe(struct device *dev) { const struct ntb_transport_client *client; - int rc = -EINVAL; + int rc; get_device(dev); From 5f59f6b182f75e5247104c8e0150c3d41f0a4c18 Mon Sep 17 00:00:00 2001 From: Sanjay R Mehta Date: Sun, 15 Sep 2019 17:07:43 +0000 Subject: [PATCH 0818/2880] ntb_hw_amd: Add a new NTB PCI device ID Signed-off-by: Sanjay R Mehta Signed-off-by: Jon Mason --- drivers/ntb/hw/amd/ntb_hw_amd.c | 3 ++- drivers/ntb/hw/amd/ntb_hw_amd.h | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.c b/drivers/ntb/hw/amd/ntb_hw_amd.c index 2859cc99b73e..e9286cf241f0 100644 --- a/drivers/ntb/hw/amd/ntb_hw_amd.c +++ b/drivers/ntb/hw/amd/ntb_hw_amd.c @@ -1124,7 +1124,8 @@ static const struct file_operations amd_ntb_debugfs_info = { }; static const struct pci_device_id amd_ntb_pci_tbl[] = { - {PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_NTB)}, + {PCI_VDEVICE(AMD, 0x145b)}, + {PCI_VDEVICE(AMD, 0x148b)}, {0} }; MODULE_DEVICE_TABLE(pci, amd_ntb_pci_tbl); diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.h b/drivers/ntb/hw/amd/ntb_hw_amd.h index 8f3617a46292..3aac994f3e77 100644 --- a/drivers/ntb/hw/amd/ntb_hw_amd.h +++ b/drivers/ntb/hw/amd/ntb_hw_amd.h @@ -52,7 +52,6 @@ #include #include -#define PCI_DEVICE_ID_AMD_NTB 0x145B #define AMD_LINK_HB_TIMEOUT msecs_to_jiffies(1000) #define AMD_LINK_STATUS_OFFSET 0x68 #define NTB_LIN_STA_ACTIVE_BIT 0x00000002 From a1472e73e3d791eb5eddeceb99f7dc5c17ca98ce Mon Sep 17 00:00:00 2001 From: Sanjay R Mehta Date: Sun, 15 Sep 2019 17:08:35 +0000 Subject: [PATCH 0819/2880] ntb_hw_amd: Add memory window support for new AMD hardware The AMD new hardware uses BAR23 and BAR45 as memory windows as compared to previos where BAR1, BAR23 and BAR45 is used for memory windows. This patch add support for both AMD hardwares. Signed-off-by: Sanjay R Mehta Signed-off-by: Jon Mason --- drivers/ntb/hw/amd/ntb_hw_amd.c | 23 ++++++++++++++++++----- drivers/ntb/hw/amd/ntb_hw_amd.h | 7 ++++++- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.c b/drivers/ntb/hw/amd/ntb_hw_amd.c index e9286cf241f0..156c2a18a239 100644 --- a/drivers/ntb/hw/amd/ntb_hw_amd.c +++ b/drivers/ntb/hw/amd/ntb_hw_amd.c @@ -78,7 +78,7 @@ static int ndev_mw_to_bar(struct amd_ntb_dev *ndev, int idx) if (idx < 0 || idx > ndev->mw_count) return -EINVAL; - return 1 << idx; + return ndev->dev_data->mw_idx << idx; } static int amd_ntb_mw_count(struct ntb_dev *ntb, int pidx) @@ -909,7 +909,7 @@ static int amd_init_ntb(struct amd_ntb_dev *ndev) { void __iomem *mmio = ndev->self_mmio; - ndev->mw_count = AMD_MW_CNT; + ndev->mw_count = ndev->dev_data->mw_count; ndev->spad_count = AMD_SPADS_CNT; ndev->db_count = AMD_DB_CNT; @@ -1069,6 +1069,8 @@ static int amd_ntb_pci_probe(struct pci_dev *pdev, goto err_ndev; } + ndev->dev_data = (struct ntb_dev_data *)id->driver_data; + ndev_init_struct(ndev, pdev); rc = amd_ntb_init_pci(ndev, pdev); @@ -1123,10 +1125,21 @@ static const struct file_operations amd_ntb_debugfs_info = { .read = ndev_debugfs_read, }; +static const struct ntb_dev_data dev_data[] = { + { /* for device 145b */ + .mw_count = 3, + .mw_idx = 1, + }, + { /* for device 148b */ + .mw_count = 2, + .mw_idx = 2, + }, +}; + static const struct pci_device_id amd_ntb_pci_tbl[] = { - {PCI_VDEVICE(AMD, 0x145b)}, - {PCI_VDEVICE(AMD, 0x148b)}, - {0} + { PCI_VDEVICE(AMD, 0x145b), (kernel_ulong_t)&dev_data[0] }, + { PCI_VDEVICE(AMD, 0x148b), (kernel_ulong_t)&dev_data[1] }, + { 0, } }; MODULE_DEVICE_TABLE(pci, amd_ntb_pci_tbl); diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.h b/drivers/ntb/hw/amd/ntb_hw_amd.h index 3aac994f3e77..139a307147bc 100644 --- a/drivers/ntb/hw/amd/ntb_hw_amd.h +++ b/drivers/ntb/hw/amd/ntb_hw_amd.h @@ -92,7 +92,6 @@ static inline void _write64(u64 val, void __iomem *mmio) enum { /* AMD NTB Capability */ - AMD_MW_CNT = 3, AMD_DB_CNT = 16, AMD_MSIX_VECTOR_CNT = 24, AMD_SPADS_CNT = 16, @@ -169,6 +168,11 @@ enum { AMD_PEER_OFFSET = 0x400, }; +struct ntb_dev_data { + const unsigned char mw_count; + const unsigned int mw_idx; +}; + struct amd_ntb_dev; struct amd_ntb_vec { @@ -184,6 +188,7 @@ struct amd_ntb_dev { u32 cntl_sta; u32 peer_sta; + struct ntb_dev_data *dev_data; unsigned char mw_count; unsigned char spad_count; unsigned char db_count; From 4720101fab62d0453babb0287b58a9c5bf78fb80 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 18 Sep 2019 13:58:31 -0700 Subject: [PATCH 0820/2880] NTB: fix IDT Kconfig typos/spellos Fix typos in drivers/ntb/hw/idt/Kconfig. Use consistent spelling and capitalization. Fixes: bf2a952d31d2 ("NTB: Add IDT 89HPESxNTx PCIe-switches support") Signed-off-by: Randy Dunlap Cc: Dave Jiang Cc: Allen Hubbe Cc: Serge Semin Signed-off-by: Jon Mason --- drivers/ntb/hw/idt/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ntb/hw/idt/Kconfig b/drivers/ntb/hw/idt/Kconfig index bfc7cac94102..c79b54c1747d 100644 --- a/drivers/ntb/hw/idt/Kconfig +++ b/drivers/ntb/hw/idt/Kconfig @@ -4,11 +4,11 @@ config NTB_IDT depends on PCI select HWMON help - This driver supports NTB of cappable IDT PCIe-switches. + This driver supports NTB of capable IDT PCIe-switches. Some of the pre-initializations must be made before IDT PCIe-switch - exposes it NT-functions correctly. It should be done by either proper - initialisation of EEPROM connected to master smbus of the switch or + exposes its NT-functions correctly. It should be done by either proper + initialization of EEPROM connected to master SMbus of the switch or by BIOS using slave-SMBus interface changing corresponding registers value. Evidently it must be done before PCI bus enumeration is finished in Linux kernel. From a72865f057820ea9f57597915da4b651d65eb92f Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Tue, 17 Sep 2019 14:42:42 +0200 Subject: [PATCH 0821/2880] regulator: da9062: fix suspend_enable/disable preparation Currently the suspend reg_field maps to the pmic voltage selection bits and is used during suspend_enabe/disable() and during get_mode(). This seems to be wrong for both use cases. Use case one (suspend_enabe/disable): Those callbacks are used to mark a regulator device as enabled/disabled during suspend. Marking the regulator enabled during suspend is done by the LDOx_CONF/BUCKx_CONF bit within the LDOx_CONT/BUCKx_CONT registers. Setting this bit tells the DA9062 PMIC state machine to keep the regulator on in POWERDOWN mode and switch to suspend voltage. Use case two (get_mode): The get_mode callback is used to retrieve the active mode state. Since the regulator-setting-A is used for the active state and regulator-setting-B for the suspend state there is no need to check which regulator setting is active. Fixes: 4068e5182ada ("regulator: da9062: DA9062 regulator driver") Signed-off-by: Marco Felsch Reviewed-by: Adam Thomson Link: https://lore.kernel.org/r/20190917124246.11732-2-m.felsch@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/da9062-regulator.c | 118 +++++++++++---------------- 1 file changed, 47 insertions(+), 71 deletions(-) diff --git a/drivers/regulator/da9062-regulator.c b/drivers/regulator/da9062-regulator.c index 56f3f72d7707..710e67081d53 100644 --- a/drivers/regulator/da9062-regulator.c +++ b/drivers/regulator/da9062-regulator.c @@ -136,7 +136,6 @@ static int da9062_buck_set_mode(struct regulator_dev *rdev, unsigned mode) static unsigned da9062_buck_get_mode(struct regulator_dev *rdev) { struct da9062_regulator *regl = rdev_get_drvdata(rdev); - struct regmap_field *field; unsigned int val, mode = 0; int ret; @@ -158,18 +157,7 @@ static unsigned da9062_buck_get_mode(struct regulator_dev *rdev) return REGULATOR_MODE_NORMAL; } - /* Detect current regulator state */ - ret = regmap_field_read(regl->suspend, &val); - if (ret < 0) - return 0; - - /* Read regulator mode from proper register, depending on state */ - if (val) - field = regl->suspend_sleep; - else - field = regl->sleep; - - ret = regmap_field_read(field, &val); + ret = regmap_field_read(regl->sleep, &val); if (ret < 0) return 0; @@ -208,21 +196,9 @@ static int da9062_ldo_set_mode(struct regulator_dev *rdev, unsigned mode) static unsigned da9062_ldo_get_mode(struct regulator_dev *rdev) { struct da9062_regulator *regl = rdev_get_drvdata(rdev); - struct regmap_field *field; int ret, val; - /* Detect current regulator state */ - ret = regmap_field_read(regl->suspend, &val); - if (ret < 0) - return 0; - - /* Read regulator mode from proper register, depending on state */ - if (val) - field = regl->suspend_sleep; - else - field = regl->sleep; - - ret = regmap_field_read(field, &val); + ret = regmap_field_read(regl->sleep, &val); if (ret < 0) return 0; @@ -408,10 +384,10 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = { __builtin_ffs((int)DA9062AA_BUCK1_MODE_MASK) - 1, sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_BUCK1_MODE_MASK)) - 1), - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VBUCK1_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_BUCK1_CONT, + __builtin_ffs((int)DA9062AA_BUCK1_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VBUCK1_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_BUCK1_CONF_MASK) - 1), }, { .desc.id = DA9061_ID_BUCK2, @@ -444,10 +420,10 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = { __builtin_ffs((int)DA9062AA_BUCK3_MODE_MASK) - 1, sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_BUCK3_MODE_MASK)) - 1), - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VBUCK3_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_BUCK3_CONT, + __builtin_ffs((int)DA9062AA_BUCK3_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VBUCK3_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_BUCK3_CONF_MASK) - 1), }, { .desc.id = DA9061_ID_BUCK3, @@ -480,10 +456,10 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = { __builtin_ffs((int)DA9062AA_BUCK4_MODE_MASK) - 1, sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_BUCK4_MODE_MASK)) - 1), - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VBUCK4_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_BUCK4_CONT, + __builtin_ffs((int)DA9062AA_BUCK4_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VBUCK4_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_BUCK4_CONF_MASK) - 1), }, { .desc.id = DA9061_ID_LDO1, @@ -509,10 +485,10 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = { sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_LDO1_SL_B_MASK)) - 1), .suspend_vsel_reg = DA9062AA_VLDO1_B, - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VLDO1_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_LDO1_CONT, + __builtin_ffs((int)DA9062AA_LDO1_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VLDO1_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_LDO1_CONF_MASK) - 1), .oc_event = REG_FIELD(DA9062AA_STATUS_D, __builtin_ffs((int)DA9062AA_LDO1_ILIM_MASK) - 1, sizeof(unsigned int) * 8 - @@ -542,10 +518,10 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = { sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_LDO2_SL_B_MASK)) - 1), .suspend_vsel_reg = DA9062AA_VLDO2_B, - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VLDO2_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_LDO2_CONT, + __builtin_ffs((int)DA9062AA_LDO2_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VLDO2_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_LDO2_CONF_MASK) - 1), .oc_event = REG_FIELD(DA9062AA_STATUS_D, __builtin_ffs((int)DA9062AA_LDO2_ILIM_MASK) - 1, sizeof(unsigned int) * 8 - @@ -575,10 +551,10 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = { sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_LDO3_SL_B_MASK)) - 1), .suspend_vsel_reg = DA9062AA_VLDO3_B, - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VLDO3_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_LDO3_CONT, + __builtin_ffs((int)DA9062AA_LDO3_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VLDO3_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_LDO3_CONF_MASK) - 1), .oc_event = REG_FIELD(DA9062AA_STATUS_D, __builtin_ffs((int)DA9062AA_LDO3_ILIM_MASK) - 1, sizeof(unsigned int) * 8 - @@ -608,10 +584,10 @@ static const struct da9062_regulator_info local_da9061_regulator_info[] = { sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_LDO4_SL_B_MASK)) - 1), .suspend_vsel_reg = DA9062AA_VLDO4_B, - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VLDO4_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_LDO4_CONT, + __builtin_ffs((int)DA9062AA_LDO4_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VLDO4_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_LDO4_CONF_MASK) - 1), .oc_event = REG_FIELD(DA9062AA_STATUS_D, __builtin_ffs((int)DA9062AA_LDO4_ILIM_MASK) - 1, sizeof(unsigned int) * 8 - @@ -652,10 +628,10 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = { __builtin_ffs((int)DA9062AA_BUCK1_MODE_MASK) - 1, sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_BUCK1_MODE_MASK)) - 1), - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VBUCK1_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_BUCK1_CONT, + __builtin_ffs((int)DA9062AA_BUCK1_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VBUCK1_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_BUCK1_CONF_MASK) - 1), }, { .desc.id = DA9062_ID_BUCK2, @@ -688,10 +664,10 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = { __builtin_ffs((int)DA9062AA_BUCK2_MODE_MASK) - 1, sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_BUCK2_MODE_MASK)) - 1), - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VBUCK2_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_BUCK2_CONT, + __builtin_ffs((int)DA9062AA_BUCK2_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VBUCK2_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_BUCK2_CONF_MASK) - 1), }, { .desc.id = DA9062_ID_BUCK3, @@ -724,10 +700,10 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = { __builtin_ffs((int)DA9062AA_BUCK3_MODE_MASK) - 1, sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_BUCK3_MODE_MASK)) - 1), - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VBUCK3_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_BUCK3_CONT, + __builtin_ffs((int)DA9062AA_BUCK3_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VBUCK3_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_BUCK3_CONF_MASK) - 1), }, { .desc.id = DA9062_ID_BUCK4, @@ -760,10 +736,10 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = { __builtin_ffs((int)DA9062AA_BUCK4_MODE_MASK) - 1, sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_BUCK4_MODE_MASK)) - 1), - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VBUCK4_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_BUCK4_CONT, + __builtin_ffs((int)DA9062AA_BUCK4_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VBUCK4_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_BUCK4_CONF_MASK) - 1), }, { .desc.id = DA9062_ID_LDO1, @@ -789,10 +765,10 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = { sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_LDO1_SL_B_MASK)) - 1), .suspend_vsel_reg = DA9062AA_VLDO1_B, - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VLDO1_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_LDO1_CONT, + __builtin_ffs((int)DA9062AA_LDO1_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VLDO1_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_LDO1_CONF_MASK) - 1), .oc_event = REG_FIELD(DA9062AA_STATUS_D, __builtin_ffs((int)DA9062AA_LDO1_ILIM_MASK) - 1, sizeof(unsigned int) * 8 - @@ -822,10 +798,10 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = { sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_LDO2_SL_B_MASK)) - 1), .suspend_vsel_reg = DA9062AA_VLDO2_B, - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VLDO2_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_LDO2_CONT, + __builtin_ffs((int)DA9062AA_LDO2_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VLDO2_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_LDO2_CONF_MASK) - 1), .oc_event = REG_FIELD(DA9062AA_STATUS_D, __builtin_ffs((int)DA9062AA_LDO2_ILIM_MASK) - 1, sizeof(unsigned int) * 8 - @@ -855,10 +831,10 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = { sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_LDO3_SL_B_MASK)) - 1), .suspend_vsel_reg = DA9062AA_VLDO3_B, - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VLDO3_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_LDO3_CONT, + __builtin_ffs((int)DA9062AA_LDO3_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VLDO3_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_LDO3_CONF_MASK) - 1), .oc_event = REG_FIELD(DA9062AA_STATUS_D, __builtin_ffs((int)DA9062AA_LDO3_ILIM_MASK) - 1, sizeof(unsigned int) * 8 - @@ -888,10 +864,10 @@ static const struct da9062_regulator_info local_da9062_regulator_info[] = { sizeof(unsigned int) * 8 - __builtin_clz((DA9062AA_LDO4_SL_B_MASK)) - 1), .suspend_vsel_reg = DA9062AA_VLDO4_B, - .suspend = REG_FIELD(DA9062AA_DVC_1, - __builtin_ffs((int)DA9062AA_VLDO4_SEL_MASK) - 1, + .suspend = REG_FIELD(DA9062AA_LDO4_CONT, + __builtin_ffs((int)DA9062AA_LDO4_CONF_MASK) - 1, sizeof(unsigned int) * 8 - - __builtin_clz((DA9062AA_VLDO4_SEL_MASK)) - 1), + __builtin_clz(DA9062AA_LDO4_CONF_MASK) - 1), .oc_event = REG_FIELD(DA9062AA_STATUS_D, __builtin_ffs((int)DA9062AA_LDO4_ILIM_MASK) - 1, sizeof(unsigned int) * 8 - From f3122a79a1b0a113d3aea748e0ec26f2cb2889de Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 17 Sep 2019 22:59:03 +0200 Subject: [PATCH 0822/2880] s390/topology: avoid firing events before kobjs are created arch_update_cpu_topology is first called from: kernel_init_freeable->sched_init_smp->sched_init_domains even before cpus has been registered in: kernel_init_freeable->do_one_initcall->s390_smp_init Do not trigger kobject_uevent change events until cpu devices are actually created. Fixes the following kasan findings: BUG: KASAN: global-out-of-bounds in kobject_uevent_env+0xb40/0xee0 Read of size 8 at addr 0000000000000020 by task swapper/0/1 BUG: KASAN: global-out-of-bounds in kobject_uevent_env+0xb36/0xee0 Read of size 8 at addr 0000000000000018 by task swapper/0/1 CPU: 0 PID: 1 Comm: swapper/0 Tainted: G B Hardware name: IBM 3906 M04 704 (LPAR) Call Trace: ([<0000000143c6db7e>] show_stack+0x14e/0x1a8) [<0000000145956498>] dump_stack+0x1d0/0x218 [<000000014429fb4c>] print_address_description+0x64/0x380 [<000000014429f630>] __kasan_report+0x138/0x168 [<0000000145960b96>] kobject_uevent_env+0xb36/0xee0 [<0000000143c7c47c>] arch_update_cpu_topology+0x104/0x108 [<0000000143df9e22>] sched_init_domains+0x62/0xe8 [<000000014644c94a>] sched_init_smp+0x3a/0xc0 [<0000000146433a20>] kernel_init_freeable+0x558/0x958 [<000000014599002a>] kernel_init+0x22/0x160 [<00000001459a71d4>] ret_from_fork+0x28/0x30 [<00000001459a71dc>] kernel_thread_starter+0x0/0x10 Cc: stable@vger.kernel.org Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/topology.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 2db6fb405a9a..3627953007ed 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -311,7 +311,8 @@ int arch_update_cpu_topology(void) on_each_cpu(__arch_update_dedicated_flag, NULL, 0); for_each_online_cpu(cpu) { dev = get_cpu_device(cpu); - kobject_uevent(&dev->kobj, KOBJ_CHANGE); + if (dev) + kobject_uevent(&dev->kobj, KOBJ_CHANGE); } return rc; } From ea298e6ee8b34b3ed4366be7eb799d0650ebe555 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 17 Sep 2019 20:04:04 +0200 Subject: [PATCH 0823/2880] s390/cio: avoid calling strlen on null pointer Fix the following kasan finding: BUG: KASAN: global-out-of-bounds in ccwgroup_create_dev+0x850/0x1140 Read of size 1 at addr 0000000000000000 by task systemd-udevd.r/561 CPU: 30 PID: 561 Comm: systemd-udevd.r Tainted: G B Hardware name: IBM 3906 M04 704 (LPAR) Call Trace: ([<0000000231b3db7e>] show_stack+0x14e/0x1a8) [<0000000233826410>] dump_stack+0x1d0/0x218 [<000000023216fac4>] print_address_description+0x64/0x380 [<000000023216f5a8>] __kasan_report+0x138/0x168 [<00000002331b8378>] ccwgroup_create_dev+0x850/0x1140 [<00000002332b618a>] group_store+0x3a/0x50 [<00000002323ac706>] kernfs_fop_write+0x246/0x3b8 [<00000002321d409a>] vfs_write+0x132/0x450 [<00000002321d47da>] ksys_write+0x122/0x208 [<0000000233877102>] system_call+0x2a6/0x2c8 Triggered by: openat(AT_FDCWD, "/sys/bus/ccwgroup/drivers/qeth/group", O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, 0666) = 16 write(16, "0.0.bd00,0.0.bd01,0.0.bd02", 26) = 26 The problem is that __get_next_id in ccwgroup_create_dev might set "buf" buffer pointer to NULL and explicit check for that is required. Cc: stable@vger.kernel.org Reviewed-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- drivers/s390/cio/ccwgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index c522e9313c50..ae66875a934d 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -372,7 +372,7 @@ int ccwgroup_create_dev(struct device *parent, struct ccwgroup_driver *gdrv, goto error; } /* Check for trailing stuff. */ - if (i == num_devices && strlen(buf) > 0) { + if (i == num_devices && buf && strlen(buf) > 0) { rc = -EINVAL; goto error; } From ab5758848039de9a4b249d46e4ab591197eebaf2 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 19 Sep 2019 15:55:17 +0200 Subject: [PATCH 0824/2880] s390/cio: exclude subchannels with no parent from pseudo check ccw console is created early in start_kernel and used before css is initialized or ccw console subchannel is registered. Until then console subchannel does not have a parent. For that reason assume subchannels with no parent are not pseudo subchannels. This fixes the following kasan finding: BUG: KASAN: global-out-of-bounds in sch_is_pseudo_sch+0x8e/0x98 Read of size 8 at addr 00000000000005e8 by task swapper/0/0 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-rc8-07370-g6ac43dd12538 #2 Hardware name: IBM 2964 NC9 702 (z/VM 6.4.0) Call Trace: ([<000000000012cd76>] show_stack+0x14e/0x1e0) [<0000000001f7fb44>] dump_stack+0x1a4/0x1f8 [<00000000007d7afc>] print_address_description+0x64/0x3c8 [<00000000007d75f6>] __kasan_report+0x14e/0x180 [<00000000018a2986>] sch_is_pseudo_sch+0x8e/0x98 [<000000000189b950>] cio_enable_subchannel+0x1d0/0x510 [<00000000018cac7c>] ccw_device_recognition+0x12c/0x188 [<0000000002ceb1a8>] ccw_device_enable_console+0x138/0x340 [<0000000002cf1cbe>] con3215_init+0x25e/0x300 [<0000000002c8770a>] console_init+0x68a/0x9b8 [<0000000002c6a3d6>] start_kernel+0x4fe/0x728 [<0000000000100070>] startup_continue+0x70/0xd0 Cc: stable@vger.kernel.org Reviewed-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- drivers/s390/cio/css.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 22c55816100b..1fbfb0a93f5f 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -1388,6 +1388,8 @@ device_initcall(cio_settle_init); int sch_is_pseudo_sch(struct subchannel *sch) { + if (!sch->dev.parent) + return 0; return sch == to_css(sch->dev.parent)->pseudo_subchannel; } From 82a9ac7130cf51c2640800fb0ef19d3a05cb8fff Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Wed, 7 Aug 2019 16:49:47 +0200 Subject: [PATCH 0825/2880] scsi: core: fix missing .cleanup_rq for SCSI hosts without request batching This was missing from scsi_mq_ops_no_commit of linux-next commit 8930a6c20791 ("scsi: core: add support for request batching") from Martin's scsi/5.4/scsi-queue or James' scsi/misc. See also linux-next commit b7e9e1fb7a92 ("scsi: implement .cleanup_rq callback") from block/for-next. Signed-off-by: Steffen Maier Fixes: 8930a6c20791 ("scsi: core: add support for request batching") Cc: Paolo Bonzini Cc: Ming Lei Reviewed-by: Paolo Bonzini Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index dc210b9d4896..12db0c13a927 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1834,6 +1834,7 @@ static const struct blk_mq_ops scsi_mq_ops_no_commit = { .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, .initialize_rq_fn = scsi_initialize_rq, + .cleanup_rq = scsi_cleanup_rq, .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, }; From 6b6fa7a5c86e1269d9f0c9a5b902072351317387 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Wed, 7 Aug 2019 16:49:48 +0200 Subject: [PATCH 0826/2880] scsi: core: fix dh and multipathing for SCSI hosts without request batching This was missing from scsi_device_from_queue() due to the introduction of another new scsi_mq_ops_no_commit of linux-next commit 8930a6c20791 ("scsi: core: add support for request batching") from Martin's scsi/5.4/scsi-queue or James' scsi/misc. Only devicehandler code seems to call scsi_device_from_queue(): *** drivers/scsi/scsi_dh.c: scsi_dh_activate[255] sdev = scsi_device_from_queue(q); scsi_dh_set_params[302] sdev = scsi_device_from_queue(q); scsi_dh_attach[325] sdev = scsi_device_from_queue(q); scsi_dh_attached_handler_name[363] sdev = scsi_device_from_queue(q); Fixes multipath tools follow-on errors: $ multipath -v6 ... libdevmapper: ioctl/libdm-iface.c(1887): device-mapper: reload ioctl on mpatha failed: No such device ... mpatha: failed to load map, error 19 ... showing also as kernel messages: device-mapper: table: 252:0: multipath: error attaching hardware handler device-mapper: ioctl: error adding target to table Signed-off-by: Steffen Maier Fixes: 8930a6c20791 ("scsi: core: add support for request batching") Cc: Paolo Bonzini Reviewed-by: Paolo Bonzini Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 12db0c13a927..5447738906ac 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1922,7 +1922,8 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q) { struct scsi_device *sdev = NULL; - if (q->mq_ops == &scsi_mq_ops) + if (q->mq_ops == &scsi_mq_ops_no_commit || + q->mq_ops == &scsi_mq_ops) sdev = q->queuedata; if (!sdev || !get_device(&sdev->sdev_gendev)) sdev = NULL; From c669675b56b4eb85e8daa3d2ae76db2fd6378b2f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 1 Aug 2019 15:39:37 -0700 Subject: [PATCH 0827/2880] thermal: int340x: processor_thermal: Add Ice Lake support Add new PCI id for Ice lake processor thermal device. Also enabled the RAPL mmio interface. The MMIO offsets match Skylake. Signed-off-by: Srinivas Pandruvada Signed-off-by: Zhang Rui --- .../thermal/intel/int340x_thermal/processor_thermal_device.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c index 97333fc4be42..89a015387283 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c @@ -39,6 +39,9 @@ /* GeminiLake thermal reporting device */ #define PCI_DEVICE_ID_PROC_GLK_THERMAL 0x318C +/* IceLake thermal reporting device */ +#define PCI_DEVICE_ID_PROC_ICL_THERMAL 0x8a03 + #define DRV_NAME "proc_thermal" struct power_config { @@ -719,6 +722,8 @@ static const struct pci_device_id proc_thermal_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_CNL_THERMAL)}, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_CFL_THERMAL)}, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_GLK_THERMAL)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_ICL_THERMAL), + .driver_data = (kernel_ulong_t)&rapl_mmio_hsw, }, { 0, }, }; From f639cff55fb414c2ee6d1032bc3244e1d15d6d40 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 17 Jul 2019 13:26:39 -0600 Subject: [PATCH 0828/2880] thermal: intel: int340x_thermal: Remove unnecessary acpi_has_method() uses acpi_evaluate_object() will already return in error if the method does not exist. Checking if the method is absent before the acpi_evaluate_object() call is not needed. Remove acpi_has_method() calls to avoid additional work. Signed-off-by: Kelsey Skunberg Reviewed-by: Rafael J. Wysocki Signed-off-by: Zhang Rui --- drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c b/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c index 9716bc3abaf9..7130e90773ed 100644 --- a/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c +++ b/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c @@ -77,9 +77,6 @@ int acpi_parse_trt(acpi_handle handle, int *trt_count, struct trt **trtp, struct acpi_buffer element = { 0, NULL }; struct acpi_buffer trt_format = { sizeof("RRNNNNNN"), "RRNNNNNN" }; - if (!acpi_has_method(handle, "_TRT")) - return -ENODEV; - status = acpi_evaluate_object(handle, "_TRT", NULL, &buffer); if (ACPI_FAILURE(status)) return -ENODEV; @@ -158,9 +155,6 @@ int acpi_parse_art(acpi_handle handle, int *art_count, struct art **artp, struct acpi_buffer art_format = { sizeof("RRNNNNNNNNNNN"), "RRNNNNNNNNNNN" }; - if (!acpi_has_method(handle, "_ART")) - return -ENODEV; - status = acpi_evaluate_object(handle, "_ART", NULL, &buffer); if (ACPI_FAILURE(status)) return -ENODEV; From 4c8a342c118ae5f09a1729efdfcc42d8f4485bd7 Mon Sep 17 00:00:00 2001 From: Rishi Gupta Date: Fri, 23 Aug 2019 07:01:42 +0530 Subject: [PATCH 0829/2880] thermal: intel: int3403: replace printk(KERN_WARN...) with pr_warn(...) Direct invocation of printk() is not preferred to emit logs. This commit replaces printk(KERN_WARNING) with corresponding pr_warn() function call. Signed-off-by: Rishi Gupta Signed-off-by: Zhang Rui --- drivers/thermal/intel/int340x_thermal/int3403_thermal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c index f5749d4418ae..a7bbd8584ae2 100644 --- a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c @@ -181,7 +181,7 @@ static int int3403_cdev_add(struct int3403_priv *priv) p = buf.pointer; if (!p || (p->type != ACPI_TYPE_PACKAGE)) { - printk(KERN_WARNING "Invalid PPSS data\n"); + pr_warn("Invalid PPSS data\n"); kfree(buf.pointer); return -EFAULT; } From 97e9cafe85a9111fd72cefb18cfc9f2e0b6f85f1 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Wed, 24 Jul 2019 20:23:37 +0800 Subject: [PATCH 0830/2880] thermal: intel: Use dev_get_drvdata Instead of using to_pci_dev + pci_get_drvdata, use dev_get_drvdata to make code simpler. Signed-off-by: Chuhong Yuan Signed-off-by: Zhang Rui --- drivers/thermal/intel/intel_pch_thermal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/intel/intel_pch_thermal.c b/drivers/thermal/intel/intel_pch_thermal.c index 99f8b2540f18..4f0bb8f502e1 100644 --- a/drivers/thermal/intel/intel_pch_thermal.c +++ b/drivers/thermal/intel/intel_pch_thermal.c @@ -371,16 +371,14 @@ static void intel_pch_thermal_remove(struct pci_dev *pdev) static int intel_pch_thermal_suspend(struct device *device) { - struct pci_dev *pdev = to_pci_dev(device); - struct pch_thermal_device *ptd = pci_get_drvdata(pdev); + struct pch_thermal_device *ptd = dev_get_drvdata(device); return ptd->ops->suspend(ptd); } static int intel_pch_thermal_resume(struct device *device) { - struct pci_dev *pdev = to_pci_dev(device); - struct pch_thermal_device *ptd = pci_get_drvdata(pdev); + struct pch_thermal_device *ptd = dev_get_drvdata(device); return ptd->ops->resume(ptd); } From 8c7aa184281c01fc26f319059efb94725012921d Mon Sep 17 00:00:00 2001 From: Stefan Mavrodiev Date: Fri, 26 Jul 2019 16:32:36 +0300 Subject: [PATCH 0831/2880] thermal_hwmon: Sanitize thermal_zone type When calling thermal_add_hwmon_sysfs(), the device type is sanitized by replacing '-' with '_'. However tz->type remains unsanitized. Thus calling thermal_hwmon_lookup_by_type() returns no device. And if there is no device, thermal_remove_hwmon_sysfs() fails with "hwmon device lookup failed!". The result is unregisted hwmon devices in the sysfs. Fixes: 409ef0bacacf ("thermal_hwmon: Sanitize attribute name passed to hwmon") Signed-off-by: Stefan Mavrodiev Signed-off-by: Zhang Rui --- drivers/thermal/thermal_hwmon.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_hwmon.c b/drivers/thermal/thermal_hwmon.c index 40c69a533b24..dd5d8ee37928 100644 --- a/drivers/thermal/thermal_hwmon.c +++ b/drivers/thermal/thermal_hwmon.c @@ -87,13 +87,17 @@ static struct thermal_hwmon_device * thermal_hwmon_lookup_by_type(const struct thermal_zone_device *tz) { struct thermal_hwmon_device *hwmon; + char type[THERMAL_NAME_LENGTH]; mutex_lock(&thermal_hwmon_list_lock); - list_for_each_entry(hwmon, &thermal_hwmon_list, node) - if (!strcmp(hwmon->type, tz->type)) { + list_for_each_entry(hwmon, &thermal_hwmon_list, node) { + strcpy(type, tz->type); + strreplace(type, '-', '_'); + if (!strcmp(hwmon->type, type)) { mutex_unlock(&thermal_hwmon_list_lock); return hwmon; } + } mutex_unlock(&thermal_hwmon_list_lock); return NULL; From adc8749b150c51e857ac248017971371f70de197 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 7 Aug 2019 11:01:30 +0800 Subject: [PATCH 0832/2880] thermal/drivers/core: Use put_device() if device_register() fails Never directly free @dev after calling device_register(), even if it returned an error! Always use put_device() to give up the reference initialized. Clean up the rollback block also. Signed-off-by: Yue Hu Signed-off-by: Zhang Rui --- drivers/thermal/thermal_core.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 6bab66e84eb5..bae1e412f57e 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -985,7 +985,7 @@ __thermal_cooling_device_register(struct device_node *np, result = device_register(&cdev->device); if (result) { ida_simple_remove(&thermal_cdev_ida, cdev->id); - kfree(cdev); + put_device(&cdev->device); return ERR_PTR(result); } @@ -1240,6 +1240,7 @@ thermal_zone_device_register(const char *type, int trips, int mask, struct thermal_zone_device *tz; enum thermal_trip_type trip_type; int trip_temp; + int id; int result; int count; struct thermal_governor *governor; @@ -1266,11 +1267,13 @@ thermal_zone_device_register(const char *type, int trips, int mask, INIT_LIST_HEAD(&tz->thermal_instances); ida_init(&tz->ida); mutex_init(&tz->lock); - result = ida_simple_get(&thermal_tz_ida, 0, 0, GFP_KERNEL); - if (result < 0) + id = ida_simple_get(&thermal_tz_ida, 0, 0, GFP_KERNEL); + if (id < 0) { + result = id; goto free_tz; + } - tz->id = result; + tz->id = id; strlcpy(tz->type, type, sizeof(tz->type)); tz->ops = ops; tz->tzp = tzp; @@ -1292,7 +1295,7 @@ thermal_zone_device_register(const char *type, int trips, int mask, dev_set_name(&tz->device, "thermal_zone%d", tz->id); result = device_register(&tz->device); if (result) - goto remove_device_groups; + goto release_device; for (count = 0; count < trips; count++) { if (tz->ops->get_trip_type(tz, count, &trip_type)) @@ -1343,14 +1346,12 @@ thermal_zone_device_register(const char *type, int trips, int mask, return tz; unregister: - ida_simple_remove(&thermal_tz_ida, tz->id); - device_unregister(&tz->device); - return ERR_PTR(result); - -remove_device_groups: - thermal_zone_destroy_device_groups(tz); + device_del(&tz->device); +release_device: + put_device(&tz->device); + tz = NULL; remove_id: - ida_simple_remove(&thermal_tz_ida, tz->id); + ida_simple_remove(&thermal_tz_ida, id); free_tz: kfree(tz); return ERR_PTR(result); From 1851799e1d2978f68eea5d9dff322e121dcf59c1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 10 Jul 2019 13:14:52 +0300 Subject: [PATCH 0833/2880] thermal: Fix use-after-free when unregistering thermal zone device thermal_zone_device_unregister() cancels the delayed work that polls the thermal zone, but it does not wait for it to finish. This is racy with respect to the freeing of the thermal zone device, which can result in a use-after-free [1]. Fix this by waiting for the delayed work to finish before freeing the thermal zone device. Note that thermal_zone_device_set_polling() is never invoked from an atomic context, so it is safe to call cancel_delayed_work_sync() that can block. [1] [ +0.002221] ================================================================== [ +0.000064] BUG: KASAN: use-after-free in __mutex_lock+0x1076/0x11c0 [ +0.000016] Read of size 8 at addr ffff8881e48e0450 by task kworker/1:0/17 [ +0.000023] CPU: 1 PID: 17 Comm: kworker/1:0 Not tainted 5.2.0-rc6-custom-02495-g8e73ca3be4af #1701 [ +0.000010] Hardware name: Mellanox Technologies Ltd. MSN2100-CB2FO/SA001017, BIOS 5.6.5 06/07/2016 [ +0.000016] Workqueue: events_freezable_power_ thermal_zone_device_check [ +0.000012] Call Trace: [ +0.000021] dump_stack+0xa9/0x10e [ +0.000020] print_address_description.cold.2+0x9/0x25e [ +0.000018] __kasan_report.cold.3+0x78/0x9d [ +0.000016] kasan_report+0xe/0x20 [ +0.000016] __mutex_lock+0x1076/0x11c0 [ +0.000014] step_wise_throttle+0x72/0x150 [ +0.000018] handle_thermal_trip+0x167/0x760 [ +0.000019] thermal_zone_device_update+0x19e/0x5f0 [ +0.000019] process_one_work+0x969/0x16f0 [ +0.000017] worker_thread+0x91/0xc40 [ +0.000014] kthread+0x33d/0x400 [ +0.000015] ret_from_fork+0x3a/0x50 [ +0.000020] Allocated by task 1: [ +0.000015] save_stack+0x19/0x80 [ +0.000015] __kasan_kmalloc.constprop.4+0xc1/0xd0 [ +0.000014] kmem_cache_alloc_trace+0x152/0x320 [ +0.000015] thermal_zone_device_register+0x1b4/0x13a0 [ +0.000015] mlxsw_thermal_init+0xc92/0x23d0 [ +0.000014] __mlxsw_core_bus_device_register+0x659/0x11b0 [ +0.000013] mlxsw_core_bus_device_register+0x3d/0x90 [ +0.000013] mlxsw_pci_probe+0x355/0x4b0 [ +0.000014] local_pci_probe+0xc3/0x150 [ +0.000013] pci_device_probe+0x280/0x410 [ +0.000013] really_probe+0x26a/0xbb0 [ +0.000013] driver_probe_device+0x208/0x2e0 [ +0.000013] device_driver_attach+0xfe/0x140 [ +0.000013] __driver_attach+0x110/0x310 [ +0.000013] bus_for_each_dev+0x14b/0x1d0 [ +0.000013] driver_register+0x1c0/0x400 [ +0.000015] mlxsw_sp_module_init+0x5d/0xd3 [ +0.000014] do_one_initcall+0x239/0x4dd [ +0.000013] kernel_init_freeable+0x42b/0x4e8 [ +0.000012] kernel_init+0x11/0x18b [ +0.000013] ret_from_fork+0x3a/0x50 [ +0.000015] Freed by task 581: [ +0.000013] save_stack+0x19/0x80 [ +0.000014] __kasan_slab_free+0x125/0x170 [ +0.000013] kfree+0xf3/0x310 [ +0.000013] thermal_release+0xc7/0xf0 [ +0.000014] device_release+0x77/0x200 [ +0.000014] kobject_put+0x1a8/0x4c0 [ +0.000014] device_unregister+0x38/0xc0 [ +0.000014] thermal_zone_device_unregister+0x54e/0x6a0 [ +0.000014] mlxsw_thermal_fini+0x184/0x35a [ +0.000014] mlxsw_core_bus_device_unregister+0x10a/0x640 [ +0.000013] mlxsw_devlink_core_bus_device_reload+0x92/0x210 [ +0.000015] devlink_nl_cmd_reload+0x113/0x1f0 [ +0.000014] genl_family_rcv_msg+0x700/0xee0 [ +0.000013] genl_rcv_msg+0xca/0x170 [ +0.000013] netlink_rcv_skb+0x137/0x3a0 [ +0.000012] genl_rcv+0x29/0x40 [ +0.000013] netlink_unicast+0x49b/0x660 [ +0.000013] netlink_sendmsg+0x755/0xc90 [ +0.000013] __sys_sendto+0x3de/0x430 [ +0.000013] __x64_sys_sendto+0xe2/0x1b0 [ +0.000013] do_syscall_64+0xa4/0x4d0 [ +0.000013] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ +0.000017] The buggy address belongs to the object at ffff8881e48e0008 which belongs to the cache kmalloc-2k of size 2048 [ +0.000012] The buggy address is located 1096 bytes inside of 2048-byte region [ffff8881e48e0008, ffff8881e48e0808) [ +0.000007] The buggy address belongs to the page: [ +0.000012] page:ffffea0007923800 refcount:1 mapcount:0 mapping:ffff88823680d0c0 index:0x0 compound_mapcount: 0 [ +0.000020] flags: 0x200000000010200(slab|head) [ +0.000019] raw: 0200000000010200 ffffea0007682008 ffffea00076ab808 ffff88823680d0c0 [ +0.000016] raw: 0000000000000000 00000000000d000d 00000001ffffffff 0000000000000000 [ +0.000007] page dumped because: kasan: bad access detected [ +0.000012] Memory state around the buggy address: [ +0.000012] ffff8881e48e0300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ +0.000012] ffff8881e48e0380: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ +0.000012] >ffff8881e48e0400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ +0.000008] ^ [ +0.000012] ffff8881e48e0480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ +0.000012] ffff8881e48e0500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ +0.000007] ================================================================== Fixes: b1569e99c795 ("ACPI: move thermal trip handling to generic thermal layer") Reported-by: Jiri Pirko Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: Zhang Rui --- drivers/thermal/thermal_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index bae1e412f57e..892f3548ad6d 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -304,7 +304,7 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz, &tz->poll_queue, msecs_to_jiffies(delay)); else - cancel_delayed_work(&tz->poll_queue); + cancel_delayed_work_sync(&tz->poll_queue); } static void monitor_thermal_zone(struct thermal_zone_device *tz) From 67eed44b8a8ae7ca1a1a77c64d2c4815f00e361b Mon Sep 17 00:00:00 2001 From: Amit Kucheria Date: Fri, 12 Jul 2019 02:01:58 +0530 Subject: [PATCH 0834/2880] thermal: Add some error messages When registering a thermal zone device, we currently return -EINVAL in four cases. This makes it a little hard to debug the real cause of the failure. Print some error messages to make it easier for developer to figure out what happened. Signed-off-by: Amit Kucheria Signed-off-by: Zhang Rui --- drivers/thermal/thermal_core.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 892f3548ad6d..d4481cc8958f 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1245,17 +1245,26 @@ thermal_zone_device_register(const char *type, int trips, int mask, int count; struct thermal_governor *governor; - if (!type || strlen(type) == 0) + if (!type || strlen(type) == 0) { + pr_err("Error: No thermal zone type defined\n"); return ERR_PTR(-EINVAL); + } - if (type && strlen(type) >= THERMAL_NAME_LENGTH) + if (type && strlen(type) >= THERMAL_NAME_LENGTH) { + pr_err("Error: Thermal zone name (%s) too long, should be under %d chars\n", + type, THERMAL_NAME_LENGTH); return ERR_PTR(-EINVAL); + } - if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips) + if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips) { + pr_err("Error: Incorrect number of thermal trips\n"); return ERR_PTR(-EINVAL); + } - if (!ops) + if (!ops) { + pr_err("Error: Thermal zone device ops not defined\n"); return ERR_PTR(-EINVAL); + } if (trips > 0 && (!ops->get_trip_type || !ops->get_trip_temp)) return ERR_PTR(-EINVAL); From bf8ca04d8bfdb64b10a8fc8a08013aeb8ad1fb10 Mon Sep 17 00:00:00 2001 From: Amit Kucheria Date: Wed, 21 Aug 2019 14:38:31 +0530 Subject: [PATCH 0835/2880] MAINTAINERS: Add Amit Kucheria as reviewer for thermal Add Amit Kucheria as the reviewer for thermal as he would like to participate in the review process effort for the thermal framework. Signed-off-by: Amit Kucheria Signed-off-by: Zhang Rui --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 43604d6ab96c..38f9c2f3b4ff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15910,6 +15910,7 @@ THERMAL M: Zhang Rui M: Eduardo Valentin R: Daniel Lezcano +R: Amit Kucheria L: linux-pm@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal.git From c9c53749375ccce0191f89b86732e642f914bd82 Mon Sep 17 00:00:00 2001 From: Laurence Oberman Date: Wed, 11 Sep 2019 09:56:42 -0400 Subject: [PATCH 0836/2880] scsi: bnx2fc: Handle scope bits when array returns BUSY or TSF The qla2xxx driver had this issue as well when the newer array firmware returned the retry_delay_timer in the fcp_rsp. The bnx2fc is not handling the masking of the scope bits either so the retry_delay_timestamp value lands up being a large value added to the timer timestamp delaying I/O for up to 27 Minutes. This patch adds similar code to handle this to the bnx2fc driver to avoid the huge delay. Link: https://lore.kernel.org/r/1568210202-12794-1-git-send-email-loberman@redhat.com Signed-off-by: Laurence Oberman Reported-by: David Jeffery Reported-by: kbuild test robot Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2fc/bnx2fc_io.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c index da00ca5fa5dc..401743e2b429 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_io.c +++ b/drivers/scsi/bnx2fc/bnx2fc_io.c @@ -1923,6 +1923,7 @@ void bnx2fc_process_scsi_cmd_compl(struct bnx2fc_cmd *io_req, struct fcoe_fcp_rsp_payload *fcp_rsp; struct bnx2fc_rport *tgt = io_req->tgt; struct scsi_cmnd *sc_cmd; + u16 scope = 0, qualifier = 0; /* scsi_cmd_cmpl is called with tgt lock held */ @@ -1990,12 +1991,30 @@ void bnx2fc_process_scsi_cmd_compl(struct bnx2fc_cmd *io_req, if (io_req->cdb_status == SAM_STAT_TASK_SET_FULL || io_req->cdb_status == SAM_STAT_BUSY) { - /* Set the jiffies + retry_delay_timer * 100ms - for the rport/tgt */ - tgt->retry_delay_timestamp = jiffies + - fcp_rsp->retry_delay_timer * HZ / 10; + /* Newer array firmware with BUSY or + * TASK_SET_FULL may return a status that needs + * the scope bits masked. + * Or a huge delay timestamp up to 27 minutes + * can result. + */ + if (fcp_rsp->retry_delay_timer) { + /* Upper 2 bits */ + scope = fcp_rsp->retry_delay_timer + & 0xC000; + /* Lower 14 bits */ + qualifier = fcp_rsp->retry_delay_timer + & 0x3FFF; + } + if (scope > 0 && qualifier > 0 && + qualifier <= 0x3FEF) { + /* Set the jiffies + + * retry_delay_timer * 100ms + * for the rport/tgt + */ + tgt->retry_delay_timestamp = jiffies + + (qualifier * HZ / 10); + } } - } if (io_req->fcp_resid) scsi_set_resid(sc_cmd, io_req->fcp_resid); From 3a83f677a6eeff65751b29e3648d7c69c3be83f3 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 11 Sep 2019 17:31:55 -0500 Subject: [PATCH 0837/2880] KVM: PPC: Book3S HV: use smp_mb() when setting/clearing host_ipi flag On a 2-socket Power9 system with 32 cores/128 threads (SMT4) and 1TB of memory running the following guest configs: guest A: - 224GB of memory - 56 VCPUs (sockets=1,cores=28,threads=2), where: VCPUs 0-1 are pinned to CPUs 0-3, VCPUs 2-3 are pinned to CPUs 4-7, ... VCPUs 54-55 are pinned to CPUs 108-111 guest B: - 4GB of memory - 4 VCPUs (sockets=1,cores=4,threads=1) with the following workloads (with KSM and THP enabled in all): guest A: stress --cpu 40 --io 20 --vm 20 --vm-bytes 512M guest B: stress --cpu 4 --io 4 --vm 4 --vm-bytes 512M host: stress --cpu 4 --io 4 --vm 2 --vm-bytes 256M the below soft-lockup traces were observed after an hour or so and persisted until the host was reset (this was found to be reliably reproducible for this configuration, for kernels 4.15, 4.18, 5.0, and 5.3-rc5): [ 1253.183290] rcu: INFO: rcu_sched self-detected stall on CPU [ 1253.183319] rcu: 124-....: (5250 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=1941 [ 1256.287426] watchdog: BUG: soft lockup - CPU#105 stuck for 23s! [CPU 52/KVM:19709] [ 1264.075773] watchdog: BUG: soft lockup - CPU#24 stuck for 23s! [worker:19913] [ 1264.079769] watchdog: BUG: soft lockup - CPU#31 stuck for 23s! [worker:20331] [ 1264.095770] watchdog: BUG: soft lockup - CPU#45 stuck for 23s! [worker:20338] [ 1264.131773] watchdog: BUG: soft lockup - CPU#64 stuck for 23s! [avocado:19525] [ 1280.408480] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791] [ 1316.198012] rcu: INFO: rcu_sched self-detected stall on CPU [ 1316.198032] rcu: 124-....: (21003 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=8243 [ 1340.411024] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791] [ 1379.212609] rcu: INFO: rcu_sched self-detected stall on CPU [ 1379.212629] rcu: 124-....: (36756 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=14714 [ 1404.413615] watchdog: BUG: soft lockup - CPU#124 stuck for 22s! [ksmd:791] [ 1442.227095] rcu: INFO: rcu_sched self-detected stall on CPU [ 1442.227115] rcu: 124-....: (52509 ticks this GP) idle=10a/1/0x4000000000000002 softirq=5408/5408 fqs=21403 [ 1455.111787] INFO: task worker:19907 blocked for more than 120 seconds. [ 1455.111822] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.111833] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.111884] INFO: task worker:19908 blocked for more than 120 seconds. [ 1455.111905] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.111925] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.111966] INFO: task worker:20328 blocked for more than 120 seconds. [ 1455.111986] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.111998] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112048] INFO: task worker:20330 blocked for more than 120 seconds. [ 1455.112068] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112097] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112138] INFO: task worker:20332 blocked for more than 120 seconds. [ 1455.112159] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112179] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112210] INFO: task worker:20333 blocked for more than 120 seconds. [ 1455.112231] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112242] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112282] INFO: task worker:20335 blocked for more than 120 seconds. [ 1455.112303] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 [ 1455.112332] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1455.112372] INFO: task worker:20336 blocked for more than 120 seconds. [ 1455.112392] Tainted: G L 5.3.0-rc5-mdr-vanilla+ #1 CPUs 45, 24, and 124 are stuck on spin locks, likely held by CPUs 105 and 31. CPUs 105 and 31 are stuck in smp_call_function_many(), waiting on target CPU 42. For instance: # CPU 105 registers (via xmon) R00 = c00000000020b20c R16 = 00007d1bcd800000 R01 = c00000363eaa7970 R17 = 0000000000000001 R02 = c0000000019b3a00 R18 = 000000000000006b R03 = 000000000000002a R19 = 00007d537d7aecf0 R04 = 000000000000002a R20 = 60000000000000e0 R05 = 000000000000002a R21 = 0801000000000080 R06 = c0002073fb0caa08 R22 = 0000000000000d60 R07 = c0000000019ddd78 R23 = 0000000000000001 R08 = 000000000000002a R24 = c00000000147a700 R09 = 0000000000000001 R25 = c0002073fb0ca908 R10 = c000008ffeb4e660 R26 = 0000000000000000 R11 = c0002073fb0ca900 R27 = c0000000019e2464 R12 = c000000000050790 R28 = c0000000000812b0 R13 = c000207fff623e00 R29 = c0002073fb0ca808 R14 = 00007d1bbee00000 R30 = c0002073fb0ca800 R15 = 00007d1bcd600000 R31 = 0000000000000800 pc = c00000000020b260 smp_call_function_many+0x3d0/0x460 cfar= c00000000020b270 smp_call_function_many+0x3e0/0x460 lr = c00000000020b20c smp_call_function_many+0x37c/0x460 msr = 900000010288b033 cr = 44024824 ctr = c000000000050790 xer = 0000000000000000 trap = 100 CPU 42 is running normally, doing VCPU work: # CPU 42 stack trace (via xmon) [link register ] c00800001be17188 kvmppc_book3s_radix_page_fault+0x90/0x2b0 [kvm_hv] [c000008ed3343820] c000008ed3343850 (unreliable) [c000008ed33438d0] c00800001be11b6c kvmppc_book3s_hv_page_fault+0x264/0xe30 [kvm_hv] [c000008ed33439d0] c00800001be0d7b4 kvmppc_vcpu_run_hv+0x8dc/0xb50 [kvm_hv] [c000008ed3343ae0] c00800001c10891c kvmppc_vcpu_run+0x34/0x48 [kvm] [c000008ed3343b00] c00800001c10475c kvm_arch_vcpu_ioctl_run+0x244/0x420 [kvm] [c000008ed3343b90] c00800001c0f5a78 kvm_vcpu_ioctl+0x470/0x7c8 [kvm] [c000008ed3343d00] c000000000475450 do_vfs_ioctl+0xe0/0xc70 [c000008ed3343db0] c0000000004760e4 ksys_ioctl+0x104/0x120 [c000008ed3343e00] c000000000476128 sys_ioctl+0x28/0x80 [c000008ed3343e20] c00000000000b388 system_call+0x5c/0x70 --- Exception: c00 (System Call) at 00007d545cfd7694 SP (7d53ff7edf50) is in userspace It was subsequently found that ipi_message[PPC_MSG_CALL_FUNCTION] was set for CPU 42 by at least 1 of the CPUs waiting in smp_call_function_many(), but somehow the corresponding call_single_queue entries were never processed by CPU 42, causing the callers to spin in csd_lock_wait() indefinitely. Nick Piggin suggested something similar to the following sequence as a possible explanation (interleaving of CALL_FUNCTION/RESCHEDULE IPI messages seems to be most common, but any mix of CALL_FUNCTION and !CALL_FUNCTION messages could trigger it): CPU X: smp_muxed_ipi_set_message(): X: smp_mb() X: message[RESCHEDULE] = 1 X: doorbell_global_ipi(42): X: kvmppc_set_host_ipi(42, 1) X: ppc_msgsnd_sync()/smp_mb() X: ppc_msgsnd() -> 42 42: doorbell_exception(): // from CPU X 42: ppc_msgsync() 105: smp_muxed_ipi_set_message(): 105: smb_mb() // STORE DEFERRED DUE TO RE-ORDERING --105: message[CALL_FUNCTION] = 1 | 105: doorbell_global_ipi(42): | 105: kvmppc_set_host_ipi(42, 1) | 42: kvmppc_set_host_ipi(42, 0) | 42: smp_ipi_demux_relaxed() | 42: // returns to executing guest | // RE-ORDERED STORE COMPLETES ->105: message[CALL_FUNCTION] = 1 105: ppc_msgsnd_sync()/smp_mb() 105: ppc_msgsnd() -> 42 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored 105: // hangs waiting on 42 to process messages/call_single_queue This can be prevented with an smp_mb() at the beginning of kvmppc_set_host_ipi(), such that stores to message[] (or other state indicated by the host_ipi flag) are ordered vs. the store to to host_ipi. However, doing so might still allow for the following scenario (not yet observed): CPU X: smp_muxed_ipi_set_message(): X: smp_mb() X: message[RESCHEDULE] = 1 X: doorbell_global_ipi(42): X: kvmppc_set_host_ipi(42, 1) X: ppc_msgsnd_sync()/smp_mb() X: ppc_msgsnd() -> 42 42: doorbell_exception(): // from CPU X 42: ppc_msgsync() // STORE DEFERRED DUE TO RE-ORDERING -- 42: kvmppc_set_host_ipi(42, 0) | 42: smp_ipi_demux_relaxed() | 105: smp_muxed_ipi_set_message(): | 105: smb_mb() | 105: message[CALL_FUNCTION] = 1 | 105: doorbell_global_ipi(42): | 105: kvmppc_set_host_ipi(42, 1) | // RE-ORDERED STORE COMPLETES -> 42: kvmppc_set_host_ipi(42, 0) 42: // returns to executing guest 105: ppc_msgsnd_sync()/smp_mb() 105: ppc_msgsnd() -> 42 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored 105: // hangs waiting on 42 to process messages/call_single_queue Fixing this scenario would require an smp_mb() *after* clearing host_ipi flag in kvmppc_set_host_ipi() to order the store vs. subsequent processing of IPI messages. To handle both cases, this patch splits kvmppc_set_host_ipi() into separate set/clear functions, where we execute smp_mb() prior to setting host_ipi flag, and after clearing host_ipi flag. These functions pair with each other to synchronize the sender and receiver sides. With that change in place the above workload ran for 20 hours without triggering any lock-ups. Fixes: 755563bc79c7 ("powerpc/powernv: Fixes for hypervisor doorbell handling") # v4.0 Signed-off-by: Michael Roth Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190911223155.16045-1-mdroth@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_ppc.h | 100 +++++++++++++++++++++++++- arch/powerpc/kernel/dbell.c | 6 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 2 +- arch/powerpc/platforms/powernv/smp.c | 2 +- arch/powerpc/sysdev/xics/icp-native.c | 6 +- arch/powerpc/sysdev/xics/icp-opal.c | 6 +- 6 files changed, 108 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 8e8514efb124..ee62776e5433 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -452,9 +452,100 @@ static inline u32 kvmppc_get_xics_latch(void) return xirr; } -static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi) +/* + * To avoid the need to unnecessarily exit fully to the host kernel, an IPI to + * a CPU thread that's running/napping inside of a guest is by default regarded + * as a request to wake the CPU (if needed) and continue execution within the + * guest, potentially to process new state like externally-generated + * interrupts or IPIs sent from within the guest itself (e.g. H_PROD/H_IPI). + * + * To force an exit to the host kernel, kvmppc_set_host_ipi() must be called + * prior to issuing the IPI to set the corresponding 'host_ipi' flag in the + * target CPU's PACA. To avoid unnecessary exits to the host, this flag should + * be immediately cleared via kvmppc_clear_host_ipi() by the IPI handler on + * the receiving side prior to processing the IPI work. + * + * NOTE: + * + * We currently issue an smp_mb() at the beginning of kvmppc_set_host_ipi(). + * This is to guard against sequences such as the following: + * + * CPU + * X: smp_muxed_ipi_set_message(): + * X: smp_mb() + * X: message[RESCHEDULE] = 1 + * X: doorbell_global_ipi(42): + * X: kvmppc_set_host_ipi(42) + * X: ppc_msgsnd_sync()/smp_mb() + * X: ppc_msgsnd() -> 42 + * 42: doorbell_exception(): // from CPU X + * 42: ppc_msgsync() + * 105: smp_muxed_ipi_set_message(): + * 105: smb_mb() + * // STORE DEFERRED DUE TO RE-ORDERING + * --105: message[CALL_FUNCTION] = 1 + * | 105: doorbell_global_ipi(42): + * | 105: kvmppc_set_host_ipi(42) + * | 42: kvmppc_clear_host_ipi(42) + * | 42: smp_ipi_demux_relaxed() + * | 42: // returns to executing guest + * | // RE-ORDERED STORE COMPLETES + * ->105: message[CALL_FUNCTION] = 1 + * 105: ppc_msgsnd_sync()/smp_mb() + * 105: ppc_msgsnd() -> 42 + * 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored + * 105: // hangs waiting on 42 to process messages/call_single_queue + * + * We also issue an smp_mb() at the end of kvmppc_clear_host_ipi(). This is + * to guard against sequences such as the following (as well as to create + * a read-side pairing with the barrier in kvmppc_set_host_ipi()): + * + * CPU + * X: smp_muxed_ipi_set_message(): + * X: smp_mb() + * X: message[RESCHEDULE] = 1 + * X: doorbell_global_ipi(42): + * X: kvmppc_set_host_ipi(42) + * X: ppc_msgsnd_sync()/smp_mb() + * X: ppc_msgsnd() -> 42 + * 42: doorbell_exception(): // from CPU X + * 42: ppc_msgsync() + * // STORE DEFERRED DUE TO RE-ORDERING + * -- 42: kvmppc_clear_host_ipi(42) + * | 42: smp_ipi_demux_relaxed() + * | 105: smp_muxed_ipi_set_message(): + * | 105: smb_mb() + * | 105: message[CALL_FUNCTION] = 1 + * | 105: doorbell_global_ipi(42): + * | 105: kvmppc_set_host_ipi(42) + * | // RE-ORDERED STORE COMPLETES + * -> 42: kvmppc_clear_host_ipi(42) + * 42: // returns to executing guest + * 105: ppc_msgsnd_sync()/smp_mb() + * 105: ppc_msgsnd() -> 42 + * 42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored + * 105: // hangs waiting on 42 to process messages/call_single_queue + */ +static inline void kvmppc_set_host_ipi(int cpu) { - paca_ptrs[cpu]->kvm_hstate.host_ipi = host_ipi; + /* + * order stores of IPI messages vs. setting of host_ipi flag + * + * pairs with the barrier in kvmppc_clear_host_ipi() + */ + smp_mb(); + paca_ptrs[cpu]->kvm_hstate.host_ipi = 1; +} + +static inline void kvmppc_clear_host_ipi(int cpu) +{ + paca_ptrs[cpu]->kvm_hstate.host_ipi = 0; + /* + * order clearing of host_ipi flag vs. processing of IPI messages + * + * pairs with the barrier in kvmppc_set_host_ipi() + */ + smp_mb(); } static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) @@ -486,7 +577,10 @@ static inline u32 kvmppc_get_xics_latch(void) return 0; } -static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi) +static inline void kvmppc_set_host_ipi(int cpu) +{} + +static inline void kvmppc_clear_host_ipi(int cpu) {} static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index 804b1a6196fa..f17ff1200eaa 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -33,7 +33,7 @@ void doorbell_global_ipi(int cpu) { u32 tag = get_hard_smp_processor_id(cpu); - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); /* Order previous accesses vs. msgsnd, which is treated as a store */ ppc_msgsnd_sync(); ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); @@ -48,7 +48,7 @@ void doorbell_core_ipi(int cpu) { u32 tag = cpu_thread_in_core(cpu); - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); /* Order previous accesses vs. msgsnd, which is treated as a store */ ppc_msgsnd_sync(); ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); @@ -84,7 +84,7 @@ void doorbell_exception(struct pt_regs *regs) may_hard_irq_enable(); - kvmppc_set_host_ipi(smp_processor_id(), 0); + kvmppc_clear_host_ipi(smp_processor_id()); __this_cpu_inc(irq_stat.doorbell_irqs); smp_ipi_demux_relaxed(); /* already performed the barrier */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 4d2ec77d806c..287d5911df0f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -58,7 +58,7 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) hcpu = hcore << threads_shift; kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); - kvmppc_set_host_ipi(hcpu, 1); + kvmppc_set_host_ipi(hcpu); smp_mb(); kvmhv_rm_send_ipi(hcpu); } diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 94cd96b9b7bb..fbd6e6b7bbf2 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -193,7 +193,7 @@ static void pnv_smp_cpu_kill_self(void) * for coming online, which are handled via * generic_check_cpu_restart() calls. */ - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); srr1 = pnv_cpu_offline(cpu); diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 485569ff7ef1..7d13d2ef5a90 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -140,7 +140,7 @@ static unsigned int icp_native_get_irq(void) static void icp_native_cause_ipi(int cpu) { - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); icp_native_set_qirr(cpu, IPI_PRIORITY); } @@ -179,7 +179,7 @@ void icp_native_flush_interrupt(void) if (vec == XICS_IPI) { /* Clear pending IPI */ int cpu = smp_processor_id(); - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); icp_native_set_qirr(cpu, 0xff); } else { pr_err("XICS: hw interrupt 0x%x to offline cpu, disabling\n", @@ -200,7 +200,7 @@ static irqreturn_t icp_native_ipi_action(int irq, void *dev_id) { int cpu = smp_processor_id(); - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); icp_native_set_qirr(cpu, 0xff); return smp_ipi_demux(); diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c index 8bb8dd7dd6ad..68fd2540b093 100644 --- a/arch/powerpc/sysdev/xics/icp-opal.c +++ b/arch/powerpc/sysdev/xics/icp-opal.c @@ -126,7 +126,7 @@ static void icp_opal_cause_ipi(int cpu) { int hw_cpu = get_hard_smp_processor_id(cpu); - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); opal_int_set_mfrr(hw_cpu, IPI_PRIORITY); } @@ -134,7 +134,7 @@ static irqreturn_t icp_opal_ipi_action(int irq, void *dev_id) { int cpu = smp_processor_id(); - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff); return smp_ipi_demux(); @@ -157,7 +157,7 @@ void icp_opal_flush_interrupt(void) if (vec == XICS_IPI) { /* Clear pending IPI */ int cpu = smp_processor_id(); - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff); } else { pr_err("XICS: hw interrupt 0x%x to offline cpu, " From f51913eef23f74c3bd07899dc7f1ed6df9e521d8 Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Wed, 18 Sep 2019 12:20:38 +0800 Subject: [PATCH 0838/2880] scsi: ufs: skip shutdown if hba is not powered In some cases, hba may go through shutdown flow without successful initialization and then make system hang. For example, if ufshcd_change_power_mode() gets error and leads to ufshcd_hba_exit() to release resources of the host, future shutdown flow may hang the system since the host register will be accessed in unpowered state. To solve this issue, simply add checking to skip shutdown for above kind of situation. Link: https://lore.kernel.org/r/1568780438-28753-1-git-send-email-stanley.chu@mediatek.com Signed-off-by: Stanley Chu Acked-by: Bean Huo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index c4a015e42045..57b2c3a8def3 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8140,6 +8140,9 @@ int ufshcd_shutdown(struct ufs_hba *hba) { int ret = 0; + if (!hba->is_powered) + goto out; + if (ufshcd_is_ufs_dev_poweroff(hba) && ufshcd_is_link_off(hba)) goto out; From 4b062e7cf940aa4f38fedc3a964b24cb095373f0 Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Thu, 19 Sep 2019 16:55:48 +0900 Subject: [PATCH 0839/2880] scsi: qedf: Remove always false 'tmp_prio < 0' statement Since tmp_prio is declared as u8, the following statement is always false. tmp_prio < 0 So remove 'always false' statement. Link: https://lore.kernel.org/r/20190919075548.GA112801@LGEARND20B15 Signed-off-by: Austin Kim Acked-by: Saurav Kashyap Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index 9c24f3834d70..1d2c111bdf82 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -596,7 +596,7 @@ static void qedf_dcbx_handler(void *dev, struct qed_dcbx_get *get, u32 mib_type) tmp_prio = get->operational.app_prio.fcoe; if (qedf_default_prio > -1) qedf->prio = qedf_default_prio; - else if (tmp_prio < 0 || tmp_prio > 7) { + else if (tmp_prio > 7) { QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "FIP/FCoE prio %d out of range, setting to %d.\n", tmp_prio, QEDF_DEFAULT_PRIO); From 0ed8810276907c8a633dc8cecc48dabb6678cd23 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 6 Sep 2019 10:24:20 -0700 Subject: [PATCH 0840/2880] scsi: storvsc: setup 1:1 mapping between hardware queue and CPU queue storvsc doesn't use a dedicated hardware queue for a given CPU queue. When issuing I/O, it selects returning CPU (hardware queue) dynamically based on vmbus channel usage across all channels. This patch advertises num_present_cpus() as number of hardware queues. This will have upper layer setup 1:1 mapping between hardware queue and CPU queue and avoid unnecessary locking when issuing I/O. Link: https://lore.kernel.org/r/1567790660-48142-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Long Li Reviewed-by: Michael Kelley Signed-off-by: Martin K. Petersen --- drivers/scsi/storvsc_drv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index ed8b9ac805e6..542d2bac2922 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1837,8 +1837,7 @@ static int storvsc_probe(struct hv_device *device, /* * Set the number of HW queues we are supporting. */ - if (stor_device->num_sc != 0) - host->nr_hw_queues = stor_device->num_sc + 1; + host->nr_hw_queues = num_present_cpus(); /* * Set the error handler work queue. From 70054aa39a013fa52eff432f2223b8bd5c0048f8 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Sat, 7 Sep 2019 09:07:30 +0800 Subject: [PATCH 0841/2880] scsi: megaraid: disable device when probe failed after enabled device For pci device, need to disable device when probe failed after enabled device. Link: https://lore.kernel.org/r/1567818450-173315-1-git-send-email-chenxiang66@hisilicon.com Signed-off-by: Xiang Chen Reviewed-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 45a66048801b..ff6d4aa92421 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -4183,11 +4183,11 @@ megaraid_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) */ if (pdev->subsystem_vendor == PCI_VENDOR_ID_COMPAQ && pdev->subsystem_device == 0xC000) - return -ENODEV; + goto out_disable_device; /* Now check the magic signature byte */ pci_read_config_word(pdev, PCI_CONF_AMISIG, &magic); if (magic != HBA_SIGNATURE_471 && magic != HBA_SIGNATURE) - return -ENODEV; + goto out_disable_device; /* Ok it is probably a megaraid */ } From 4b6b1bb68628ec49e4cbfabd8ac17a169f079cd8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 13:40:35 +0800 Subject: [PATCH 0842/2880] scsi: hisi_sas: Make three functions static Fix sparse warnings: drivers/scsi/hisi_sas/hisi_sas_main.c:3686:6: warning: symbol 'hisi_sas_debugfs_release' was not declared. Should it be static? drivers/scsi/hisi_sas/hisi_sas_main.c:3708:5: warning: symbol 'hisi_sas_debugfs_alloc' was not declared. Should it be static? drivers/scsi/hisi_sas/hisi_sas_main.c:3799:6: warning: symbol 'hisi_sas_debugfs_bist_init' was not declared. Should it be static? Link: https://lore.kernel.org/r/20190923054035.19036-1-yuehaibing@huawei.com Reported-by: Hulk Robot Signed-off-by: YueHaibing Acked-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index d1513fdf1e00..0847e682797b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -3683,7 +3683,7 @@ void hisi_sas_debugfs_work_handler(struct work_struct *work) } EXPORT_SYMBOL_GPL(hisi_sas_debugfs_work_handler); -void hisi_sas_debugfs_release(struct hisi_hba *hisi_hba) +static void hisi_sas_debugfs_release(struct hisi_hba *hisi_hba) { struct device *dev = hisi_hba->dev; int i; @@ -3705,7 +3705,7 @@ void hisi_sas_debugfs_release(struct hisi_hba *hisi_hba) devm_kfree(dev, hisi_hba->debugfs_port_reg[i]); } -int hisi_sas_debugfs_alloc(struct hisi_hba *hisi_hba) +static int hisi_sas_debugfs_alloc(struct hisi_hba *hisi_hba) { const struct hisi_sas_hw *hw = hisi_hba->hw; struct device *dev = hisi_hba->dev; @@ -3796,7 +3796,7 @@ fail: return -ENOMEM; } -void hisi_sas_debugfs_bist_init(struct hisi_hba *hisi_hba) +static void hisi_sas_debugfs_bist_init(struct hisi_hba *hisi_hba) { hisi_hba->debugfs_bist_dentry = debugfs_create_dir("bist", hisi_hba->debugfs_dir); From 248a445adfc8c33ffd67cf1f2e336578e34f9e21 Mon Sep 17 00:00:00 2001 From: Himanshu Madhani Date: Thu, 12 Sep 2019 11:09:05 -0700 Subject: [PATCH 0843/2880] scsi: qla2xxx: Silence fwdump template message Print if fwdt template is present or not, only when ql2xextended_error_logging is enabled. Link: https://lore.kernel.org/r/20190912180918.6436-2-hmadhani@marvell.com Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index b6cdf108994c..f6a5d78f93c9 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -3190,7 +3190,7 @@ qla2x00_alloc_fw_dump(scsi_qla_host_t *vha) for (j = 0; j < 2; j++, fwdt++) { if (!fwdt->template) { - ql_log(ql_log_warn, vha, 0x00ba, + ql_dbg(ql_dbg_init, vha, 0x00ba, "-> fwdt%u no template\n", j); continue; } From c3b6a1d397420a0fdd97af2f06abfb78adc370df Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:06 -0700 Subject: [PATCH 0844/2880] scsi: qla2xxx: Fix unbound sleep in fcport delete path. There are instances, though rare, where a LOGO request cannot be sent out and the thread in free session done can wait indefinitely. Fix this by putting an upper bound to sleep. Link: https://lore.kernel.org/r/20190912180918.6436-3-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_target.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 0ffda6171614..b58ecd2d7fb6 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -1020,6 +1020,7 @@ void qlt_free_session_done(struct work_struct *work) if (logout_started) { bool traced = false; + u16 cnt = 0; while (!READ_ONCE(sess->logout_completed)) { if (!traced) { @@ -1029,6 +1030,9 @@ void qlt_free_session_done(struct work_struct *work) traced = true; } msleep(100); + cnt++; + if (cnt > 200) + break; } ql_dbg(ql_dbg_disc, vha, 0xf087, From fd5564ba54e0d8a9e3e823d311b764232e09eb5f Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:07 -0700 Subject: [PATCH 0845/2880] scsi: qla2xxx: Fix stale mem access on driver unload On driver unload, 'remove_one' thread was allowed to advance, while session cleanup still lag behind. This patch ensures session deletion will finish before remove_one can advance. Link: https://lore.kernel.org/r/20190912180918.6436-4-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_os.c | 1 + drivers/scsi/qla2xxx/qla_target.c | 21 ++++++++------------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 51154c049385..42980e52fb41 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1118,6 +1118,7 @@ qla2x00_wait_for_sess_deletion(scsi_qla_host_t *vha) qla2x00_mark_all_devices_lost(vha, 0); wait_event_timeout(vha->fcport_waitQ, test_fcport_count(vha), 10*HZ); + flush_workqueue(vha->hw->wq); } /* diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index b58ecd2d7fb6..b5315be00b4d 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -953,7 +953,7 @@ void qlt_free_session_done(struct work_struct *work) struct qla_hw_data *ha = vha->hw; unsigned long flags; bool logout_started = false; - scsi_qla_host_t *base_vha; + scsi_qla_host_t *base_vha = pci_get_drvdata(ha->pdev); struct qlt_plogi_ack_t *own = sess->plogi_link[QLT_PLOGI_LINK_SAME_WWN]; @@ -1105,6 +1105,7 @@ void qlt_free_session_done(struct work_struct *work) } spin_unlock_irqrestore(&ha->tgt.sess_lock, flags); + sess->free_pending = 0; ql_dbg(ql_dbg_tgt_mgt, vha, 0xf001, "Unregistration of sess %p %8phC finished fcp_cnt %d\n", @@ -1113,17 +1114,8 @@ void qlt_free_session_done(struct work_struct *work) if (tgt && (tgt->sess_count == 0)) wake_up_all(&tgt->waitQ); - if (vha->fcport_count == 0) - wake_up_all(&vha->fcport_waitQ); - - base_vha = pci_get_drvdata(ha->pdev); - - sess->free_pending = 0; - - if (test_bit(PFLG_DRIVER_REMOVING, &base_vha->pci_flags)) - return; - - if ((!tgt || !tgt->tgt_stop) && !LOOP_TRANSITION(vha)) { + if (!test_bit(PFLG_DRIVER_REMOVING, &base_vha->pci_flags) && + (!tgt || !tgt->tgt_stop) && !LOOP_TRANSITION(vha)) { switch (vha->host->active_mode) { case MODE_INITIATOR: case MODE_DUAL: @@ -1136,6 +1128,9 @@ void qlt_free_session_done(struct work_struct *work) break; } } + + if (vha->fcport_count == 0) + wake_up_all(&vha->fcport_waitQ); } /* ha->tgt.sess_lock supposed to be held on entry */ @@ -1165,7 +1160,7 @@ void qlt_unreg_sess(struct fc_port *sess) sess->last_login_gen = sess->login_gen; INIT_WORK(&sess->free_work, qlt_free_session_done); - schedule_work(&sess->free_work); + queue_work(sess->vha->hw->wq, &sess->free_work); } EXPORT_SYMBOL(qlt_unreg_sess); From f5187b7d1ac66b61676f896751d3af9fcf8dd592 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:08 -0700 Subject: [PATCH 0846/2880] scsi: qla2xxx: Optimize NPIV tear down process In the case of NPIV port is being torn down, this patch will set a flag to indicate VPORT_DELETE. This would prevent relogin to be triggered. Link: https://lore.kernel.org/r/20190912180918.6436-5-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_attr.c | 2 ++ drivers/scsi/qla2xxx/qla_def.h | 1 + drivers/scsi/qla2xxx/qla_gs.c | 3 ++- drivers/scsi/qla2xxx/qla_mid.c | 32 +++++++++++++++++++++---------- drivers/scsi/qla2xxx/qla_os.c | 7 ++++++- drivers/scsi/qla2xxx/qla_target.c | 1 + 6 files changed, 34 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index e9c449ef515c..8b3015361428 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -2920,6 +2920,8 @@ qla24xx_vport_delete(struct fc_vport *fc_vport) struct qla_hw_data *ha = vha->hw; uint16_t id = vha->vp_idx; + set_bit(VPORT_DELETE, &vha->dpc_flags); + while (test_bit(LOOP_RESYNC_ACTIVE, &vha->dpc_flags) || test_bit(FCPORT_UPDATE_NEEDED, &vha->dpc_flags)) msleep(1000); diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 873a6aef1c5c..fa038568beb6 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -4394,6 +4394,7 @@ typedef struct scsi_qla_host { #define IOCB_WORK_ACTIVE 31 #define SET_ZIO_THRESHOLD_NEEDED 32 #define ISP_ABORT_TO_ROM 33 +#define VPORT_DELETE 34 unsigned long pci_flags; #define PFLG_DISCONNECTED 0 /* PCI device removed */ diff --git a/drivers/scsi/qla2xxx/qla_gs.c b/drivers/scsi/qla2xxx/qla_gs.c index dc0e36676313..5298ed10059f 100644 --- a/drivers/scsi/qla2xxx/qla_gs.c +++ b/drivers/scsi/qla2xxx/qla_gs.c @@ -3102,7 +3102,8 @@ int qla24xx_post_gpnid_work(struct scsi_qla_host *vha, port_id_t *id) { struct qla_work_evt *e; - if (test_bit(UNLOADING, &vha->dpc_flags)) + if (test_bit(UNLOADING, &vha->dpc_flags) || + (vha->vp_idx && test_bit(VPORT_DELETE, &vha->dpc_flags))) return 0; e = qla2x00_alloc_work(vha, QLA_EVT_GPNID); diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index 1a9a11ae7285..6afad68e5ba2 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -66,6 +66,7 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha) uint16_t vp_id; struct qla_hw_data *ha = vha->hw; unsigned long flags = 0; + u8 i; mutex_lock(&ha->vport_lock); /* @@ -75,8 +76,9 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha) * ensures no active vp_list traversal while the vport is removed * from the queue) */ - wait_event_timeout(vha->vref_waitq, !atomic_read(&vha->vref_count), - 10*HZ); + for (i = 0; i < 10 && atomic_read(&vha->vref_count); i++) + wait_event_timeout(vha->vref_waitq, + atomic_read(&vha->vref_count), HZ); spin_lock_irqsave(&ha->vport_slock, flags); if (atomic_read(&vha->vref_count)) { @@ -262,6 +264,9 @@ qla2x00_alert_all_vps(struct rsp_que *rsp, uint16_t *mb) spin_lock_irqsave(&ha->vport_slock, flags); list_for_each_entry(vha, &ha->vp_list, list) { if (vha->vp_idx) { + if (test_bit(VPORT_DELETE, &vha->dpc_flags)) + continue; + atomic_inc(&vha->vref_count); spin_unlock_irqrestore(&ha->vport_slock, flags); @@ -300,6 +305,20 @@ qla2x00_alert_all_vps(struct rsp_que *rsp, uint16_t *mb) int qla2x00_vp_abort_isp(scsi_qla_host_t *vha) { + fc_port_t *fcport; + + /* + * To exclusively reset vport, we need to log it out first. + * Note: This control_vp can fail if ISP reset is already + * issued, this is expected, as the vp would be already + * logged out due to ISP reset. + */ + if (!test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags)) { + qla24xx_control_vp(vha, VCE_COMMAND_DISABLE_VPS_LOGO_ALL); + list_for_each_entry(fcport, &vha->vp_fcports, list) + fcport->logout_on_delete = 0; + } + /* * Physical port will do most of the abort and recovery work. We can * just treat it as a loop down @@ -312,16 +331,9 @@ qla2x00_vp_abort_isp(scsi_qla_host_t *vha) atomic_set(&vha->loop_down_timer, LOOP_DOWN_TIME); } - /* - * To exclusively reset vport, we need to log it out first. Note: this - * control_vp can fail if ISP reset is already issued, this is - * expected, as the vp would be already logged out due to ISP reset. - */ - if (!test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags)) - qla24xx_control_vp(vha, VCE_COMMAND_DISABLE_VPS_LOGO_ALL); - ql_dbg(ql_dbg_taskm, vha, 0x801d, "Scheduling enable of Vport %d.\n", vha->vp_idx); + return qla24xx_enable_vp(vha); } diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 42980e52fb41..5eda86ff6606 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1115,9 +1115,14 @@ static inline int test_fcport_count(scsi_qla_host_t *vha) void qla2x00_wait_for_sess_deletion(scsi_qla_host_t *vha) { + u8 i; + qla2x00_mark_all_devices_lost(vha, 0); - wait_event_timeout(vha->fcport_waitQ, test_fcport_count(vha), 10*HZ); + for (i = 0; i < 10; i++) + wait_event_timeout(vha->fcport_waitQ, test_fcport_count(vha), + HZ); + flush_workqueue(vha->hw->wq); } diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index b5315be00b4d..a06e56224a55 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -1115,6 +1115,7 @@ void qlt_free_session_done(struct work_struct *work) wake_up_all(&tgt->waitQ); if (!test_bit(PFLG_DRIVER_REMOVING, &base_vha->pci_flags) && + !(vha->vp_idx && test_bit(VPORT_DELETE, &vha->dpc_flags)) && (!tgt || !tgt->tgt_stop) && !LOOP_TRANSITION(vha)) { switch (vha->host->active_mode) { case MODE_INITIATOR: From 7f2a398d59d658818f3d219645164676fbbc88e8 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:09 -0700 Subject: [PATCH 0847/2880] scsi: qla2xxx: Fix N2N link reset Fix stalled link recovery for N2N with FC-NVMe connection. Link: https://lore.kernel.org/r/20190912180918.6436-6-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_def.h | 3 +- drivers/scsi/qla2xxx/qla_init.c | 105 +++++++++++++++++++++++--------- drivers/scsi/qla2xxx/qla_mbx.c | 23 ++++++- drivers/scsi/qla2xxx/qla_os.c | 4 ++ 4 files changed, 102 insertions(+), 33 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index fa038568beb6..6ffa9877c28b 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -2396,6 +2396,7 @@ typedef struct fc_port { unsigned int query:1; unsigned int id_changed:1; unsigned int scan_needed:1; + unsigned int n2n_flag:1; struct completion nvme_del_done; uint32_t nvme_prli_service_param; @@ -2446,7 +2447,6 @@ typedef struct fc_port { uint8_t fc4_type; uint8_t fc4f_nvme; uint8_t scan_state; - uint8_t n2n_flag; unsigned long last_queue_full; unsigned long last_ramp_up; @@ -3036,6 +3036,7 @@ enum scan_flags_t { enum fc4type_t { FS_FC4TYPE_FCP = BIT_0, FS_FC4TYPE_NVME = BIT_1, + FS_FCP_IS_N2N = BIT_7, }; struct fab_scan_rp { diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index f6a5d78f93c9..67f522a8a9d4 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -746,12 +746,15 @@ static void qla24xx_handle_gnl_done_event(scsi_qla_host_t *vha, break; default: if ((id.b24 != fcport->d_id.b24 && - fcport->d_id.b24) || + fcport->d_id.b24 && + fcport->loop_id != FC_NO_LOOP_ID) || (fcport->loop_id != FC_NO_LOOP_ID && fcport->loop_id != loop_id)) { ql_dbg(ql_dbg_disc, vha, 0x20e3, "%s %d %8phC post del sess\n", __func__, __LINE__, fcport->port_name); + if (fcport->n2n_flag) + fcport->d_id.b24 = 0; qlt_schedule_sess_for_deletion(fcport); return; } @@ -759,6 +762,8 @@ static void qla24xx_handle_gnl_done_event(scsi_qla_host_t *vha, } fcport->loop_id = loop_id; + if (fcport->n2n_flag) + fcport->d_id.b24 = id.b24; wwn = wwn_to_u64(fcport->port_name); qlt_find_sess_invalidate_other(vha, wwn, @@ -972,7 +977,7 @@ static void qla24xx_async_gnl_sp_done(srb_t *sp, int res) wwn = wwn_to_u64(e->port_name); ql_dbg(ql_dbg_disc + ql_dbg_verbose, vha, 0x20e8, - "%s %8phC %02x:%02x:%02x state %d/%d lid %x \n", + "%s %8phC %02x:%02x:%02x CLS %x/%x lid %x \n", __func__, (void *)&wwn, e->port_id[2], e->port_id[1], e->port_id[0], e->current_login_state, e->last_login_state, (loop_id & 0x7fff)); @@ -1499,7 +1504,8 @@ int qla24xx_fcport_handle_login(struct scsi_qla_host *vha, fc_port_t *fcport) (fcport->fw_login_state == DSC_LS_PRLI_PEND))) return 0; - if (fcport->fw_login_state == DSC_LS_PLOGI_COMP) { + if (fcport->fw_login_state == DSC_LS_PLOGI_COMP && + !N2N_TOPO(vha->hw)) { if (time_before_eq(jiffies, fcport->plogi_nack_done_deadline)) { set_bit(RELOGIN_NEEDED, &vha->dpc_flags); return 0; @@ -1570,8 +1576,9 @@ int qla24xx_fcport_handle_login(struct scsi_qla_host *vha, fc_port_t *fcport) qla24xx_post_gpdb_work(vha, fcport, 0); } else { ql_dbg(ql_dbg_disc, vha, 0x2118, - "%s %d %8phC post NVMe PRLI\n", - __func__, __LINE__, fcport->port_name); + "%s %d %8phC post %s PRLI\n", + __func__, __LINE__, fcport->port_name, + fcport->fc4f_nvme ? "NVME" : "FC"); qla24xx_post_prli_work(vha, fcport); } break; @@ -1853,17 +1860,38 @@ qla24xx_handle_prli_done_event(struct scsi_qla_host *vha, struct event_arg *ea) break; } - if (ea->fcport->n2n_flag) { + if (ea->fcport->fc4f_nvme) { ql_dbg(ql_dbg_disc, vha, 0x2118, "%s %d %8phC post fc4 prli\n", __func__, __LINE__, ea->fcport->port_name); ea->fcport->fc4f_nvme = 0; - ea->fcport->n2n_flag = 0; qla24xx_post_prli_work(vha, ea->fcport); + return; + } + + /* at this point both PRLI NVME & PRLI FCP failed */ + if (N2N_TOPO(vha->hw)) { + if (ea->fcport->n2n_link_reset_cnt < 3) { + ea->fcport->n2n_link_reset_cnt++; + /* + * remote port is not sending Plogi. Reset + * link to kick start his state machine + */ + set_bit(N2N_LINK_RESET, &vha->dpc_flags); + } else { + ql_log(ql_log_warn, vha, 0x2119, + "%s %d %8phC Unable to reconnect\n", + __func__, __LINE__, ea->fcport->port_name); + } + } else { + /* + * switch connect. login failed. Take connection + * down and allow relogin to retrigger + */ + ea->fcport->flags &= ~FCF_ASYNC_SENT; + ea->fcport->keep_nport_handle = 0; + qlt_schedule_sess_for_deletion(ea->fcport); } - ql_dbg(ql_dbg_disc, vha, 0x2119, - "%s %d %8phC unhandle event of %x\n", - __func__, __LINE__, ea->fcport->port_name, ea->data[0]); break; } } @@ -5000,28 +5028,47 @@ qla2x00_configure_local_loop(scsi_qla_host_t *vha) unsigned long flags; /* Inititae N2N login. */ - if (test_and_clear_bit(N2N_LOGIN_NEEDED, &vha->dpc_flags)) { - /* borrowing */ - u32 *bp, i, sz; + if (N2N_TOPO(ha)) { + if (test_and_clear_bit(N2N_LOGIN_NEEDED, &vha->dpc_flags)) { + /* borrowing */ + u32 *bp, i, sz; - memset(ha->init_cb, 0, ha->init_cb_size); - sz = min_t(int, sizeof(struct els_plogi_payload), - ha->init_cb_size); - rval = qla24xx_get_port_login_templ(vha, ha->init_cb_dma, - (void *)ha->init_cb, sz); - if (rval == QLA_SUCCESS) { - bp = (uint32_t *)ha->init_cb; - for (i = 0; i < sz/4 ; i++, bp++) - *bp = cpu_to_be32(*bp); + memset(ha->init_cb, 0, ha->init_cb_size); + sz = min_t(int, sizeof(struct els_plogi_payload), + ha->init_cb_size); + rval = qla24xx_get_port_login_templ(vha, + ha->init_cb_dma, (void *)ha->init_cb, sz); + if (rval == QLA_SUCCESS) { + bp = (uint32_t *)ha->init_cb; + for (i = 0; i < sz/4 ; i++, bp++) + *bp = cpu_to_be32(*bp); - memcpy(&ha->plogi_els_payld.data, (void *)ha->init_cb, - sizeof(ha->plogi_els_payld.data)); - set_bit(RELOGIN_NEEDED, &vha->dpc_flags); - } else { - ql_dbg(ql_dbg_init, vha, 0x00d1, - "PLOGI ELS param read fail.\n"); + memcpy(&ha->plogi_els_payld.data, + (void *)ha->init_cb, + sizeof(ha->plogi_els_payld.data)); + set_bit(RELOGIN_NEEDED, &vha->dpc_flags); + } else { + ql_dbg(ql_dbg_init, vha, 0x00d1, + "PLOGI ELS param read fail.\n"); + goto skip_login; + } + } + + list_for_each_entry(fcport, &vha->vp_fcports, list) { + if (fcport->n2n_flag) { + qla24xx_fcport_handle_login(vha, fcport); + return QLA_SUCCESS; + } + } +skip_login: + spin_lock_irqsave(&vha->work_lock, flags); + vha->scan.scan_retry++; + spin_unlock_irqrestore(&vha->work_lock, flags); + + if (vha->scan.scan_retry < MAX_SCAN_RETRIES) { + set_bit(LOCAL_LOOP_UPDATE, &vha->dpc_flags); + set_bit(LOOP_RESYNC_NEEDED, &vha->dpc_flags); } - return QLA_SUCCESS; } found_devs = 0; diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 4c858e2d0ea8..6d6e10812b42 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -2249,7 +2249,7 @@ qla2x00_lip_reset(scsi_qla_host_t *vha) mbx_cmd_t mc; mbx_cmd_t *mcp = &mc; - ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x105a, + ql_dbg(ql_dbg_disc, vha, 0x105a, "Entered %s.\n", __func__); if (IS_CNA_CAPABLE(vha->hw)) { @@ -3883,14 +3883,23 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, case TOPO_N2N: ha->current_topology = ISP_CFG_N; spin_lock_irqsave(&vha->hw->tgt.sess_lock, flags); + list_for_each_entry(fcport, &vha->vp_fcports, list) { + fcport->scan_state = QLA_FCPORT_SCAN; + fcport->n2n_flag = 0; + } + fcport = qla2x00_find_fcport_by_wwpn(vha, rptid_entry->u.f1.port_name, 1); spin_unlock_irqrestore(&vha->hw->tgt.sess_lock, flags); if (fcport) { fcport->plogi_nack_done_deadline = jiffies + HZ; - fcport->dm_login_expire = jiffies + 3*HZ; + fcport->dm_login_expire = jiffies + 2*HZ; fcport->scan_state = QLA_FCPORT_FOUND; + fcport->n2n_flag = 1; + if (vha->flags.nvme_enabled) + fcport->fc4f_nvme = 1; + switch (fcport->disc_state) { case DSC_DELETED: set_bit(RELOGIN_NEEDED, @@ -3924,7 +3933,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, rptid_entry->u.f1.port_name, rptid_entry->u.f1.node_name, NULL, - FC4_TYPE_UNKNOWN); + FS_FCP_IS_N2N); } /* if our portname is higher then initiate N2N login */ @@ -4023,6 +4032,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, list_for_each_entry(fcport, &vha->vp_fcports, list) { fcport->scan_state = QLA_FCPORT_SCAN; + fcport->n2n_flag = 0; } fcport = qla2x00_find_fcport_by_wwpn(vha, @@ -4032,6 +4042,13 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, fcport->login_retry = vha->hw->login_retry_count; fcport->plogi_nack_done_deadline = jiffies + HZ; fcport->scan_state = QLA_FCPORT_FOUND; + fcport->n2n_flag = 1; + fcport->d_id.b.domain = + rptid_entry->u.f2.remote_nport_id[2]; + fcport->d_id.b.area = + rptid_entry->u.f2.remote_nport_id[1]; + fcport->d_id.b.al_pa = + rptid_entry->u.f2.remote_nport_id[0]; } } } diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 5eda86ff6606..8411dd5acd43 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -5033,6 +5033,10 @@ void qla24xx_create_new_sess(struct scsi_qla_host *vha, struct qla_work_evt *e) memcpy(fcport->port_name, e->u.new_sess.port_name, WWN_SIZE); + + if (e->u.new_sess.fc4_type & FS_FCP_IS_N2N) + fcport->n2n_flag = 1; + } else { ql_dbg(ql_dbg_disc, vha, 0xffff, "%s %8phC mem alloc fail.\n", From f3f1938bb673b1b5ad182c4608f5f8a24921eea3 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:10 -0700 Subject: [PATCH 0848/2880] scsi: qla2xxx: Fix N2N link up fail During link up/bounce, qla driver would do command flush as part of cleanup. In this case, the flush can intefere with FW state. This patch allows FW to be in control of link up. Link: https://lore.kernel.org/r/20190912180918.6436-7-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_mbx.c | 2 ++ drivers/scsi/qla2xxx/qla_os.c | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 6d6e10812b42..1cc6913f76c4 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -3897,6 +3897,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, fcport->dm_login_expire = jiffies + 2*HZ; fcport->scan_state = QLA_FCPORT_FOUND; fcport->n2n_flag = 1; + fcport->keep_nport_handle = 1; if (vha->flags.nvme_enabled) fcport->fc4f_nvme = 1; @@ -4042,6 +4043,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, fcport->login_retry = vha->hw->login_retry_count; fcport->plogi_nack_done_deadline = jiffies + HZ; fcport->scan_state = QLA_FCPORT_FOUND; + fcport->keep_nport_handle = 1; fcport->n2n_flag = 1; fcport->d_id.b.domain = rptid_entry->u.f2.remote_nport_id[2]; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 8411dd5acd43..ee47de9fbc05 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -5135,11 +5135,9 @@ void qla24xx_create_new_sess(struct scsi_qla_host *vha, struct qla_work_evt *e) if (dfcp) qlt_schedule_sess_for_deletion(tfcp); - - if (N2N_TOPO(vha->hw)) - fcport->flags &= ~FCF_FABRIC_DEVICE; - if (N2N_TOPO(vha->hw)) { + fcport->flags &= ~FCF_FABRIC_DEVICE; + fcport->keep_nport_handle = 1; if (vha->flags.nvme_enabled) { fcport->fc4f_nvme = 1; fcport->n2n_flag = 1; From 0aabb6b699f72dca96988d3f428e222f932dc889 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:11 -0700 Subject: [PATCH 0849/2880] scsi: qla2xxx: Fix Nport ID display value For N2N, the NPort ID is assigned by driver in the PLOGI ELS. According to FW Spec the byte order for SID is not the same as DID. Link: https://lore.kernel.org/r/20190912180918.6436-8-hmadhani@marvell.com Reviewed-by: Roman Bolshakov Tested-by: Roman Bolshakov Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_iocb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c index e92e52aa6e9b..518eb954cf42 100644 --- a/drivers/scsi/qla2xxx/qla_iocb.c +++ b/drivers/scsi/qla2xxx/qla_iocb.c @@ -2656,9 +2656,10 @@ qla24xx_els_logo_iocb(srb_t *sp, struct els_entry_24xx *els_iocb) els_iocb->port_id[0] = sp->fcport->d_id.b.al_pa; els_iocb->port_id[1] = sp->fcport->d_id.b.area; els_iocb->port_id[2] = sp->fcport->d_id.b.domain; - els_iocb->s_id[0] = vha->d_id.b.al_pa; - els_iocb->s_id[1] = vha->d_id.b.area; - els_iocb->s_id[2] = vha->d_id.b.domain; + /* For SID the byte order is different than DID */ + els_iocb->s_id[1] = vha->d_id.b.al_pa; + els_iocb->s_id[2] = vha->d_id.b.area; + els_iocb->s_id[0] = vha->d_id.b.domain; if (elsio->u.els_logo.els_cmd == ELS_DCMD_PLOGI) { els_iocb->control_flags = 0; From d2f15428d6a0ebfc0edc364094d7c4a2de7037ed Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 22 Sep 2019 00:55:46 -0500 Subject: [PATCH 0850/2880] smb3: fix leak in "open on server" perf counter We were not bumping up the "open on server" (num_remote_opens) counter (in some cases) on opens of the share root so could end up showing as a negative value. CC: Stable Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2ops.c | 5 +++++ fs/cifs/smb2pdu.c | 1 + 2 files changed, 6 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index eaed18061314..3010066a8709 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -751,6 +751,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) goto oshr_exit; } + atomic_inc(&tcon->num_remote_opens); + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; oparms.fid->volatile_fid = o_rsp->VolatileFileId; @@ -1176,6 +1178,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rc = compound_send_recv(xid, ses, flags, 3, rqst, resp_buftype, rsp_iov); + /* no need to bump num_remote_opens because handle immediately closed */ sea_exit: kfree(ea); @@ -1518,6 +1521,8 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; + + /* No need to bump num_remote_opens since handle immediately closed */ if (qi.flags & PASSTHRU_FSCTL) { pqi = (struct smb_query_info __user *)arg; io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 87066f1af12c..5f2491efd950 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2352,6 +2352,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, rqst.rq_iov = iov; rqst.rq_nvec = n_iov; + /* no need to inc num_remote_opens because we close it just below */ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); /* resource #4: response buffer */ From 388962e8e9ce8b253e91cbaed94e78a07dc92d84 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Sep 2019 15:06:18 +0800 Subject: [PATCH 0851/2880] fs/cifs/smb2pdu.c: Make SMB2_notify_init static Fix sparse warnings: fs/cifs/smb2pdu.c:3200:1: warning: symbol 'SMB2_notify_init' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5f2491efd950..ea08e6159481 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3181,7 +3181,7 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, * See MS-SMB2 2.2.35 and 2.2.36 */ -int +static int SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 completion_filter, bool watch_tree) From 8559ad8e89185ca55d8c9ef1e0b20f58578a4055 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 24 Sep 2019 10:13:47 +0800 Subject: [PATCH 0852/2880] fs/cifs/sess.c: Remove set but not used variable 'capabilities' Fixes gcc '-Wunused-but-set-variable' warning: fs/cifs/sess.c: In function sess_auth_lanman: fs/cifs/sess.c:910:8: warning: variable capabilities set but not used [-Wunused-but-set-variable] Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Steve French --- fs/cifs/sess.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 4c764ff7edd2..85bd644f9773 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -698,7 +698,6 @@ sess_auth_lanman(struct sess_data *sess_data) char *bcc_ptr; struct cifs_ses *ses = sess_data->ses; char lnm_session_key[CIFS_AUTH_RESP_SIZE]; - __u32 capabilities; __u16 bytes_remaining; /* lanman 2 style sessionsetup */ @@ -709,7 +708,7 @@ sess_auth_lanman(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + (void)cifs_ssetup_hdr(ses, pSMB); pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; From 63d37fb4ce5ae7bf1e58f906d1bf25f036fe79b2 Mon Sep 17 00:00:00 2001 From: Murphy Zhou Date: Sat, 21 Sep 2019 19:26:00 +0800 Subject: [PATCH 0853/2880] CIFS: fix max ea value size It should not be larger then the slab max buf size. If user specifies a larger size, it passes this check and goes straightly to SMB2_set_info_init performing an insecure memcpy. Signed-off-by: Murphy Zhou Reviewed-by: Aurelien Aptel CC: Stable Signed-off-by: Steve French --- fs/cifs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 9076150758d8..db4ba8f6077e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -31,7 +31,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" -#define MAX_EA_VALUE_SIZE 65535 +#define MAX_EA_VALUE_SIZE CIFSMaxBufSize #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" #define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ #define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ From 0b8dc6abbdb9f6696b1a79c42976e506645e5c2c Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Mon, 23 Sep 2019 10:47:03 +0800 Subject: [PATCH 0854/2880] rtw88: configure firmware after HCI started After firmware has been downloaded, driver should send some information to it through H2C commands. Those H2C commands are transmitted through TX path. But before HCI has been started, the TX path is not working completely. Such as PCI interfaces, the interrupts are not enabled, hence TX interrupts will not be issued after H2C skb has been DMAed to the device. And the H2C skbs will not be released until the device is powered off. Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtw88/mac.c | 3 --- drivers/net/wireless/realtek/rtw88/main.c | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/mac.c b/drivers/net/wireless/realtek/rtw88/mac.c index fc14b37d927d..b61b073031e5 100644 --- a/drivers/net/wireless/realtek/rtw88/mac.c +++ b/drivers/net/wireless/realtek/rtw88/mac.c @@ -707,9 +707,6 @@ int rtw_download_firmware(struct rtw_dev *rtwdev, struct rtw_fw_state *fw) rtwdev->h2c.last_box_num = 0; rtwdev->h2c.seq = 0; - rtw_fw_send_general_info(rtwdev); - rtw_fw_send_phydm_info(rtwdev); - rtw_flag_set(rtwdev, RTW_FLAG_FW_RUNNING); return 0; diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index fc8f6213fc8f..6dd457741b15 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -704,6 +704,10 @@ static int rtw_power_on(struct rtw_dev *rtwdev) goto err_off; } + /* send H2C after HCI has started */ + rtw_fw_send_general_info(rtwdev); + rtw_fw_send_phydm_info(rtwdev); + wifi_only = !rtwdev->efuse.btcoex; rtw_coex_power_on_setting(rtwdev); rtw_coex_init_hw_config(rtwdev, wifi_only); From 34c0989c05314b0288b058a4d1f6e58e2d320929 Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:28 +0200 Subject: [PATCH 0855/2880] iommu/amd: Fix pages leak in free_pagetable() Take into account the gathered freelist in free_sub_pt(), otherwise we end up leaking all that pages. Fixes: 409afa44f9ba ("iommu/amd: Introduce free_sub_pt() function") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1ed3b98324ba..138547446345 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1425,7 +1425,7 @@ static void free_pagetable(struct protection_domain *domain) BUG_ON(domain->mode < PAGE_MODE_NONE || domain->mode > PAGE_MODE_6_LEVEL); - free_sub_pt(root, domain->mode, freelist); + freelist = free_sub_pt(root, domain->mode, freelist); free_page_list(freelist); } From 6ccb72f8374e17d60b58a7bfd5570496332c54e2 Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:29 +0200 Subject: [PATCH 0856/2880] iommu/amd: Fix downgrading default page-sizes in alloc_pte() Downgrading an existing large mapping to a mapping using smaller page-sizes works only for the mappings created with page-mode 7 (i.e. non-default page size). Treat large mappings created with page-mode 0 (i.e. default page size) like a non-present mapping and allow to overwrite it in alloc_pte(). While around, make sure that we flush the TLB only if we change an existing mapping, otherwise we might end up acting on garbage PTEs. Fixes: 6d568ef9a622 ("iommu/amd: Allow downgrading page-sizes in alloc_pte()") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 138547446345..c7e28a8d25d1 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1490,6 +1490,7 @@ static u64 *alloc_pte(struct protection_domain *domain, pte_level = PM_PTE_LEVEL(__pte); if (!IOMMU_PTE_PRESENT(__pte) || + pte_level == PAGE_MODE_NONE || pte_level == PAGE_MODE_7_LEVEL) { page = (u64 *)get_zeroed_page(gfp); if (!page) @@ -1500,7 +1501,7 @@ static u64 *alloc_pte(struct protection_domain *domain, /* pte could have been changed somewhere. */ if (cmpxchg64(pte, __pte, __npte) != __pte) free_page((unsigned long)page); - else if (pte_level == PAGE_MODE_7_LEVEL) + else if (IOMMU_PTE_PRESENT(__pte)) domain->updated = true; continue; From 7f1f1683c1e27c280556766cfd2c219957cebe4f Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:30 +0200 Subject: [PATCH 0857/2880] iommu/amd: Introduce first_pte_l7() helper Given an arbitrary pte that is part of a large mapping, this function returns the first pte of the series (and optionally the mapped size and number of PTEs) It will be re-used in a subsequent patch to replace an existing L7 mapping. Fixes: 6d568ef9a622 ("iommu/amd: Allow downgrading page-sizes in alloc_pte()") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 40 ++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c7e28a8d25d1..a227e7a9b8b7 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -501,6 +501,29 @@ static void iommu_uninit_device(struct device *dev) */ } +/* + * Helper function to get the first pte of a large mapping + */ +static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, + unsigned long *count) +{ + unsigned long pte_mask, pg_size, cnt; + u64 *fpte; + + pg_size = PTE_PAGE_SIZE(*pte); + cnt = PAGE_SIZE_PTE_COUNT(pg_size); + pte_mask = ~((cnt << 3) - 1); + fpte = (u64 *)(((unsigned long)pte) & pte_mask); + + if (page_size) + *page_size = pg_size; + + if (count) + *count = cnt; + + return fpte; +} + /**************************************************************************** * * Interrupt handling functions @@ -1567,17 +1590,12 @@ static u64 *fetch_pte(struct protection_domain *domain, *page_size = PTE_LEVEL_PAGE_SIZE(level); } - if (PM_PTE_LEVEL(*pte) == 0x07) { - unsigned long pte_mask; - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - *page_size = pte_mask = PTE_PAGE_SIZE(*pte); - pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1); - pte = (u64 *)(((unsigned long)pte) & pte_mask); - } + /* + * If we have a series of large PTEs, make + * sure to return a pointer to the first one. + */ + if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) + pte = first_pte_l7(pte, page_size, NULL); return pte; } From cc449541f2a8a646ad5ac45cb1c7b59c3ed6b310 Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:31 +0200 Subject: [PATCH 0858/2880] iommu/amd: Unmap all L7 PTEs when downgrading page-sizes When replacing a large mapping created with page-mode 7 (i.e. non-default page size), tear down the entire series of replicated PTEs. Besides providing access to the old mapping, another thing that might go wrong with this issue is on the fetch_pte() code path that can return a PDE entry of the newly re-mapped range. While at it, make sure that we flush the TLB in case alloc_pte() fails and returns NULL at a lower level. Fixes: 6d568ef9a622 ("iommu/amd: Allow downgrading page-sizes in alloc_pte()") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a227e7a9b8b7..fda9923542c9 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1512,10 +1512,32 @@ static u64 *alloc_pte(struct protection_domain *domain, __pte = *pte; pte_level = PM_PTE_LEVEL(__pte); - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE || + /* + * If we replace a series of large PTEs, we need + * to tear down all of them. + */ + if (IOMMU_PTE_PRESENT(__pte) && pte_level == PAGE_MODE_7_LEVEL) { + unsigned long count, i; + u64 *lpte; + + lpte = first_pte_l7(pte, NULL, &count); + + /* + * Unmap the replicated PTEs that still match the + * original large mapping + */ + for (i = 0; i < count; ++i) + cmpxchg64(&lpte[i], __pte, 0ULL); + + domain->updated = true; + continue; + } + + if (!IOMMU_PTE_PRESENT(__pte) || + pte_level == PAGE_MODE_NONE) { page = (u64 *)get_zeroed_page(gfp); + if (!page) return NULL; @@ -1646,8 +1668,10 @@ static int iommu_map_page(struct protection_domain *dom, count = PAGE_SIZE_PTE_COUNT(page_size); pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp); - if (!pte) + if (!pte) { + update_domain(dom); return -ENOMEM; + } for (i = 0; i < count; ++i) freelist = free_clear_pte(&pte[i], pte[i], freelist); From d32d7c52e08a25d8d4b9f1a7b56400f35b8f72fa Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 1 Sep 2019 16:28:28 +0300 Subject: [PATCH 0859/2880] net/mlx5: DR, Fix SW steering HW bits and definitions Fix wrong reserved bits offsets. Fixes: 97b5484ed608 ("net/mlx5: Add HW bits and definitions required for SW steering") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a487b681b516..138c50d5a353 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -282,7 +282,6 @@ enum { MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT = 0x940, MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT = 0x941, MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT = 0x942, - MLX5_CMD_OP_SYNC_STEERING = 0xb00, MLX5_CMD_OP_FPGA_CREATE_QP = 0x960, MLX5_CMD_OP_FPGA_MODIFY_QP = 0x961, MLX5_CMD_OP_FPGA_QUERY_QP = 0x962, @@ -296,6 +295,7 @@ enum { MLX5_CMD_OP_DESTROY_UCTX = 0xa06, MLX5_CMD_OP_CREATE_UMEM = 0xa08, MLX5_CMD_OP_DESTROY_UMEM = 0xa0a, + MLX5_CMD_OP_SYNC_STEERING = 0xb00, MLX5_CMD_OP_MAX }; @@ -487,7 +487,7 @@ union mlx5_ifc_gre_key_bits { struct mlx5_ifc_fte_match_set_misc_bits { u8 gre_c_present[0x1]; - u8 reserved_auto1[0x1]; + u8 reserved_at_1[0x1]; u8 gre_k_present[0x1]; u8 gre_s_present[0x1]; u8 source_vhca_port[0x4]; @@ -5054,50 +5054,50 @@ struct mlx5_ifc_query_hca_cap_in_bits { struct mlx5_ifc_other_hca_cap_bits { u8 roce[0x1]; - u8 reserved_0[0x27f]; + u8 reserved_at_1[0x27f]; }; struct mlx5_ifc_query_other_hca_cap_out_bits { u8 status[0x8]; - u8 reserved_0[0x18]; + u8 reserved_at_8[0x18]; u8 syndrome[0x20]; - u8 reserved_1[0x40]; + u8 reserved_at_40[0x40]; struct mlx5_ifc_other_hca_cap_bits other_capability; }; struct mlx5_ifc_query_other_hca_cap_in_bits { u8 opcode[0x10]; - u8 reserved_0[0x10]; + u8 reserved_at_10[0x10]; - u8 reserved_1[0x10]; + u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_2[0x10]; + u8 reserved_at_40[0x10]; u8 function_id[0x10]; - u8 reserved_3[0x20]; + u8 reserved_at_60[0x20]; }; struct mlx5_ifc_modify_other_hca_cap_out_bits { u8 status[0x8]; - u8 reserved_0[0x18]; + u8 reserved_at_8[0x18]; u8 syndrome[0x20]; - u8 reserved_1[0x40]; + u8 reserved_at_40[0x40]; }; struct mlx5_ifc_modify_other_hca_cap_in_bits { u8 opcode[0x10]; - u8 reserved_0[0x10]; + u8 reserved_at_10[0x10]; - u8 reserved_1[0x10]; + u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_2[0x10]; + u8 reserved_at_40[0x10]; u8 function_id[0x10]; u8 field_select[0x20]; From cc5fd15fc5578e5a3da13f6b14b42ebef5ad42c2 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Sep 2019 11:38:20 +0300 Subject: [PATCH 0860/2880] net/mlx5: DR, Remove redundant vport number from action The vport number is part of the vport_cap, there is no reason to store in a separate variable on the vport. Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality") Signed-off-by: Alex Vesker Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 7d81a7735de5..b74b7d0f6590 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -615,7 +615,7 @@ static int dr_action_handle_cs_recalc(struct mlx5dr_domain *dmn, * that recalculates the CS and forwards to the vport. */ ret = mlx5dr_domain_cache_get_recalc_cs_ft_addr(dest_action->vport.dmn, - dest_action->vport.num, + dest_action->vport.caps->num, final_icm_addr); if (ret) { mlx5dr_err(dmn, "Failed to get FW cs recalc flow table\n"); @@ -744,7 +744,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, dest_action = action; if (rx_rule) { /* Loopback on WIRE vport is not supported */ - if (action->vport.num == WIRE_PORT) + if (action->vport.caps->num == WIRE_PORT) goto out_invalid_arg; attr.final_icm_addr = action->vport.caps->icm_address_rx; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index a37ee6359be2..78f899fb3305 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -745,7 +745,6 @@ struct mlx5dr_action { struct { struct mlx5dr_domain *dmn; struct mlx5dr_cmd_vport_cap *caps; - u32 num; } vport; struct { u32 vlan_hdr; /* tpid_pcp_dei_vid */ From 48cbde4bd2c7c028a7205cb83386bb345c315adc Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 19 Sep 2019 11:24:19 +0300 Subject: [PATCH 0861/2880] net/mlx5: DR, Fix getting incorrect prev node in ste_free When we free an STE and the STE is in the middle of collision list, the prev_ste was obtained incorrectly from the list. To avoid such issues list_entry calls replaced with standard list API. Fixes: 26d688e33f88 ("net/mlx5: DR, Add Steering entry (STE) utilities") Signed-off-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_matcher.c | 10 ++++------ .../ethernet/mellanox/mlx5/core/steering/dr_rule.c | 2 +- .../ethernet/mellanox/mlx5/core/steering/dr_ste.c | 14 +++++--------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index 01008cd66f75..9c2c25356dd0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -458,13 +458,11 @@ static int dr_matcher_add_to_tbl(struct mlx5dr_matcher *matcher) prev_matcher = NULL; if (next_matcher && !first) - prev_matcher = list_entry(next_matcher->matcher_list.prev, - struct mlx5dr_matcher, - matcher_list); + prev_matcher = list_prev_entry(next_matcher, matcher_list); else if (!first) - prev_matcher = list_entry(tbl->matcher_list.prev, - struct mlx5dr_matcher, - matcher_list); + prev_matcher = list_last_entry(&tbl->matcher_list, + struct mlx5dr_matcher, + matcher_list); if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB || dmn->type == MLX5DR_DOMAIN_TYPE_NIC_RX) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c index 3bc3f66b8fa8..4187f2b112b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -18,7 +18,7 @@ static int dr_rule_append_to_miss_list(struct mlx5dr_ste *new_last_ste, struct mlx5dr_ste *last_ste; /* The new entry will be inserted after the last */ - last_ste = list_entry(miss_list->prev, struct mlx5dr_ste, miss_list_node); + last_ste = list_last_entry(miss_list, struct mlx5dr_ste, miss_list_node); WARN_ON(!last_ste); ste_info_last = kzalloc(sizeof(*ste_info_last), GFP_KERNEL); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index 6b0af64536d8..95b7221f5730 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -429,12 +429,9 @@ static void dr_ste_remove_middle_ste(struct mlx5dr_ste *ste, struct mlx5dr_ste *prev_ste; u64 miss_addr; - prev_ste = list_entry(mlx5dr_ste_get_miss_list(ste)->prev, struct mlx5dr_ste, - miss_list_node); - if (!prev_ste) { - WARN_ON(true); + prev_ste = list_prev_entry(ste, miss_list_node); + if (WARN_ON(!prev_ste)) return; - } miss_addr = mlx5dr_ste_get_miss_addr(ste->hw_ste); mlx5dr_ste_set_miss_addr(prev_ste->hw_ste, miss_addr); @@ -461,8 +458,8 @@ void mlx5dr_ste_free(struct mlx5dr_ste *ste, struct mlx5dr_ste_htbl *stats_tbl; LIST_HEAD(send_ste_list); - first_ste = list_entry(mlx5dr_ste_get_miss_list(ste)->next, - struct mlx5dr_ste, miss_list_node); + first_ste = list_first_entry(mlx5dr_ste_get_miss_list(ste), + struct mlx5dr_ste, miss_list_node); stats_tbl = first_ste->htbl; /* Two options: @@ -479,8 +476,7 @@ void mlx5dr_ste_free(struct mlx5dr_ste *ste, if (last_ste == first_ste) next_ste = NULL; else - next_ste = list_entry(ste->miss_list_node.next, - struct mlx5dr_ste, miss_list_node); + next_ste = list_next_entry(ste, miss_list_node); if (!next_ste) { /* One and only entry in the list */ From 640bdb1fdb4ee42fa469c7842e0fac7b0ada7b9d Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Wed, 18 Sep 2019 12:23:10 +0300 Subject: [PATCH 0862/2880] net/mlx5: DR, Allow matching on vport based on vhca_id In case source_eswitch_owner_vhca_id is given as a match, the source_vport (vhca_id) will be set in case vhca_id_valid. This will allow matching on peer vports, vports that belong to the other pf. Fixes: 26d688e33f88 ("net/mlx5: DR, Add Steering entry (STE) utilities") Signed-off-by: Alaa Hleihel Signed-off-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_matcher.c | 3 +- .../mellanox/mlx5/core/steering/dr_ste.c | 36 ++++++++++++++++--- .../mellanox/mlx5/core/steering/dr_types.h | 6 ++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index 9c2c25356dd0..67dea7698fc9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -230,8 +230,7 @@ static int dr_matcher_set_ste_builders(struct mlx5dr_matcher *matcher, (dmn->type == MLX5DR_DOMAIN_TYPE_FDB || dmn->type == MLX5DR_DOMAIN_TYPE_NIC_RX)) { ret = mlx5dr_ste_build_src_gvmi_qpn(&sb[idx++], &mask, - &dmn->info.caps, - inner, rx); + dmn, inner, rx); if (ret) return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index 95b7221f5730..4efe1b0be4a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -837,6 +837,8 @@ static void dr_ste_copy_mask_misc(char *mask, struct mlx5dr_match_misc *spec) spec->source_sqn = MLX5_GET(fte_match_set_misc, mask, source_sqn); spec->source_port = MLX5_GET(fte_match_set_misc, mask, source_port); + spec->source_eswitch_owner_vhca_id = MLX5_GET(fte_match_set_misc, mask, + source_eswitch_owner_vhca_id); spec->outer_second_prio = MLX5_GET(fte_match_set_misc, mask, outer_second_prio); spec->outer_second_cfi = MLX5_GET(fte_match_set_misc, mask, outer_second_cfi); @@ -2250,11 +2252,18 @@ static int dr_ste_build_src_gvmi_qpn_bit_mask(struct mlx5dr_match_param *value, { struct mlx5dr_match_misc *misc_mask = &value->misc; - if (misc_mask->source_port != 0xffff) + /* Partial misc source_port is not supported */ + if (misc_mask->source_port && misc_mask->source_port != 0xffff) + return -EINVAL; + + /* Partial misc source_eswitch_owner_vhca_id is not supported */ + if (misc_mask->source_eswitch_owner_vhca_id && + misc_mask->source_eswitch_owner_vhca_id != 0xffff) return -EINVAL; DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_gvmi, misc_mask, source_port); DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_qp, misc_mask, source_sqn); + misc_mask->source_eswitch_owner_vhca_id = 0; return 0; } @@ -2266,17 +2275,33 @@ static int dr_ste_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value, struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; struct mlx5dr_match_misc *misc = &value->misc; struct mlx5dr_cmd_vport_cap *vport_cap; + struct mlx5dr_domain *dmn = sb->dmn; + struct mlx5dr_cmd_caps *caps; u8 *tag = hw_ste->tag; DR_STE_SET_TAG(src_gvmi_qp, tag, source_qp, misc, source_sqn); - vport_cap = mlx5dr_get_vport_cap(sb->caps, misc->source_port); + if (sb->vhca_id_valid) { + /* Find port GVMI based on the eswitch_owner_vhca_id */ + if (misc->source_eswitch_owner_vhca_id == dmn->info.caps.gvmi) + caps = &dmn->info.caps; + else if (dmn->peer_dmn && (misc->source_eswitch_owner_vhca_id == + dmn->peer_dmn->info.caps.gvmi)) + caps = &dmn->peer_dmn->info.caps; + else + return -EINVAL; + } else { + caps = &dmn->info.caps; + } + + vport_cap = mlx5dr_get_vport_cap(caps, misc->source_port); if (!vport_cap) return -EINVAL; if (vport_cap->vport_gvmi) MLX5_SET(ste_src_gvmi_qp, tag, source_gvmi, vport_cap->vport_gvmi); + misc->source_eswitch_owner_vhca_id = 0; misc->source_port = 0; return 0; @@ -2284,17 +2309,20 @@ static int dr_ste_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value, int mlx5dr_ste_build_src_gvmi_qpn(struct mlx5dr_ste_build *sb, struct mlx5dr_match_param *mask, - struct mlx5dr_cmd_caps *caps, + struct mlx5dr_domain *dmn, bool inner, bool rx) { int ret; + /* Set vhca_id_valid before we reset source_eswitch_owner_vhca_id */ + sb->vhca_id_valid = mask->misc.source_eswitch_owner_vhca_id; + ret = dr_ste_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask); if (ret) return ret; sb->rx = rx; - sb->caps = caps; + sb->dmn = dmn; sb->inner = inner; sb->lu_type = MLX5DR_STE_LU_TYPE_SRC_GVMI_AND_QP; sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 78f899fb3305..1cb3769d4e3c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -180,6 +180,8 @@ void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size, struct mlx5dr_ste_build { u8 inner:1; u8 rx:1; + u8 vhca_id_valid:1; + struct mlx5dr_domain *dmn; struct mlx5dr_cmd_caps *caps; u8 lu_type; u16 byte_mask; @@ -331,7 +333,7 @@ void mlx5dr_ste_build_register_1(struct mlx5dr_ste_build *sb, bool inner, bool rx); int mlx5dr_ste_build_src_gvmi_qpn(struct mlx5dr_ste_build *sb, struct mlx5dr_match_param *mask, - struct mlx5dr_cmd_caps *caps, + struct mlx5dr_domain *dmn, bool inner, bool rx); void mlx5dr_ste_build_empty_always_hit(struct mlx5dr_ste_build *sb, bool rx); @@ -453,7 +455,7 @@ struct mlx5dr_match_misc { u32 gre_c_present:1; /* Source port.;0xffff determines wire port */ u32 source_port:16; - u32 reserved_auto2:16; + u32 source_eswitch_owner_vhca_id:16; /* VLAN ID of first VLAN tag the inner header of the incoming packet. * Valid only when inner_second_cvlan_tag ==1 or inner_second_svlan_tag ==1 */ From d19a79ee38c8fda6d297e4227e80db8bf51c71a6 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Mon, 26 Aug 2019 16:34:12 -0500 Subject: [PATCH 0863/2880] net/mlx5: Add device ID of upcoming BlueField-2 Add the device ID of upcoming BlueField-2 integrated ConnectX-6 Dx network controller. Its VFs will be using the generic VF device ID: 0x101e "ConnectX Family mlx5Gen Virtual Function". Fixes: 2e9d3e83ab82 ("net/mlx5: Update the list of the PCI supported devices") Signed-off-by: Bodong Wang Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 9648c2297803..e47dd7c1b909 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1568,6 +1568,7 @@ static const struct pci_device_id mlx5_core_pci_table[] = { { PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF}, /* ConnectX Family mlx5Gen Virtual Function */ { PCI_VDEVICE(MELLANOX, 0xa2d2) }, /* BlueField integrated ConnectX-5 network controller */ { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF}, /* BlueField integrated ConnectX-5 network controller VF */ + { PCI_VDEVICE(MELLANOX, 0xa2d6) }, /* BlueField-2 integrated ConnectX-6 Dx network controller */ { 0, } }; From d22fcc806b84b9818de08b32e494f3c05dd236c7 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 11 Sep 2019 07:50:13 -0700 Subject: [PATCH 0864/2880] net/mlx5e: Fix traffic duplication in ethtool steering Before this patch, when adding multiple ethtool steering rules with identical classification, the driver used to append the new destination to the already existing hw rule, which caused the hw to forward the traffic to all destinations (rx queues). Here we avoid this by setting the "no append" mlx5 fs core flag when adding a new ethtool rule. Fixes: 6dc6071cfcde ("net/mlx5e: Add ethtool flow steering support") Signed-off-by: Saeed Mahameed Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index eed7101e8bb7..acd946f2ddbe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -399,10 +399,10 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv, struct mlx5_flow_table *ft, struct ethtool_rx_flow_spec *fs) { + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND }; struct mlx5_flow_destination *dst = NULL; - struct mlx5_flow_act flow_act = {0}; - struct mlx5_flow_spec *spec; struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; int err = 0; spec = kvzalloc(sizeof(*spec), GFP_KERNEL); From fe1587a7de94912ed75ba5ddbfabf0741f9f8239 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Fri, 13 Sep 2019 10:42:21 +0000 Subject: [PATCH 0865/2880] net/mlx5e: Fix matching on tunnel addresses type In mlx5 parse_tunnel_attr() function dispatch on encap IP address type is performed by directly checking flow_rule_match_key() on FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, and then on FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS. However, since those are stored in union, first check is always true if any type of encap address is set, which leads to IPv6 tunnel encap address being parsed as IPv4 by mlx5. Determine correct IP address type by checking control key first and if it set, take address type from match.key->addr_type. Fixes: d1bda7eecd88 ("net/mlx5e: Allow matching only enc_key_id/enc_dst_port for decapsulation action") Signed-off-by: Dmytro Linkin Reviewed-by: Vlad Buslov Reviewed-by: Eli Britstein Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 85 +++++++++++-------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index da7555fdb4d5..3e78a727f3e6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1664,46 +1664,63 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, return err; } - if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { - struct flow_match_ipv4_addrs match; + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + struct flow_match_control match; + u16 addr_type; - flow_rule_match_enc_ipv4_addrs(rule, &match); - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - src_ipv4_src_ipv6.ipv4_layout.ipv4, - ntohl(match.mask->src)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - src_ipv4_src_ipv6.ipv4_layout.ipv4, - ntohl(match.key->src)); + flow_rule_match_enc_control(rule, &match); + addr_type = match.key->addr_type; - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4, - ntohl(match.mask->dst)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4, - ntohl(match.key->dst)); + /* For tunnel addr_type used same key id`s as for non-tunnel */ + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP); - } else if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { - struct flow_match_ipv6_addrs match; + flow_rule_match_enc_ipv4_addrs(rule, &match); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4, + ntohl(match.mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4, + ntohl(match.key->src)); - flow_rule_match_enc_ipv6_addrs(rule, &match); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - &match.mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - &match.key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4, + ntohl(match.mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4, + ntohl(match.key->dst)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - &match.mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - &match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, + ethertype); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + ETH_P_IP); + } else if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IPV6); + flow_rule_match_enc_ipv6_addrs(rule, &match); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, + ethertype); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + ETH_P_IPV6); + } } if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IP)) { From 0b15e02f0cc4fb34a9160de7ba6db3a4013dc1b7 Mon Sep 17 00:00:00 2001 From: Filippo Sironi Date: Tue, 10 Sep 2019 19:49:21 +0200 Subject: [PATCH 0866/2880] iommu/amd: Wait for completion of IOTLB flush in attach_device To make sure the domain tlb flush completes before the function returns, explicitly wait for its completion. Signed-off-by: Filippo Sironi Fixes: 42a49f965a8d ("amd-iommu: flush domain tlb when attaching a new device") [joro: Added commit message and fixes tag] Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index fda9923542c9..7bdce3b10f3d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2212,6 +2212,8 @@ skip_ats_check: */ domain_flush_tlb_pde(domain); + domain_flush_complete(domain); + return ret; } From 1211ee61b4a8e60d6dc77211cdcf01906915bfba Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 20 Sep 2019 15:05:22 +0200 Subject: [PATCH 0867/2880] powerpc/pseries: Read TLB Block Invalidate Characteristics The PAPR document specifies the TLB Block Invalidate Characteristics which tells for each pair of segment base page size, actual page size, the size of the block the hcall H_BLOCK_REMOVE supports. These characteristics are loaded at boot time in a new table hblkr_size. The table is separate from the mmu_psize_def because this is specific to the pseries platform. A new init function, pseries_lpar_read_hblkrm_characteristics() is added to read the characteristics. It is called from pSeries_setup_arch(). Fixes: ba2dd8a26baa ("powerpc/pseries/mm: call H_BLOCK_REMOVE") Signed-off-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190920130523.20441-2-ldufour@linux.ibm.com --- arch/powerpc/platforms/pseries/lpar.c | 140 +++++++++++++++++++++++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/setup.c | 1 + 3 files changed, 142 insertions(+) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 36b846f6e74e..8e3ac7ae4e11 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -56,6 +56,15 @@ EXPORT_SYMBOL(plpar_hcall); EXPORT_SYMBOL(plpar_hcall9); EXPORT_SYMBOL(plpar_hcall_norets); +/* + * H_BLOCK_REMOVE supported block size for this page size in segment who's base + * page size is that page size. + * + * The first index is the segment base page size, the second one is the actual + * page size. + */ +static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static u8 dtl_mask = DTL_LOG_PREEMPT; #else @@ -1311,6 +1320,137 @@ static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch, (void)call_block_remove(pix, param, true); } +/* + * TLB Block Invalidate Characteristics + * + * These characteristics define the size of the block the hcall H_BLOCK_REMOVE + * is able to process for each couple segment base page size, actual page size. + * + * The ibm,get-system-parameter properties is returning a buffer with the + * following layout: + * + * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ] + * ----------------- + * TLB Block Invalidate Specifiers: + * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ] + * [ 1 byte Number of page sizes (N) that are supported for the specified + * TLB invalidate block size ] + * [ 1 byte Encoded segment base page size and actual page size + * MSB=0 means 4k segment base page size and actual page size + * MSB=1 the penc value in mmu_psize_def ] + * ... + * ----------------- + * Next TLB Block Invalidate Specifiers... + * ----------------- + * [ 0 ] + */ +static inline void set_hblkrm_bloc_size(int bpsize, int psize, + unsigned int block_size) +{ + if (block_size > hblkrm_size[bpsize][psize]) + hblkrm_size[bpsize][psize] = block_size; +} + +/* + * Decode the Encoded segment base page size and actual page size. + * PAPR specifies: + * - bit 7 is the L bit + * - bits 0-5 are the penc value + * If the L bit is 0, this means 4K segment base page size and actual page size + * otherwise the penc value should be read. + */ +#define HBLKRM_L_MASK 0x80 +#define HBLKRM_PENC_MASK 0x3f +static inline void __init check_lp_set_hblkrm(unsigned int lp, + unsigned int block_size) +{ + unsigned int bpsize, psize; + + /* First, check the L bit, if not set, this means 4K */ + if ((lp & HBLKRM_L_MASK) == 0) { + set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size); + return; + } + + lp &= HBLKRM_PENC_MASK; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) { + struct mmu_psize_def *def = &mmu_psize_defs[bpsize]; + + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + if (def->penc[psize] == lp) { + set_hblkrm_bloc_size(bpsize, psize, block_size); + return; + } + } + } +} + +#define SPLPAR_TLB_BIC_TOKEN 50 + +/* + * The size of the TLB Block Invalidate Characteristics is variable. But at the + * maximum it will be the number of possible page sizes *2 + 10 bytes. + * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size + * (128 bytes) for the buffer to get plenty of space. + */ +#define SPLPAR_TLB_BIC_MAXLENGTH 128 + +void __init pseries_lpar_read_hblkrm_characteristics(void) +{ + unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH]; + int call_status, len, idx, bpsize; + + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + SPLPAR_TLB_BIC_TOKEN, + __pa(rtas_data_buf), + RTAS_DATA_BUF_SIZE); + memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH); + local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0'; + spin_unlock(&rtas_data_buf_lock); + + if (call_status != 0) { + pr_warn("%s %s Error calling get-system-parameter (0x%x)\n", + __FILE__, __func__, call_status); + return; + } + + /* + * The first two (2) bytes of the data in the buffer are the length of + * the returned data, not counting these first two (2) bytes. + */ + len = be16_to_cpu(*((u16 *)local_buffer)) + 2; + if (len > SPLPAR_TLB_BIC_MAXLENGTH) { + pr_warn("%s too large returned buffer %d", __func__, len); + return; + } + + idx = 2; + while (idx < len) { + u8 block_shift = local_buffer[idx++]; + u32 block_size; + unsigned int npsize; + + if (!block_shift) + break; + + block_size = 1 << block_shift; + + for (npsize = local_buffer[idx++]; + npsize > 0 && idx < len; npsize--) + check_lp_set_hblkrm((unsigned int) local_buffer[idx++], + block_size); + } + + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (idx = 0; idx < MMU_PAGE_COUNT; idx++) + if (hblkrm_size[bpsize][idx]) + pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d", + bpsize, idx, hblkrm_size[bpsize][idx]); +} + /* * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie * lock. diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index a6624d4bd9d0..13fa370a87e4 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -112,5 +112,6 @@ static inline unsigned long cmo_get_page_size(void) int dlpar_workqueue_init(void); void pseries_setup_rfi_flush(void); +void pseries_lpar_read_hblkrm_characteristics(void); #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index f8adcd0e4589..0a40201f315f 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -744,6 +744,7 @@ static void __init pSeries_setup_arch(void) pseries_setup_rfi_flush(); setup_stf_barrier(); + pseries_lpar_read_hblkrm_characteristics(); /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); From 59545ebe331917afcb1ebbd6f3e5dc3ed51beb05 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 20 Sep 2019 15:05:23 +0200 Subject: [PATCH 0868/2880] powerpc/pseries: Call H_BLOCK_REMOVE when supported Depending on the hardware and the hypervisor, the hcall H_BLOCK_REMOVE may not be able to process all the page sizes for a segment base page size, as reported by the TLB Invalidate Characteristics. For each pair of base segment page size and actual page size, this characteristic tells us the size of the block the hcall supports. In the case, the hcall is not supporting a pair of base segment page size, actual page size, it is returning H_PARAM which leads to a panic like this: kernel BUG at /home/srikar/work/linux.git/arch/powerpc/platforms/pseries/lpar.c:466! Oops: Exception in kernel mode, sig: 5 [#1] BE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 28 PID: 583 Comm: modprobe Not tainted 5.2.0-master #5 NIP: c0000000000be8dc LR: c0000000000be880 CTR: 0000000000000000 REGS: c0000007e77fb130 TRAP: 0700 Not tainted (5.2.0-master) MSR: 8000000000029032 CR: 42224824 XER: 20000000 CFAR: c0000000000be8fc IRQMASK: 0 GPR00: 0000000022224828 c0000007e77fb3c0 c000000001434d00 0000000000000005 GPR04: 9000000004fa8c00 0000000000000000 0000000000000003 0000000000000001 GPR08: c0000007e77fb450 0000000000000000 0000000000000001 ffffffffffffffff GPR12: c0000007e77fb450 c00000000edfcb80 0000cd7d3ea30000 c0000000016022b0 GPR16: 00000000000000b0 0000cd7d3ea30000 0000000000000001 c080001f04f00105 GPR20: 0000000000000003 0000000000000004 c000000fbeb05f58 c000000001602200 GPR24: 0000000000000000 0000000000000004 8800000000000000 c000000000c5d148 GPR28: c000000000000000 8000000000000000 a000000000000000 c0000007e77fb580 NIP [c0000000000be8dc] .call_block_remove+0x12c/0x220 LR [c0000000000be880] .call_block_remove+0xd0/0x220 Call Trace: 0xc000000fb8c00240 (unreliable) .pSeries_lpar_flush_hash_range+0x578/0x670 .flush_hash_range+0x44/0x100 .__flush_tlb_pending+0x3c/0xc0 .zap_pte_range+0x7ec/0x830 .unmap_page_range+0x3f4/0x540 .unmap_vmas+0x94/0x120 .exit_mmap+0xac/0x1f0 .mmput+0x9c/0x1f0 .do_exit+0x388/0xd60 .do_group_exit+0x54/0x100 .__se_sys_exit_group+0x14/0x20 system_call+0x5c/0x70 Instruction dump: 39400001 38a00000 4800003c 60000000 60420000 7fa9e800 38e00000 419e0014 7d29d278 7d290074 7929d182 69270001 <0b070000> 7d495378 394a0001 7fa93040 The call to H_BLOCK_REMOVE should only be made for the supported pair of base segment page size, actual page size and using the correct maximum block size. Due to the required complexity in do_block_remove() and call_block_remove(), and the fact that currently a block size of 8 is returned by the hypervisor, we are only supporting 8 size block to the H_BLOCK_REMOVE hcall. In order to identify this limitation easily in the code, a local define HBLKR_SUPPORTED_SIZE defining the currently supported block size, and a dedicated checking helper is_supported_hlbkr() are introduced. For regular pages and hugetlb, the assumption is made that the page size is equal to the base page size. For THP the page size is assumed to be 16M. Fixes: ba2dd8a26baa ("powerpc/pseries/mm: call H_BLOCK_REMOVE") Signed-off-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190920130523.20441-3-ldufour@linux.ibm.com --- arch/powerpc/platforms/pseries/lpar.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 8e3ac7ae4e11..b53359258d99 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -65,6 +65,13 @@ EXPORT_SYMBOL(plpar_hcall_norets); */ static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init; +/* + * Due to the involved complexity, and that the current hypervisor is only + * returning this value or 0, we are limiting the support of the H_BLOCK_REMOVE + * buffer size to 8 size block. + */ +#define HBLKRM_SUPPORTED_BLOCK_SIZE 8 + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static u8 dtl_mask = DTL_LOG_PREEMPT; #else @@ -993,6 +1000,17 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, #define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL #define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL +/* + * Returned true if we are supporting this block size for the specified segment + * base page size and actual page size. + * + * Currently, we only support 8 size block. + */ +static inline bool is_supported_hlbkrm(int bpsize, int psize) +{ + return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE); +} + /** * H_BLOCK_REMOVE caller. * @idx should point to the latest @param entry set with a PTEX. @@ -1152,7 +1170,8 @@ static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, if (lock_tlbie) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); - if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) + /* Assuming THP size is 16M */ + if (is_supported_hlbkrm(psize, MMU_PAGE_16M)) hugepage_block_invalidate(slot, vpn, count, psize, ssize); else hugepage_bulk_invalidate(slot, vpn, count, psize, ssize); @@ -1470,7 +1489,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) if (lock_tlbie) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); - if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) { + if (is_supported_hlbkrm(batch->psize, batch->psize)) { do_block_remove(number, batch, param); goto out; } From 677733e296b5c7a37c47da391fc70a43dc40bd67 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:22:51 +0530 Subject: [PATCH 0869/2880] powerpc/book3s64/mm: Don't do tlbie fixup for some hardware revisions The store ordering vs tlbie issue mentioned in commit a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9") is fixed for Nimbus 2.3 and Cumulus 1.3 revisions. We don't need to apply the fixup if we are running on them We can only do this on PowerNV. On pseries guest with KVM we still don't support redoing the feature fixup after migration. So we should be enabling all the workarounds needed, because whe can possibly migrate between DD 2.3 and DD 2.2 Fixes: a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190924035254.24612-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/kernel/dt_cpu_ftrs.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index bceee2fde885..5fc1b527de46 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -692,9 +692,35 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) return true; } +/* + * Handle POWER9 broadcast tlbie invalidation issue using + * cpu feature flag. + */ +static __init void update_tlbie_feature_flag(unsigned long pvr) +{ + if (PVR_VER(pvr) == PVR_POWER9) { + /* + * Set the tlbie feature flag for anything below + * Nimbus DD 2.3 and Cumulus DD 1.3 + */ + if ((pvr & 0xe000) == 0) { + /* Nimbus */ + if ((pvr & 0xfff) < 0x203) + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + } else if ((pvr & 0xc000) == 0) { + /* Cumulus */ + if ((pvr & 0xfff) < 0x103) + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + } else { + WARN_ONCE(1, "Unknown PVR"); + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + } + } +} + static __init void cpufeatures_cpu_quirks(void) { - int version = mfspr(SPRN_PVR); + unsigned long version = mfspr(SPRN_PVR); /* * Not all quirks can be derived from the cpufeatures device tree. @@ -713,10 +739,10 @@ static __init void cpufeatures_cpu_quirks(void) if ((version & 0xffff0000) == 0x004e0000) { cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); - cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR; } + update_tlbie_feature_flag(version); /* * PKEY was not in the initial base or feature node * specification, but it should become optional in the next From 09ce98cacd51fcd0fa0af2f79d1e1d3192f4cbb0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:22:52 +0530 Subject: [PATCH 0870/2880] powerpc/book3s64/radix: Rename CPU_FTR_P9_TLBIE_BUG feature flag Rename the #define to indicate this is related to store vs tlbie ordering issue. In the next patch, we will be adding another feature flag that is used to handles ERAT flush vs tlbie ordering issue. Fixes: a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190924035254.24612-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/cputable.h | 4 ++-- arch/powerpc/kernel/dt_cpu_ftrs.c | 6 +++--- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- arch/powerpc/mm/book3s64/hash_native.c | 2 +- arch/powerpc/mm/book3s64/radix_tlb.c | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index a1ebcbc3931f..f080fba48619 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -209,7 +209,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_POWER9_DD2_1 LONG_ASM_CONST(0x0000080000000000) #define CPU_FTR_P9_TM_HV_ASSIST LONG_ASM_CONST(0x0000100000000000) #define CPU_FTR_P9_TM_XER_SO_BUG LONG_ASM_CONST(0x0000200000000000) -#define CPU_FTR_P9_TLBIE_BUG LONG_ASM_CONST(0x0000400000000000) +#define CPU_FTR_P9_TLBIE_STQ_BUG LONG_ASM_CONST(0x0000400000000000) #define CPU_FTR_P9_TIDR LONG_ASM_CONST(0x0000800000000000) #ifndef __ASSEMBLY__ @@ -457,7 +457,7 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \ - CPU_FTR_P9_TLBIE_BUG | CPU_FTR_P9_TIDR) + CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TIDR) #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9 #define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1) #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \ diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 5fc1b527de46..a86486390c70 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -706,14 +706,14 @@ static __init void update_tlbie_feature_flag(unsigned long pvr) if ((pvr & 0xe000) == 0) { /* Nimbus */ if ((pvr & 0xfff) < 0x203) - cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; } else if ((pvr & 0xc000) == 0) { /* Cumulus */ if ((pvr & 0xfff) < 0x103) - cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; } else { WARN_ONCE(1, "Unknown PVR"); - cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; } } } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 7186c65c61c9..cfdf232aac34 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -451,7 +451,7 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, "r" (rbvalues[i]), "r" (kvm->arch.lpid)); } - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { /* * Need the extra ptesync to make sure we don't * re-order the tlbie diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 90ab4f31e2b3..02568dae4695 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -199,7 +199,7 @@ static inline unsigned long ___tlbie(unsigned long vpn, int psize, static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) { - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { /* Need the extra ptesync to ensure we don't reorder tlbie*/ asm volatile("ptesync": : :"memory"); ___tlbie(vpn, psize, apsize, ssize); diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 631be42abd33..69fdc004d83f 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -201,7 +201,7 @@ static inline void fixup_tlbie(void) unsigned long pid = 0; unsigned long va = ((1UL << 52) - 1); - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } @@ -211,7 +211,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid) { unsigned long va = ((1UL << 52) - 1); - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } From 047e6575aec71d75b765c22111820c4776cd1c43 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:22:53 +0530 Subject: [PATCH 0871/2880] powerpc/mm: Fixup tlbie vs mtpidr/mtlpidr ordering issue on POWER9 On POWER9, under some circumstances, a broadcast TLB invalidation will fail to invalidate the ERAT cache on some threads when there are parallel mtpidr/mtlpidr happening on other threads of the same core. This can cause stores to continue to go to a page after it's unmapped. The workaround is to force an ERAT flush using PID=0 or LPID=0 tlbie flush. This additional TLB flush will cause the ERAT cache invalidation. Since we are using PID=0 or LPID=0, we don't get filtered out by the TLB snoop filtering logic. We need to still follow this up with another tlbie to take care of store vs tlbie ordering issue explained in commit: a5d4b5891c2f ("powerpc/mm: Fixup tlbie vs store ordering issue on POWER9"). The presence of ERAT cache implies we can still get new stores and they may miss store queue marking flush. Cc: stable@vger.kernel.org Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190924035254.24612-3-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/cputable.h | 3 +- arch/powerpc/kernel/dt_cpu_ftrs.c | 2 + arch/powerpc/kvm/book3s_hv_rm_mmu.c | 42 ++++++++++---- arch/powerpc/mm/book3s64/hash_native.c | 29 +++++++++- arch/powerpc/mm/book3s64/radix_tlb.c | 80 +++++++++++++++++++++++--- 5 files changed, 134 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index f080fba48619..cf00ff0d121d 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -211,6 +211,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_P9_TM_XER_SO_BUG LONG_ASM_CONST(0x0000200000000000) #define CPU_FTR_P9_TLBIE_STQ_BUG LONG_ASM_CONST(0x0000400000000000) #define CPU_FTR_P9_TIDR LONG_ASM_CONST(0x0000800000000000) +#define CPU_FTR_P9_TLBIE_ERAT_BUG LONG_ASM_CONST(0x0001000000000000) #ifndef __ASSEMBLY__ @@ -457,7 +458,7 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \ - CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TIDR) + CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TLBIE_ERAT_BUG | CPU_FTR_P9_TIDR) #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9 #define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1) #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \ diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index a86486390c70..180b3a5d1001 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -715,6 +715,8 @@ static __init void update_tlbie_feature_flag(unsigned long pvr) WARN_ONCE(1, "Unknown PVR"); cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; } + + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_ERAT_BUG; } } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index cfdf232aac34..220305454c23 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -433,6 +433,37 @@ static inline int is_mmio_hpte(unsigned long v, unsigned long r) (HPTE_R_KEY_HI | HPTE_R_KEY_LO)); } +static inline void fixup_tlbie_lpid(unsigned long rb_value, unsigned long lpid) +{ + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + /* Radix flush for a hash guest */ + + unsigned long rb,rs,prs,r,ric; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = 0; /* lpid = 0 */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + ric = 0; /* RIC_FLSUH_TLB */ + + /* + * Need the extra ptesync to make sure we don't + * re-order the tlbie + */ + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), + "i"(ric), "r"(rs) : "memory"); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : + "r" (rb_value), "r" (lpid)); + } +} + static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, long npages, int global, bool need_sync) { @@ -451,16 +482,7 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, "r" (rbvalues[i]), "r" (kvm->arch.lpid)); } - if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { - /* - * Need the extra ptesync to make sure we don't - * re-order the tlbie - */ - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : - "r" (rbvalues[0]), "r" (kvm->arch.lpid)); - } - + fixup_tlbie_lpid(rbvalues[i - 1], kvm->arch.lpid); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } else { if (need_sync) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 02568dae4695..523e42eb11da 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -197,8 +197,31 @@ static inline unsigned long ___tlbie(unsigned long vpn, int psize, return va; } -static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) +static inline void fixup_tlbie_vpn(unsigned long vpn, int psize, + int apsize, int ssize) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + /* Radix flush for a hash guest */ + + unsigned long rb,rs,prs,r,ric; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = 0; /* lpid = 0 */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + ric = 0; /* RIC_FLSUH_TLB */ + + /* + * Need the extra ptesync to make sure we don't + * re-order the tlbie + */ + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), + "i"(ric), "r"(rs) : "memory"); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { /* Need the extra ptesync to ensure we don't reorder tlbie*/ asm volatile("ptesync": : :"memory"); @@ -283,7 +306,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize, asm volatile("ptesync": : :"memory"); } else { __tlbie(vpn, psize, apsize, ssize); - fixup_tlbie(vpn, psize, apsize, ssize); + fixup_tlbie_vpn(vpn, psize, apsize, ssize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } if (lock_tlbie && !use_local) @@ -856,7 +879,7 @@ static void native_flush_hash_range(unsigned long number, int local) /* * Just do one more with the last used values. */ - fixup_tlbie(vpn, psize, psize, ssize); + fixup_tlbie_vpn(vpn, psize, psize, ssize); asm volatile("eieio; tlbsync; ptesync":::"memory"); if (lock_tlbie) diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 69fdc004d83f..67af871190c6 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -196,21 +196,82 @@ static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static inline void fixup_tlbie(void) + +static inline void fixup_tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap) { - unsigned long pid = 0; + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, 0, ap, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, RIC_FLUSH_TLB); + } +} + +static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_pid(0, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, RIC_FLUSH_TLB); + } +} + +static inline void fixup_tlbie_pid(unsigned long pid) +{ + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. + */ unsigned long va = ((1UL << 52) - 1); + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_pid(0, RIC_FLUSH_TLB); + } + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } } + +static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB); + } +} + static inline void fixup_tlbie_lpid(unsigned long lpid) { + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. + */ unsigned long va = ((1UL << 52) - 1); + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid(0, RIC_FLUSH_TLB); + } + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); @@ -258,6 +319,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) switch (ric) { case RIC_FLUSH_TLB: __tlbie_pid(pid, RIC_FLUSH_TLB); + fixup_tlbie_pid(pid); break; case RIC_FLUSH_PWC: __tlbie_pid(pid, RIC_FLUSH_PWC); @@ -265,8 +327,8 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) case RIC_FLUSH_ALL: default: __tlbie_pid(pid, RIC_FLUSH_ALL); + fixup_tlbie_pid(pid); } - fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -315,6 +377,7 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) switch (ric) { case RIC_FLUSH_TLB: __tlbie_lpid(lpid, RIC_FLUSH_TLB); + fixup_tlbie_lpid(lpid); break; case RIC_FLUSH_PWC: __tlbie_lpid(lpid, RIC_FLUSH_PWC); @@ -322,8 +385,8 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) case RIC_FLUSH_ALL: default: __tlbie_lpid(lpid, RIC_FLUSH_ALL); + fixup_tlbie_lpid(lpid); } - fixup_tlbie_lpid(lpid); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -390,6 +453,8 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end, for (addr = start; addr < end; addr += page_size) __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + + fixup_tlbie_va_range(addr - page_size, pid, ap); } static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, @@ -399,7 +464,7 @@ static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, ap, ric); - fixup_tlbie(); + fixup_tlbie_va(va, pid, ap); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -457,7 +522,7 @@ static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, asm volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, ap, ric); - fixup_tlbie_lpid(lpid); + fixup_tlbie_lpid_va(va, lpid, ap); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -469,7 +534,6 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, if (also_pwc) __tlbie_pid(pid, RIC_FLUSH_PWC); __tlbie_va_range(start, end, pid, page_size, psize); - fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -856,7 +920,7 @@ is_local: if (gflush) __tlbie_va_range(gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G); - fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); } else { _tlbiel_va_range_multicast(mm, From e2ada66ec41821e54a503776c1ce42fbf4e99fbe Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 21 Aug 2019 11:20:04 -0700 Subject: [PATCH 0872/2880] kvm: x86: Add Intel PMU MSRs to msrs_to_save[] These MSRs should be enumerated by KVM_GET_MSR_INDEX_LIST, so that userspace knows that these MSRs may be part of the vCPU state. Signed-off-by: Jim Mattson Reviewed-by: Eric Hankland Reviewed-by: Peter Shier Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dfd641243568..bb93771e4170 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1145,6 +1145,42 @@ static u32 msrs_to_save[] = { MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, + MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, + MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, + MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, + MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, + MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, + MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, + MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, + MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, + MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, + MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, + MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, + MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, + MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, }; static unsigned num_msrs_to_save; @@ -5051,6 +5087,11 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i, j; + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, + "Please update the fixed PMCs in msrs_to_save[]"); + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, + "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; From 7a83247e010a2881ee246a88596af0549ef3e6c4 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 22 Aug 2019 22:30:19 +0800 Subject: [PATCH 0873/2880] x86/Hyper-V: Fix definition of struct hv_vp_assist_page The struct hv_vp_assist_page was defined incorrectly. The "vtl_control" should be u64[3], "nested_enlightenments _control" should be a u64 and there are 7 reserved bytes following "enlighten_vmentry". Fix the definition. Signed-off-by: Tianyu Lan Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 7a2705694f5b..3f08327744d1 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -524,14 +524,24 @@ struct hv_timer_message_payload { __u64 delivery_time; /* When the message was delivered */ } __packed; +struct hv_nested_enlightenments_control { + struct { + __u32 directhypercall:1; + __u32 reserved:31; + } features; + struct { + __u32 reserved; + } hypercallControls; +} __packed; + /* Define virtual processor assist page structure. */ struct hv_vp_assist_page { __u32 apic_assist; - __u32 reserved; - __u64 vtl_control[2]; - __u64 nested_enlightenments_control[2]; - __u32 enlighten_vmentry; - __u32 padding; + __u32 reserved1; + __u64 vtl_control[3]; + struct hv_nested_enlightenments_control nested_control; + __u8 enlighten_vmentry; + __u8 reserved2[7]; __u64 current_nested_vmcs; } __packed; From 344c6c804703841d2bff4d68d7390ba726053874 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 22 Aug 2019 22:30:20 +0800 Subject: [PATCH 0874/2880] KVM/Hyper-V: Add new KVM capability KVM_CAP_HYPERV_DIRECT_TLBFLUSH Hyper-V direct tlb flush function should be enabled for guest that only uses Hyper-V hypercall. User space hypervisor(e.g, Qemu) can disable KVM identification in CPUID and just exposes Hyper-V identification to make sure the precondition. Add new KVM capability KVM_CAP_ HYPERV_DIRECT_TLBFLUSH for user space to enable Hyper-V direct tlb function and this function is default to be disabled in KVM. Signed-off-by: Tianyu Lan Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.txt | 13 +++++++++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 8 ++++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 23 insertions(+) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index 136f1eef3712..4833904d32a5 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -5309,3 +5309,16 @@ Architectures: x86 This capability indicates that KVM supports paravirtualized Hyper-V IPI send hypercalls: HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. +8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH + +Architecture: x86 + +This capability indicates that KVM running on top of Hyper-V hypervisor +enables Direct TLB flush for its guests meaning that TLB flush +hypercalls are handled by Level 0 hypervisor (Hyper-V) bypassing KVM. +Due to the different ABI for hypercall parameters between Hyper-V and +KVM, enabling this capability effectively disables all hypercall +handling by KVM (as some KVM hypercall may be mistakenly treated as TLB +flush hypercalls by Hyper-V) so userspace should disable KVM identification +in CPUID and only exposes Hyper-V identification. In this case, guest +thinks it's running on Hyper-V and only use Hyper-V hypercalls. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a3a3ec73fa2f..4765ae0fc506 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1213,6 +1213,7 @@ struct kvm_x86_ops { bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); + int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bb93771e4170..5becc6753280 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3282,6 +3282,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_x86_ops->get_nested_state ? kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; break; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + r = kvm_x86_ops->enable_direct_tlbflush ? 1 : 0; + break; default: break; } @@ -4055,6 +4058,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = -EFAULT; } return r; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + if (!kvm_x86_ops->enable_direct_tlbflush) + return -ENOTTY; + + return kvm_x86_ops->enable_direct_tlbflush(vcpu); default: return -EINVAL; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 233efbb1c81c..c73aead0c0a8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -999,6 +999,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_PTRAUTH_GENERIC 172 #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 +#define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 #ifdef KVM_CAP_IRQ_ROUTING From 6f6a657c99980485c265363447b269083dd1dc3a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 22 Aug 2019 22:30:21 +0800 Subject: [PATCH 0875/2880] KVM/Hyper-V/VMX: Add direct tlb flush support Hyper-V provides direct tlb flush function which helps L1 Hypervisor to handle Hyper-V tlb flush request from L2 guest. Add the function support for VMX. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Tianyu Lan Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 4 +++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/vmx/evmcs.h | 2 ++ arch/x86/kvm/vmx/vmx.c | 39 ++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 3f08327744d1..038581e91a84 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -181,6 +181,7 @@ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ +#define HV_X64_NESTED_DIRECT_FLUSH BIT(17) #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) @@ -892,4 +893,7 @@ struct hv_tlb_flush_ex { u64 gva_list[]; } __packed; +struct hv_partition_assist_pg { + u32 tlb_lock_count; +}; #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4765ae0fc506..13b35f94dec4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -844,6 +844,8 @@ struct kvm_hv { /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; + + struct hv_partition_assist_pg *hv_pa_pg; }; enum kvm_irqchip_mode { diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 39a24eec8884..07ebf6882a45 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -178,6 +178,8 @@ static inline void evmcs_load(u64 phys_addr) struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(smp_processor_id()); + if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) + vp_ap->nested_control.features.directhypercall = 1; vp_ap->current_nested_vmcs = phys_addr; vp_ap->enlighten_vmentry = 1; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4a99be1fae4e..8bab3e53d1e1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -486,6 +486,35 @@ static int hv_remote_flush_tlb(struct kvm *kvm) return hv_remote_flush_tlb_with_range(kvm, NULL); } +static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +{ + struct hv_enlightened_vmcs *evmcs; + struct hv_partition_assist_pg **p_hv_pa_pg = + &vcpu->kvm->arch.hyperv.hv_pa_pg; + /* + * Synthetic VM-Exit is not enabled in current code and so All + * evmcs in singe VM shares same assist page. + */ + if (!*p_hv_pa_pg) { + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!*p_hv_pa_pg) + return -ENOMEM; + pr_debug("KVM: Hyper-V: allocated PA_PG for %llx\n", + (u64)&vcpu->kvm); + } + + evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; + + evmcs->partition_assist_page = + __pa(*p_hv_pa_pg); + evmcs->hv_vm_id = (u64)vcpu->kvm; + evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; + + pr_debug("KVM: Hyper-V: enabled DIRECT flush for %llx\n", + (u64)vcpu->kvm); + return 0; +} + #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* @@ -6511,6 +6540,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + if (static_branch_unlikely(&enable_evmcs)) + current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index; + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vmx->host_debugctlmsr) update_debugctlmsr(vmx->host_debugctlmsr); @@ -6578,6 +6610,7 @@ static struct kvm *vmx_vm_alloc(void) static void vmx_vm_free(struct kvm *kvm) { + kfree(kvm->arch.hyperv.hv_pa_pg); vfree(to_kvm_vmx(kvm)); } @@ -7837,6 +7870,7 @@ static void vmx_exit(void) if (!vp_ap) continue; + vp_ap->nested_control.features.directhypercall = 0; vp_ap->current_nested_vmcs = 0; vp_ap->enlighten_vmentry = 0; } @@ -7876,6 +7910,11 @@ static int __init vmx_init(void) pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); static_branch_enable(&enable_evmcs); } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) + vmx_x86_ops.enable_direct_tlbflush + = hv_enable_direct_tlbflush; + } else { enlightened_vmcs = false; } From 956e255c59a5b64d1b3f16fdf2db306d74829e33 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Aug 2019 09:59:04 +0200 Subject: [PATCH 0876/2880] KVM: x86: svm: remove unneeded nested_enable_evmcs() hook Since commit 5158917c7b019 ("KVM: x86: nVMX: Allow nested_enable_evmcs to be NULL") the code in x86.c is prepared to see nested_enable_evmcs being NULL and in VMX case it actually is when nesting is disabled. Remove the unneeded stub from SVM code. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 04fe21849b6e..7e352e81df31 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7099,13 +7099,6 @@ failed: return ret; } -static int nested_enable_evmcs(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version) -{ - /* Intel-only feature */ - return -ENODEV; -} - static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) { unsigned long cr4 = kvm_read_cr4(vcpu); @@ -7311,7 +7304,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .nested_enable_evmcs = nested_enable_evmcs, + .nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, From 5a0165f6dde37fd16e85c86eccb895fd4f93305f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Aug 2019 09:59:05 +0200 Subject: [PATCH 0877/2880] KVM: x86: announce KVM_CAP_HYPERV_ENLIGHTENED_VMCS support only when it is available It was discovered that after commit 65efa61dc0d5 ("selftests: kvm: provide common function to enable eVMCS") hyperv_cpuid selftest is failing on AMD. The reason is that the commit changed _vcpu_ioctl() to vcpu_ioctl() in the test and this one can't fail. Instead of fixing the test is seems to make more sense to not announce KVM_CAP_HYPERV_ENLIGHTENED_VMCS support if it is definitely missing (on svm and in case kvm_intel.nested=0). Signed-off-by: Vitaly Kuznetsov Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5becc6753280..67af887f0b9b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3205,7 +3205,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_EVENTFD: case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: - case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: case KVM_CAP_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: @@ -3283,7 +3282,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - r = kvm_x86_ops->enable_direct_tlbflush ? 1 : 0; + r = kvm_x86_ops->enable_direct_tlbflush != NULL; + break; + case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: + r = kvm_x86_ops->nested_enable_evmcs != NULL; break; default: break; From 12c386b2308344f2ce8819ad11aab466166f276d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Aug 2019 09:36:16 +0800 Subject: [PATCH 0878/2880] KVM: selftests: Move vm type into _vm_create() internally Rather than passing the vm type from the top level to the end of vm creation, let's simply keep that as an internal of kvm_vm struct and decide the type in _vm_create(). Several reasons for doing this: - The vm type is only decided by physical address width and currently only used in aarch64, so we've got enough information as long as we're passing vm_guest_mode into _vm_create(), - This removes a loop dependency between the vm->type and creation of vms. That's why now we need to parse vm_guest_mode twice sometimes, once in run_test() and then again in _vm_create(). The follow up patches will move on to clean up that as well so we can have a single place to decide guest machine types and so. Note that this patch will slightly change the behavior of aarch64 tests in that previously most vm_create() callers will directly pass in type==0 into _vm_create() but now the type will depend on vm_guest_mode, however it shouldn't affect any user because all vm_create() users of aarch64 will be using VM_MODE_DEFAULT guest mode (which is VM_MODE_P40V48_4K) so at last type will still be zero. Reviewed-by: Andrew Jones Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 13 +++--------- .../testing/selftests/kvm/include/kvm_util.h | 3 +-- tools/testing/selftests/kvm/lib/kvm_util.c | 21 ++++++++++++------- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index dc3346e090f5..6737b26b975e 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -249,14 +249,12 @@ static void vm_dirty_log_verify(unsigned long *bmap) } static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, - uint64_t extra_mem_pages, void *guest_code, - unsigned long type) + uint64_t extra_mem_pages, void *guest_code) { struct kvm_vm *vm; uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; - vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, - O_RDWR, type); + vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); kvm_vm_elf_load(vm, program_invocation_name, 0, 0); #ifdef __x86_64__ vm_create_irqchip(vm); @@ -273,7 +271,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, struct kvm_vm *vm; uint64_t max_gfn; unsigned long *bmap; - unsigned long type = 0; switch (mode) { case VM_MODE_P52V48_4K: @@ -314,10 +311,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, * bits we can change to 39. */ guest_pa_bits = 39; -#endif -#ifdef __aarch64__ - if (guest_pa_bits != 40) - type = KVM_VM_TYPE_ARM_IPA_SIZE(guest_pa_bits); #endif max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1; guest_page_size = (1ul << guest_page_shift); @@ -351,7 +344,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, bmap = bitmap_alloc(host_num_pages); host_bmap_track = bitmap_alloc(host_num_pages); - vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code, type); + vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code); #ifdef USE_CLEAR_DIRTY_LOG struct kvm_enable_cap cap = {}; diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 5463b7896a0a..b2c15135f596 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -60,8 +60,7 @@ int kvm_check_cap(long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); -struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, - int perm, unsigned long type); +struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp, int perm); void kvm_vm_release(struct kvm_vm *vmp); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 6e49bb039376..34a8a6572c7c 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -84,7 +84,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) return ret; } -static void vm_open(struct kvm_vm *vm, int perm, unsigned long type) +static void vm_open(struct kvm_vm *vm, int perm) { vm->kvm_fd = open(KVM_DEV_PATH, perm); if (vm->kvm_fd < 0) @@ -95,7 +95,7 @@ static void vm_open(struct kvm_vm *vm, int perm, unsigned long type) exit(KSFT_SKIP); } - vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, type); + vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type); TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " "rc: %i errno: %i", vm->fd, errno); } @@ -130,8 +130,7 @@ _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, * descriptor to control the created VM is created with the permissions * given by perm (e.g. O_RDWR). */ -struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, - int perm, unsigned long type) +struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) { struct kvm_vm *vm; @@ -139,8 +138,7 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, TEST_ASSERT(vm != NULL, "Insufficient Memory"); vm->mode = mode; - vm->type = type; - vm_open(vm, perm, type); + vm->type = 0; /* Setup mode specific traits. */ switch (vm->mode) { @@ -190,6 +188,13 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode); } +#ifdef __aarch64__ + if (vm->pa_bits != 40) + vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits); +#endif + + vm_open(vm, perm); + /* Limit to VA-bit canonical virtual addresses. */ vm->vpages_valid = sparsebit_alloc(); sparsebit_set_num(vm->vpages_valid, @@ -212,7 +217,7 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) { - return _vm_create(mode, phy_pages, perm, 0); + return _vm_create(mode, phy_pages, perm); } /* @@ -232,7 +237,7 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm) { struct userspace_mem_region *region; - vm_open(vmp, perm, vmp->type); + vm_open(vmp, perm); if (vmp->has_irqchip) vm_create_irqchip(vmp); From 338eb29876b9e571273175f167fcd58d9441ac8e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Aug 2019 09:36:17 +0800 Subject: [PATCH 0879/2880] KVM: selftests: Create VM earlier for dirty log test Since we've just removed the dependency of vm type in previous patch, now we can create the vm much earlier. Note that to move it earlier we used an approximation of number of extra pages but it should be fine. This prepares for the follow up patches to finally remove the duplication of guest mode parsings. Reviewed-by: Andrew Jones Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 6737b26b975e..cf2099abb121 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -263,6 +263,9 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, return vm; } +#define DIRTY_MEM_BITS 30 /* 1G */ +#define PAGE_SHIFT_4K 12 + static void run_test(enum vm_guest_mode mode, unsigned long iterations, unsigned long interval, uint64_t phys_offset) { @@ -272,6 +275,18 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, uint64_t max_gfn; unsigned long *bmap; + /* + * We reserve page table for 2 times of extra dirty mem which + * will definitely cover the original (1G+) test range. Here + * we do the calculation with 4K page size which is the + * smallest so the page number will be enough for all archs + * (e.g., 64K page size guest will need even less memory for + * page tables). + */ + vm = create_vm(mode, VCPU_ID, + 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), + guest_code); + switch (mode) { case VM_MODE_P52V48_4K: guest_pa_bits = 52; @@ -318,7 +333,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, * A little more than 1G of guest page sized pages. Cover the * case where the size is not aligned to 64 pages. */ - guest_num_pages = (1ul << (30 - guest_page_shift)) + 16; + guest_num_pages = (1ul << (DIRTY_MEM_BITS - guest_page_shift)) + 16; #ifdef __s390x__ /* Round up to multiple of 1M (segment size) */ guest_num_pages = (guest_num_pages + 0xff) & ~0xffUL; @@ -344,8 +359,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, bmap = bitmap_alloc(host_num_pages); host_bmap_track = bitmap_alloc(host_num_pages); - vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code); - #ifdef USE_CLEAR_DIRTY_LOG struct kvm_enable_cap cap = {}; From 567a9f1e9deb273a2c02dd18c254208537fcefaa Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Aug 2019 09:36:18 +0800 Subject: [PATCH 0880/2880] KVM: selftests: Introduce VM_MODE_PXXV48_4K The naming VM_MODE_P52V48_4K is explicit but unclear when used on x86_64 machines, because x86_64 machines are having various physical address width rather than some static values. Here's some examples: - Intel Xeon E3-1220: 36 bits - Intel Core i7-8650: 39 bits - AMD EPYC 7251: 48 bits All of them are using 48 bits linear address width but with totally different physical address width (and most of the old machines should be less than 52 bits). Let's create a new guest mode called VM_MODE_PXXV48_4K for current x86_64 tests and make it as the default to replace the old naming of VM_MODE_P52V48_4K because it shows more clearly that the PA width is not really a constant. Meanwhile we also stop assuming all the x86 machines are having 52 bits PA width but instead we fetch the real vm->pa_bits from CPUID 0x80000008 during runtime. We currently make this exclusively used by x86_64 but no other arch. As a slight touch up, moving DEBUG macro from dirty_log_test.c to kvm_util.h so lib can use it too. Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 5 ++-- .../testing/selftests/kvm/include/kvm_util.h | 11 ++++++- .../selftests/kvm/include/x86_64/processor.h | 3 ++ .../selftests/kvm/lib/aarch64/processor.c | 3 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 29 ++++++++++++++---- .../selftests/kvm/lib/x86_64/processor.c | 30 ++++++++++++++++--- 6 files changed, 67 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index cf2099abb121..12c36fa356a8 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -19,8 +19,6 @@ #include "kvm_util.h" #include "processor.h" -#define DEBUG printf - #define VCPU_ID 1 /* The memory slot index to track dirty pages */ @@ -289,6 +287,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, switch (mode) { case VM_MODE_P52V48_4K: + case VM_MODE_PXXV48_4K: guest_pa_bits = 52; guest_page_shift = 12; break; @@ -488,7 +487,7 @@ int main(int argc, char *argv[]) #endif #ifdef __x86_64__ - vm_guest_mode_params_init(VM_MODE_P52V48_4K, true, true); + vm_guest_mode_params_init(VM_MODE_PXXV48_4K, true, true); #endif #ifdef __aarch64__ vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true); diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index b2c15135f596..ef3259c5e856 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -24,6 +24,12 @@ struct kvm_vm; typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ +#ifndef NDEBUG +#define DEBUG(...) printf(__VA_ARGS__); +#else +#define DEBUG(...) +#endif + /* Minimum allocated guest virtual and physical addresses */ #define KVM_UTIL_MIN_VADDR 0x2000 @@ -38,11 +44,14 @@ enum vm_guest_mode { VM_MODE_P48V48_64K, VM_MODE_P40V48_4K, VM_MODE_P40V48_64K, + VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ NUM_VM_MODES, }; -#ifdef __aarch64__ +#if defined(__aarch64__) #define VM_MODE_DEFAULT VM_MODE_P40V48_4K +#elif defined(__x86_64__) +#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K #else #define VM_MODE_DEFAULT VM_MODE_P52V48_4K #endif diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 80d19740d2dc..0c17f2ee685e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -325,6 +325,9 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, uint64_t msr_value); +uint32_t kvm_get_cpuid_max(void); +void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); + /* * Basic CPU control in CR0 */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 486400a97374..86036a59a668 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -264,6 +264,9 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini case VM_MODE_P52V48_4K: TEST_ASSERT(false, "AArch64 does not support 4K sized pages " "with 52-bit physical address ranges"); + case VM_MODE_PXXV48_4K: + TEST_ASSERT(false, "AArch64 does not support 4K sized pages " + "with ANY-bit physical address ranges"); case VM_MODE_P52V48_64K: tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 34a8a6572c7c..bb8f993b25fb 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -8,6 +8,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kvm_util_internal.h" +#include "processor.h" #include #include @@ -101,12 +102,13 @@ static void vm_open(struct kvm_vm *vm, int perm) } const char * const vm_guest_mode_string[] = { - "PA-bits:52, VA-bits:48, 4K pages", - "PA-bits:52, VA-bits:48, 64K pages", - "PA-bits:48, VA-bits:48, 4K pages", - "PA-bits:48, VA-bits:48, 64K pages", - "PA-bits:40, VA-bits:48, 4K pages", - "PA-bits:40, VA-bits:48, 64K pages", + "PA-bits:52, VA-bits:48, 4K pages", + "PA-bits:52, VA-bits:48, 64K pages", + "PA-bits:48, VA-bits:48, 4K pages", + "PA-bits:48, VA-bits:48, 64K pages", + "PA-bits:40, VA-bits:48, 4K pages", + "PA-bits:40, VA-bits:48, 64K pages", + "PA-bits:ANY, VA-bits:48, 4K pages", }; _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, "Missing new mode strings?"); @@ -184,6 +186,21 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) vm->page_size = 0x10000; vm->page_shift = 16; break; + case VM_MODE_PXXV48_4K: +#ifdef __x86_64__ + kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); + TEST_ASSERT(vm->va_bits == 48, "Linear address width " + "(%d bits) not supported", vm->va_bits); + vm->pgtable_levels = 4; + vm->page_size = 0x1000; + vm->page_shift = 12; + DEBUG("Guest physical address width detected: %d\n", + vm->pa_bits); +#else + TEST_ASSERT(false, "VM_MODE_PXXV48_4K not supported on " + "non-x86 platforms"); +#endif + break; default: TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 0a5e487dbc50..c53dbc6bc568 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -228,7 +228,7 @@ void sregs_dump(FILE *stream, struct kvm_sregs *sregs, void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) { - TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use " + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); /* If needed, create page map l4 table. */ @@ -261,7 +261,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint16_t index[4]; struct pageMapL4Entry *pml4e; - TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use " + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); TEST_ASSERT((vaddr % vm->page_size) == 0, @@ -547,7 +547,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) struct pageDirectoryEntry *pde; struct pageTableEntry *pte; - TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use " + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); index[0] = (gva >> 12) & 0x1ffu; @@ -621,7 +621,7 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); switch (vm->mode) { - case VM_MODE_P52V48_4K: + case VM_MODE_PXXV48_4K: sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); @@ -1157,3 +1157,25 @@ bool is_intel_cpu(void) chunk = (const uint32_t *)("GenuineIntel"); return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); } + +uint32_t kvm_get_cpuid_max(void) +{ + return kvm_get_supported_cpuid_entry(0x80000000)->eax; +} + +void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) +{ + struct kvm_cpuid_entry2 *entry; + bool pae; + + /* SDM 4.1.4 */ + if (kvm_get_cpuid_max() < 0x80000008) { + pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6); + *pa_bits = pae ? 36 : 32; + *va_bits = 32; + } else { + entry = kvm_get_supported_cpuid_entry(0x80000008); + *pa_bits = entry->eax & 0xff; + *va_bits = (entry->eax >> 8) & 0xff; + } +} From 52200d0d944e473142271773c41f5f490f3a821f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Aug 2019 09:36:19 +0800 Subject: [PATCH 0881/2880] KVM: selftests: Remove duplicate guest mode handling Remove the duplication code in run_test() of dirty_log_test because after some reordering of functions now we can directly use the outcome of vm_create(). Meanwhile, with the new VM_MODE_PXXV48_4K, we can safely revert b442324b58 too where we stick the x86_64 PA width to 39 bits for dirty_log_test. Reviewed-by: Andrew Jones Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 52 ++----------------- .../testing/selftests/kvm/include/kvm_util.h | 4 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 17 ++++++ 3 files changed, 26 insertions(+), 47 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 12c36fa356a8..5614222a6628 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -267,10 +267,8 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, static void run_test(enum vm_guest_mode mode, unsigned long iterations, unsigned long interval, uint64_t phys_offset) { - unsigned int guest_pa_bits, guest_page_shift; pthread_t vcpu_thread; struct kvm_vm *vm; - uint64_t max_gfn; unsigned long *bmap; /* @@ -285,54 +283,13 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), guest_code); - switch (mode) { - case VM_MODE_P52V48_4K: - case VM_MODE_PXXV48_4K: - guest_pa_bits = 52; - guest_page_shift = 12; - break; - case VM_MODE_P52V48_64K: - guest_pa_bits = 52; - guest_page_shift = 16; - break; - case VM_MODE_P48V48_4K: - guest_pa_bits = 48; - guest_page_shift = 12; - break; - case VM_MODE_P48V48_64K: - guest_pa_bits = 48; - guest_page_shift = 16; - break; - case VM_MODE_P40V48_4K: - guest_pa_bits = 40; - guest_page_shift = 12; - break; - case VM_MODE_P40V48_64K: - guest_pa_bits = 40; - guest_page_shift = 16; - break; - default: - TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode); - } - - DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - -#ifdef __x86_64__ - /* - * FIXME - * The x86_64 kvm selftests framework currently only supports a - * single PML4 which restricts the number of physical address - * bits we can change to 39. - */ - guest_pa_bits = 39; -#endif - max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1; - guest_page_size = (1ul << guest_page_shift); + guest_page_size = vm_get_page_size(vm); /* * A little more than 1G of guest page sized pages. Cover the * case where the size is not aligned to 64 pages. */ - guest_num_pages = (1ul << (DIRTY_MEM_BITS - guest_page_shift)) + 16; + guest_num_pages = (1ul << (DIRTY_MEM_BITS - + vm_get_page_shift(vm))) + 16; #ifdef __s390x__ /* Round up to multiple of 1M (segment size) */ guest_num_pages = (guest_num_pages + 0xff) & ~0xffUL; @@ -342,7 +299,8 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, !!((guest_num_pages * guest_page_size) % host_page_size); if (!phys_offset) { - guest_test_phys_mem = (max_gfn - guest_num_pages) * guest_page_size; + guest_test_phys_mem = (vm_get_max_gfn(vm) - + guest_num_pages) * guest_page_size; guest_test_phys_mem &= ~(host_page_size - 1); } else { guest_test_phys_mem = phys_offset; diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index ef3259c5e856..29cccaf96baf 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -154,6 +154,10 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); bool vm_is_unrestricted_guest(struct kvm_vm *vm); +unsigned int vm_get_page_size(struct kvm_vm *vm); +unsigned int vm_get_page_shift(struct kvm_vm *vm); +unsigned int vm_get_max_gfn(struct kvm_vm *vm); + struct kvm_userspace_memory_region * kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index bb8f993b25fb..80a338b5403c 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -136,6 +136,8 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) { struct kvm_vm *vm; + DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); @@ -1650,3 +1652,18 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm) return val == 'Y'; } + +unsigned int vm_get_page_size(struct kvm_vm *vm) +{ + return vm->page_size; +} + +unsigned int vm_get_page_shift(struct kvm_vm *vm) +{ + return vm->page_shift; +} + +unsigned int vm_get_max_gfn(struct kvm_vm *vm) +{ + return vm->max_gfn; +} From 319109a2d0dde8672323efcf909740c2c6e98be4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Sep 2019 12:19:52 -0700 Subject: [PATCH 0882/2880] KVM: x86: Manually flush collapsible SPTEs only when toggling flags Zapping collapsible sptes, a.k.a. 4k sptes that can be promoted into a large page, is only necessary when changing only the dirty logging flag of a memory region. If the memslot is also being moved, then all sptes for the memslot are zapped when it is invalidated. When a memslot is being created, it is impossible for there to be existing dirty mappings, e.g. KVM can have MMIO sptes, but not present, and thus dirty, sptes. Note, the comment and logic are shamelessly borrowed from MIPS's version of kvm_arch_commit_memory_region(). Fixes: 3ea3b7fa9af06 ("kvm: mmu: lazy collapse small sptes into large sptes") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67af887f0b9b..7e51924c4a1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9741,8 +9741,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * Scan sptes if dirty logging has been stopped, dropping those * which can be collapsed into a single large-page spte. Later * page faults will create the large-page sptes. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() */ - if ((change != KVM_MR_DELETE) && + if (change == KVM_MR_FLAGS_ONLY && (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); From a073d7e3ad687a7ef32b65affe80faa7ce89bf92 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 16 Sep 2019 15:42:32 +0800 Subject: [PATCH 0883/2880] KVM: hyperv: Fix Direct Synthetic timers assert an interrupt w/o lapic_in_kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN RIP: 0010:__apic_accept_irq+0x46/0x740 arch/x86/kvm/lapic.c:1029 Call Trace: kvm_apic_set_irq+0xb4/0x140 arch/x86/kvm/lapic.c:558 stimer_notify_direct arch/x86/kvm/hyperv.c:648 [inline] stimer_expiration arch/x86/kvm/hyperv.c:659 [inline] kvm_hv_process_stimers+0x594/0x1650 arch/x86/kvm/hyperv.c:686 vcpu_enter_guest+0x2b2a/0x54b0 arch/x86/kvm/x86.c:7896 vcpu_run+0x393/0xd40 arch/x86/kvm/x86.c:8152 kvm_arch_vcpu_ioctl_run+0x636/0x900 arch/x86/kvm/x86.c:8360 kvm_vcpu_ioctl+0x6cf/0xaf0 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2765 The testcase programs HV_X64_MSR_STIMERn_CONFIG/HV_X64_MSR_STIMERn_COUNT, in addition, there is no lapic in the kernel, the counters value are small enough in order that kvm_hv_process_stimers() inject this already-expired timer interrupt into the guest through lapic in the kernel which triggers the NULL deferencing. This patch fixes it by don't advertise direct mode synthetic timers and discarding the inject when lapic is not in kernel. syzkaller source: https://syzkaller.appspot.com/x/repro.c?x=1752fe0a600000 Reported-by: syzbot+dff25ee91f0c7d5c1695@syzkaller.appspotmail.com Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index fff790a3f4ee..c0867b0aae3e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -645,7 +645,9 @@ static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) .vector = stimer->config.apic_vector }; - return !kvm_apic_set_irq(vcpu, &irq, NULL); + if (lapic_in_kernel(vcpu)) + return !kvm_apic_set_irq(vcpu, &irq, NULL); + return 0; } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) @@ -1852,7 +1854,13 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; - ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + + /* + * Direct Synthetic timers only make sense with in-kernel + * LAPIC + */ + if (lapic_in_kernel(vcpu)) + ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; From e1572f1d08be57a5412a464cff0712a23cd0b73e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 16 Sep 2019 18:22:56 +0200 Subject: [PATCH 0884/2880] cpu/SMT: create and export cpu_smt_possible() KVM needs to know if SMT is theoretically possible, this means it is supported and not forcefully disabled ('nosmt=force'). Create and export cpu_smt_possible() answering this question. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- include/linux/cpu.h | 2 ++ kernel/cpu.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 88dc0c653925..d0633ebdaa9c 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -201,12 +201,14 @@ enum cpuhp_smt_control { extern enum cpuhp_smt_control cpu_smt_control; extern void cpu_smt_disable(bool force); extern void cpu_smt_check_topology(void); +extern bool cpu_smt_possible(void); extern int cpuhp_smt_enable(void); extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); #else # define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) static inline void cpu_smt_disable(bool force) { } static inline void cpu_smt_check_topology(void) { } +static inline bool cpu_smt_possible(void) { return false; } static inline int cpuhp_smt_enable(void) { return 0; } static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index e1967e9eddc2..fc28e17940e0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -392,8 +392,7 @@ enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; void __init cpu_smt_disable(bool force) { - if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || - cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + if (!cpu_smt_possible()) return; if (force) { @@ -438,6 +437,14 @@ static inline bool cpu_smt_allowed(unsigned int cpu) */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } + +/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */ +bool cpu_smt_possible(void) +{ + return cpu_smt_control != CPU_SMT_FORCE_DISABLED && + cpu_smt_control != CPU_SMT_NOT_SUPPORTED; +} +EXPORT_SYMBOL_GPL(cpu_smt_possible); #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } #endif From b2d8b167e15bb5ec2691d1119c025630a247f649 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 16 Sep 2019 18:22:57 +0200 Subject: [PATCH 0885/2880] KVM: x86: hyper-v: set NoNonArchitecturalCoreSharing CPUID bit when SMT is impossible Hyper-V 2019 doesn't expose MD_CLEAR CPUID bit to guests when it cannot guarantee that two virtual processors won't end up running on sibling SMT threads without knowing about it. This is done as an optimization as in this case there is nothing the guest can do to protect itself against MDS and issuing additional flush requests is just pointless. On bare metal the topology is known, however, when Hyper-V is running nested (e.g. on top of KVM) it needs an additional piece of information: a confirmation that the exposed topology (wrt vCPU placement on different SMT threads) is trustworthy. NoNonArchitecturalCoreSharing (CPUID 0x40000004 EAX bit 18) is described in TLFS as follows: "Indicates that a virtual processor will never share a physical core with another virtual processor, except for virtual processors that are reported as sibling SMT threads." From KVM we can give such guarantee in two cases: - SMT is unsupported or forcefully disabled (just 'disabled' doesn't work as it can become re-enabled during the lifetime of the guest). - vCPUs are properly pinned so the scheduler won't put them on sibling SMT threads (when they're not reported as such). This patch reports NoNonArchitecturalCoreSharing bit in to userspace in the first case. The second case is outside of KVM's domain of responsibility (as vCPU pinning is actually done by someone who manages KVM's userspace - e.g. libvirt pinning QEMU threads). Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 7 +++++++ arch/x86/kvm/hyperv.c | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 038581e91a84..7741e211f7f5 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -180,6 +180,13 @@ /* Recommend using enlightened VMCS */ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) +/* + * Virtual processor will never share a physical core with another virtual + * processor, except for virtual processors that are reported as sibling SMT + * threads. + */ +#define HV_X64_NO_NONARCH_CORESHARING BIT(18) + /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ #define HV_X64_NESTED_DIRECT_FLUSH BIT(17) #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c0867b0aae3e..23ff65504d7e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -23,6 +23,7 @@ #include "ioapic.h" #include "hyperv.h" +#include #include #include #include @@ -1872,7 +1873,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; - + if (!cpu_smt_possible()) + ent->eax |= HV_X64_NO_NONARCH_CORESHARING; /* * Default number of spinlock retry attempts, matches * HyperV 2016. From e738772e29214019f506372dd1162723b7a4d507 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 16 Sep 2019 18:22:58 +0200 Subject: [PATCH 0886/2880] KVM: selftests: hyperv_cpuid: add check for NoNonArchitecturalCoreSharing bit The bit is supposed to be '1' when SMT is not supported or forcefully disabled and '0' otherwise. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/hyperv_cpuid.c | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index ee59831fbc98..443a2b54645b 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -26,6 +26,25 @@ static void guest_code(void) { } +static int smt_possible(void) +{ + char buf[16]; + FILE *f; + bool res = 1; + + f = fopen("/sys/devices/system/cpu/smt/control", "r"); + if (f) { + if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) { + if (!strncmp(buf, "forceoff", 8) || + !strncmp(buf, "notsupported", 12)) + res = 0; + } + fclose(f); + } + + return res; +} + static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, int evmcs_enabled) { @@ -59,6 +78,14 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, TEST_ASSERT(!entry->padding[0] && !entry->padding[1] && !entry->padding[2], "padding should be zero"); + if (entry->function == 0x40000004) { + int nononarchcs = !!(entry->eax & (1UL << 18)); + + TEST_ASSERT(nononarchcs == !smt_possible(), + "NoNonArchitecturalCoreSharing bit" + " doesn't reflect SMT setting"); + } + /* * If needed for debug: * fprintf(stdout, From 5845038c111db27902bc220a4f70070fe945871c Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Fri, 9 Aug 2019 12:26:19 -0700 Subject: [PATCH 0887/2880] KVM: nVMX: Check Host Address Space Size on vmentry of nested guests According to section "Checks Related to Address-Space Size" in Intel SDM vol 3C, the following checks are performed on vmentry of nested guests: If the logical processor is outside IA-32e mode (if IA32_EFER.LMA = 0) at the time of VM entry, the following must hold: - The "IA-32e mode guest" VM-entry control is 0. - The "host address-space size" VM-exit control is 0. If the logical processor is in IA-32e mode (if IA32_EFER.LMA = 1) at the time of VM entry, the "host address-space size" VM-exit control must be 1. If the "host address-space size" VM-exit control is 0, the following must hold: - The "IA-32e mode guest" VM-entry control is 0. - Bit 17 of the CR4 field (corresponding to CR4.PCIDE) is 0. - Bits 63:32 in the RIP field are 0. If the "host address-space size" VM-exit control is 1, the following must hold: - Bit 5 of the CR4 field (corresponding to CR4.PAE) is 1. - The RIP field contains a canonical address. On processors that do not support Intel 64 architecture, checks are performed to ensure that the "IA-32e mode guest" VM-entry control and the "host address-space size" VM-exit control are both 0. Signed-off-by: Krish Sadhukhan Reviewed-by: Karl Heubaum Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a10cd351940..75ed0a63abbe 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2664,6 +2664,34 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu))) return -EINVAL; + + if (!(vmcs12->host_ia32_efer & EFER_LMA) && + ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE))) { + return -EINVAL; + } + + if ((vmcs12->host_ia32_efer & EFER_LMA) && + !(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) { + return -EINVAL; + } + + if (!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) && + ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + (vmcs12->host_cr4 & X86_CR4_PCIDE) || + (((vmcs12->host_rip) >> 32) & 0xffffffff))) { + return -EINVAL; + } + + if ((vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) && + ((!(vmcs12->host_cr4 & X86_CR4_PAE)) || + (is_noncanonical_address(vmcs12->host_rip, vcpu)))) { + return -EINVAL; + } +#else + if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE || + vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + return -EINVAL; #endif /* From bc8a0aafcbb82e5c7dde486c03e6c51e3896d797 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:27 -0700 Subject: [PATCH 0888/2880] KVM: x86: Relocate MMIO exit stats counting Move the stat.mmio_exits update into x86_emulate_instruction(). This is both a bug fix, e.g. the current update flows will incorrectly increment mmio_exits on emulation failure, and a preparatory change to set the stage for eliminating EMULATE_DONE and company. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 -- arch/x86/kvm/vmx/vmx.c | 1 - arch/x86/kvm/x86.c | 2 ++ 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a10af9c87f8a..138dcdef8a06 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5456,8 +5456,6 @@ emulate: case EMULATE_DONE: return 1; case EMULATE_USER_EXIT: - ++vcpu->stat.mmio_exits; - /* fall through */ case EMULATE_FAIL: return 0; default: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8bab3e53d1e1..8f61250fc1c0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5201,7 +5201,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) err = kvm_emulate_instruction(vcpu, 0); if (err == EMULATE_USER_EXIT) { - ++vcpu->stat.mmio_exits; ret = 0; goto out; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7e51924c4a1c..2b92c2ca1d79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6703,6 +6703,8 @@ restart: } r = EMULATE_USER_EXIT; } else if (vcpu->mmio_needed) { + ++vcpu->stat.mmio_exits; + if (!vcpu->mmio_is_write) writeback = false; r = EMULATE_USER_EXIT; From 22da61c9123e574f607c92d8f69165b1ffff6d5f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:28 -0700 Subject: [PATCH 0889/2880] KVM: x86: Clean up handle_emulation_failure() When handling emulation failure, return the emulation result directly instead of capturing it in a local variable. Future patches will move additional cases into handle_emulation_failure(), clean up the cruft before so there isn't an ugly mix of setting a local variable and returning directly. Reviewed-by: Vitaly Kuznetsov Reviewed-by: Liran Alon Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b92c2ca1d79..786d2f88fdf5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6304,24 +6304,22 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { - int r = EMULATE_DONE; - ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); if (emulation_type & EMULTYPE_NO_UD_ON_FAIL) return EMULATE_FAIL; + kvm_queue_exception(vcpu, UD_VECTOR); + if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - r = EMULATE_USER_EXIT; + return EMULATE_USER_EXIT; } - kvm_queue_exception(vcpu, UD_VECTOR); - - return r; + return EMULATE_DONE; } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, From f41f900568d9ffd896cc941db7021eb14bd55910 Mon Sep 17 00:00:00 2001 From: Jussi Laako Date: Tue, 24 Sep 2019 10:11:43 +0300 Subject: [PATCH 0890/2880] ALSA: usb-audio: Add DSD support for EVGA NU Audio EVGA NU Audio is actually a USB audio device on a PCIexpress card, with it's own USB controller. It supports both PCM and DSD. Signed-off-by: Jussi Laako Cc: Link: https://lore.kernel.org/r/20190924071143.30911-1-jussi@sonarnerd.net Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 5fd4ccce452d..fbfde996fee7 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1658,6 +1658,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case 0x25ce: /* Mytek devices */ case 0x278b: /* Rotel? */ case 0x2ab6: /* T+A devices */ + case 0x3842: /* EVGA */ case 0xc502: /* HiBy devices */ if (fp->dsd_raw) return SNDRV_PCM_FMTBIT_DSD_U32_BE; From 120c2c4f99a80ef8682867b41e82d4e5d6b49635 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:29 -0700 Subject: [PATCH 0891/2880] KVM: x86: Refactor kvm_vcpu_do_singlestep() to remove out param Return the single-step emulation result directly instead of via an out param. Presumably at some point in the past kvm_vcpu_do_singlestep() could be called with *r==EMULATE_USER_EXIT, but that is no longer the case, i.e. all callers are happy to overwrite their own return variable. Reviewed-by: Vitaly Kuznetsov Reviewed-by: Liran Alon Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 786d2f88fdf5..e6063f20259e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6474,7 +6474,7 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) +static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; @@ -6483,10 +6483,10 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); + return EMULATE_USER_EXIT; } + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); + return EMULATE_DONE; } int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -6507,7 +6507,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) * that sets the TF flag". */ if (unlikely(rflags & X86_EFLAGS_TF)) - kvm_vcpu_do_singlestep(vcpu, &r); + r = kvm_vcpu_do_singlestep(vcpu); return r == EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); @@ -6720,7 +6720,7 @@ restart: exception_type(ctxt->exception.vector) == EXCPT_TRAP) { kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE && ctxt->tf) - kvm_vcpu_do_singlestep(vcpu, &r); + r = kvm_vcpu_do_singlestep(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } From a6c6ed1e81060e110258dbc7118fda07e9971e48 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:30 -0700 Subject: [PATCH 0892/2880] KVM: x86: Don't attempt VMWare emulation on #GP with non-zero error code The VMware backdoor hooks #GP faults on IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero error code for their #GP. Re-injecting #GP instead of attempting emulation on a non-zero error code will allow a future patch to move #GP injection (for emulation failure) into kvm_emulate_instruction() without having to plumb in the error code. Reviewed-and-tested-by: Vitaly Kuznetsov Reviewed-by: Liran Alon Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 10 +++++++++- arch/x86/kvm/vmx/vmx.c | 12 +++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7e352e81df31..00ec27bc2946 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2772,12 +2772,20 @@ static int gp_interception(struct vcpu_svm *svm) WARN_ON_ONCE(!enable_vmware_backdoor); + /* + * VMware backdoor emulation on #GP interception only handles IN{S}, + * OUT{S}, and RDPMC, none of which generate a non-zero error code. + */ + if (error_code) { + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + return 1; + } er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; else if (er != EMULATE_DONE) - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return 1; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8f61250fc1c0..d1bac3cb2440 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4539,12 +4539,22 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); + + /* + * VMware backdoor emulation on #GP interception only handles + * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero + * error code on #GP. + */ + if (error_code) { + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + return 1; + } er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; else if (er != EMULATE_DONE) - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return 1; } From 42cbf06872cc44fb8d2d6665fa6494b5925eaf1c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:31 -0700 Subject: [PATCH 0893/2880] KVM: x86: Move #GP injection for VMware into x86_emulate_instruction() Immediately inject a #GP when VMware emulation fails and return EMULATE_DONE instead of propagating EMULATE_FAIL up the stack. This helps pave the way for removing EMULATE_FAIL altogether. Rename EMULTYPE_VMWARE to EMULTYPE_VMWARE_GP to document that the x86 emulator is called to handle VMware #GP interception, e.g. why a #GP is injected on emulation failure for EMULTYPE_VMWARE_GP. Drop EMULTYPE_NO_UD_ON_FAIL as a standalone type. The "no #UD on fail" is used only in the VMWare case and is obsoleted by having the emulator itself reinject #GP. Signed-off-by: Sean Christopherson Reviewed-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/svm.c | 10 ++-------- arch/x86/kvm/vmx/vmx.c | 10 ++-------- arch/x86/kvm/x86.c | 14 +++++++++----- 4 files changed, 14 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 13b35f94dec4..80071d02b87d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1325,8 +1325,7 @@ enum emulation_result { #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_ALLOW_RETRY (1 << 3) -#define EMULTYPE_NO_UD_ON_FAIL (1 << 4) -#define EMULTYPE_VMWARE (1 << 5) +#define EMULTYPE_VMWARE_GP (1 << 5) int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 00ec27bc2946..c7aa55e192c0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2768,7 +2768,6 @@ static int gp_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 error_code = svm->vmcb->control.exit_info_1; - int er; WARN_ON_ONCE(!enable_vmware_backdoor); @@ -2780,13 +2779,8 @@ static int gp_interception(struct vcpu_svm *svm) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); return 1; } - er = kvm_emulate_instruction(vcpu, - EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); - if (er == EMULATE_USER_EXIT) - return 0; - else if (er != EMULATE_DONE) - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP) != + EMULATE_USER_EXIT; } static bool is_erratum_383(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d1bac3cb2440..d4371599abe3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4522,7 +4522,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) u32 intr_info, ex_no, error_code; unsigned long cr2, rip, dr6; u32 vect_info; - enum emulation_result er; vect_info = vmx->idt_vectoring_info; intr_info = vmx->exit_intr_info; @@ -4549,13 +4548,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); return 1; } - er = kvm_emulate_instruction(vcpu, - EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); - if (er == EMULATE_USER_EXIT) - return 0; - else if (er != EMULATE_DONE) - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP) != + EMULATE_USER_EXIT; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e6063f20259e..09753b699d81 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6307,8 +6307,10 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - if (emulation_type & EMULTYPE_NO_UD_ON_FAIL) - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_VMWARE_GP) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return EMULATE_DONE; + } kvm_queue_exception(vcpu, UD_VECTOR); @@ -6648,9 +6650,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } } - if ((emulation_type & EMULTYPE_VMWARE) && - !is_vmware_backdoor_opcode(ctxt)) - return EMULATE_FAIL; + if ((emulation_type & EMULTYPE_VMWARE_GP) && + !is_vmware_backdoor_opcode(ctxt)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return EMULATE_DONE; + } if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); From b4000606205959e6cfe1fd3a71c490267ff23506 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:32 -0700 Subject: [PATCH 0894/2880] KVM: x86: Add explicit flag for forced emulation on #UD Add an explicit emulation type for forced #UD emulation and use it to detect that KVM should unconditionally inject a #UD instead of falling into its standard emulation failure handling. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 80071d02b87d..f3c623dbd5ab 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1325,6 +1325,7 @@ enum emulation_result { #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_ALLOW_RETRY (1 << 3) +#define EMULTYPE_TRAP_UD_FORCED (1 << 4) #define EMULTYPE_VMWARE_GP (1 << 5) int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 09753b699d81..eb2ed5c9a584 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5449,7 +5449,7 @@ int handle_ud(struct kvm_vcpu *vcpu) sig, sizeof(sig), &e) == 0 && memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) { kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); - emul_type = 0; + emul_type = EMULTYPE_TRAP_UD_FORCED; } er = kvm_emulate_instruction(vcpu, emul_type); @@ -6629,7 +6629,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; if (r != EMULATION_OK) { - if (emulation_type & EMULTYPE_TRAP_UD) + if ((emulation_type & EMULTYPE_TRAP_UD) || + (emulation_type & EMULTYPE_TRAP_UD_FORCED)) return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) From c83fad65e2cae1aa570a519b33e8ebf00f6e7227 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:33 -0700 Subject: [PATCH 0895/2880] KVM: x86: Move #UD injection for failed emulation into emulation code Immediately inject a #UD and return EMULATE done if emulation fails when handling an intercepted #UD. This helps pave the way for removing EMULATE_FAIL altogether. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eb2ed5c9a584..64d584d48c60 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5440,7 +5440,6 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); int handle_ud(struct kvm_vcpu *vcpu) { int emul_type = EMULTYPE_TRAP_UD; - enum emulation_result er; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; @@ -5452,12 +5451,7 @@ int handle_ud(struct kvm_vcpu *vcpu) emul_type = EMULTYPE_TRAP_UD_FORCED; } - er = kvm_emulate_instruction(vcpu, emul_type); - if (er == EMULATE_USER_EXIT) - return 0; - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + return kvm_emulate_instruction(vcpu, emul_type) != EMULATE_USER_EXIT; } EXPORT_SYMBOL_GPL(handle_ud); @@ -6630,8 +6624,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r != EMULATION_OK) { if ((emulation_type & EMULTYPE_TRAP_UD) || - (emulation_type & EMULTYPE_TRAP_UD_FORCED)) - return EMULATE_FAIL; + (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return EMULATE_DONE; + } if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) return EMULATE_DONE; From 738fece46dc5e1bb1309a827a2b69056143b3d13 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:34 -0700 Subject: [PATCH 0896/2880] KVM: x86: Exit to userspace on emulation skip failure Kill a few birds with one stone by forcing an exit to userspace on skip emulation failure. This removes a reference to EMULATE_FAIL, fixes a bug in handle_ept_misconfig() where it would exit to userspace without setting run->exit_reason, and fixes a theoretical bug in SVM's task_switch_interception() where it would overwrite run->exit_reason on a return of EMULATE_USER_EXIT. Note, this technically doesn't fully fix task_switch_interception() as it now incorrectly handles EMULATE_FAIL, but in practice there is no bug as EMULATE_FAIL will never be returned for EMULTYPE_SKIP. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/x86.c | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c7aa55e192c0..fc40052fa334 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3885,8 +3885,8 @@ static int task_switch_interception(struct vcpu_svm *svm) int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { - if (skip_emulated_instruction(&svm->vcpu) != EMULATE_DONE) - goto fail; + if (skip_emulated_instruction(&svm->vcpu) == EMULATE_USER_EXIT) + return 0; } if (int_type != SVM_EXITINTINFO_TYPE_SOFT) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 64d584d48c60..802dfb926ca7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6306,6 +6306,13 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return EMULATE_DONE; } + if (emulation_type & EMULTYPE_SKIP) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return EMULATE_USER_EXIT; + } + kvm_queue_exception(vcpu, UD_VECTOR); if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { @@ -6641,8 +6648,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, inject_emulated_exception(vcpu); return EMULATE_DONE; } - if (emulation_type & EMULTYPE_SKIP) - return EMULATE_FAIL; return handle_emulation_failure(vcpu, emulation_type); } } From 1051778f6e1e25c1203869586f4431785a35c13c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:35 -0700 Subject: [PATCH 0897/2880] KVM: x86: Handle emulation failure directly in kvm_task_switch() Consolidate the reporting of emulation failure into kvm_task_switch() so that it can return EMULATE_USER_EXIT. This helps pave the way for removing EMULATE_FAIL altogether. This also fixes a theoretical bug where task switch interception could suppress an EMULATE_USER_EXIT return. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 11 ++--------- arch/x86/kvm/vmx/vmx.c | 14 +++----------- arch/x86/kvm/x86.c | 9 ++++++--- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fc40052fa334..69e53aa3969d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3892,17 +3892,10 @@ static int task_switch_interception(struct vcpu_svm *svm) if (int_type != SVM_EXITINTINFO_TYPE_SOFT) int_vec = -1; - if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, - has_error_code, error_code) == EMULATE_FAIL) - goto fail; - return 1; -fail: - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - svm->vcpu.run->internal.ndata = 0; - return 0; + return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, + has_error_code, error_code) != EMULATE_USER_EXIT; } static int cpuid_interception(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d4371599abe3..498270cfc71d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5074,21 +5074,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (kvm_task_switch(vcpu, tss_selector, - type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, - has_error_code, error_code) == EMULATE_FAIL) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } - /* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6? */ - - return 1; + return kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, + reason, has_error_code, error_code) != EMULATE_USER_EXIT; } static int handle_ept_violation(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 802dfb926ca7..bdee7e39accb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8693,9 +8693,12 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - - if (ret) - return EMULATE_FAIL; + if (ret) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return EMULATE_USER_EXIT; + } kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); From 9497e1f2ec93f0c8a7832c3c1eb3af8159800cd8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:36 -0700 Subject: [PATCH 0898/2880] KVM: x86: Move triple fault request into RM int injection Request triple fault in kvm_inject_realmode_interrupt() instead of returning EMULATE_FAIL and deferring to the caller. All existing callers request triple fault and it's highly unlikely Real Mode is going to acquire new features. While this consolidates a small amount of code, the real goal is to remove the last reference to EMULATE_FAIL. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 9 +++------ arch/x86/kvm/x86.c | 17 ++++++++--------- arch/x86/kvm/x86.h | 2 +- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 498270cfc71d..67733eef184c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1556,8 +1556,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) int inc_eip = 0; if (kvm_exception_is_soft(nr)) inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_inject_realmode_interrupt(vcpu, nr, inc_eip); return; } @@ -4306,8 +4305,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) int inc_eip = 0; if (vcpu->arch.interrupt.soft) inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); return; } intr = irq | INTR_INFO_VALID_MASK; @@ -4343,8 +4341,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); return; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bdee7e39accb..92b6690d0512 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6273,7 +6273,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) +void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; @@ -6285,14 +6285,13 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) ctxt->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); - if (ret != X86EMUL_CONTINUE) - return EMULATE_FAIL; - - ctxt->eip = ctxt->_eip; - kvm_rip_write(vcpu, ctxt->eip); - kvm_set_rflags(vcpu, ctxt->eflags); - - return EMULATE_DONE; + if (ret != X86EMUL_CONTINUE) { + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } else { + ctxt->eip = ctxt->_eip; + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); + } } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b5274e2a53cf..dbf7442a822b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -261,7 +261,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) } void kvm_set_pending_timer(struct kvm_vcpu *vcpu); -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); +void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 get_kvmclock_ns(struct kvm *kvm); From 8fff2710eaf50aff1e6a7aa0607fe5288b619404 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:37 -0700 Subject: [PATCH 0899/2880] KVM: VMX: Remove EMULATE_FAIL handling in handle_invalid_guest_state() Now that EMULATE_FAIL is completely unused, remove the last remaning usage where KVM does something functional in response to EMULATE_FAIL. Leave the check in place as a WARN_ON_ONCE to provide a better paper trail when EMULATE_{DONE,FAIL,USER_EXIT} are completely removed. Opportunistically remove the gotos in handle_invalid_guest_state(). With the EMULATE_FAIL handling gone there is no need to have a common handler for emulation failure and the gotos only complicate things, e.g. the signal_pending() check always returns '1', but this is far from obvious when glancing through the code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 67733eef184c..9c92b2993f5a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5169,8 +5169,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - enum emulation_result err = EMULATE_DONE; - int ret = 1; + enum emulation_result err; bool intr_window_requested; unsigned count = 130; @@ -5193,38 +5192,38 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) err = kvm_emulate_instruction(vcpu, 0); - if (err == EMULATE_USER_EXIT) { - ret = 0; - goto out; - } + if (err == EMULATE_USER_EXIT) + return 0; - if (err != EMULATE_DONE) - goto emulation_error; + if (WARN_ON_ONCE(err == EMULATE_FAIL)) + return 1; if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) - goto emulation_error; + vcpu->arch.exception.pending) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - ret = kvm_vcpu_halt(vcpu); - goto out; + return kvm_vcpu_halt(vcpu); } + /* + * Note, return 1 and not 0, vcpu_run() is responsible for + * morphing the pending signal into the proper return code. + */ if (signal_pending(current)) - goto out; + return 1; + if (need_resched()) schedule(); } -out: - return ret; - -emulation_error: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; + return 1; } static void grow_ple_window(struct kvm_vcpu *vcpu) From 60fc3d02d5b8829b91b7b443ef6c7e8f0bbae868 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:38 -0700 Subject: [PATCH 0900/2880] KVM: x86: Remove emulation_result enums, EMULATE_{DONE,FAIL,USER_EXIT} Deferring emulation failure handling (in some cases) to the caller of x86_emulate_instruction() has proven fragile, e.g. multiple instances of KVM not setting run->exit_reason on EMULATE_FAIL, largely due to it being difficult to discern what emulation types can return what result, and which combination of types and results are handled where. Now that x86_emulate_instruction() always handles emulation failure, i.e. EMULATION_FAIL is only referenced in callers, remove the emulation_result enums entirely. Per KVM's existing exit handling conventions, return '0' and '1' for "exit to userspace" and "resume guest" respectively. Doing so cleans up many callers, e.g. they can return kvm_emulate_instruction() directly instead of having to interpret its result. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ---- arch/x86/kvm/mmu.c | 14 ++------ arch/x86/kvm/svm.c | 22 ++++++------- arch/x86/kvm/vmx/vmx.c | 28 ++++++---------- arch/x86/kvm/x86.c | 57 ++++++++++++++++----------------- 5 files changed, 49 insertions(+), 78 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f3c623dbd5ab..26f85a24b5e7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1315,12 +1315,6 @@ extern u64 kvm_default_tsc_scaling_ratio; extern u64 kvm_mce_cap_supported; -enum emulation_result { - EMULATE_DONE, /* no further processing */ - EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ - EMULATE_FAIL, /* can't emulate this instruction */ -}; - #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 138dcdef8a06..84e1a9c661dd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5383,7 +5383,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { int r, emulation_type = 0; - enum emulation_result er; bool direct = vcpu->arch.mmu->direct_map; /* With shadow page tables, fault_address contains a GVA or nGPA. */ @@ -5450,17 +5449,8 @@ emulate: return 1; } - er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); - - switch (er) { - case EMULATE_DONE: - return 1; - case EMULATE_USER_EXIT: - case EMULATE_FAIL: - return 0; - default: - BUG(); - } + return x86_emulate_instruction(vcpu, cr2, emulation_type, insn, + insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 69e53aa3969d..cb7ad362e9b9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -787,7 +787,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, svm->next_rip); svm_set_interrupt_shadow(vcpu, 0); - return EMULATE_DONE; + return 1; } static void svm_queue_exception(struct kvm_vcpu *vcpu) @@ -2779,8 +2779,7 @@ static int gp_interception(struct vcpu_svm *svm) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); return 1; } - return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP) != - EMULATE_USER_EXIT; + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); } static bool is_erratum_383(void) @@ -2878,7 +2877,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string) - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -3885,17 +3884,15 @@ static int task_switch_interception(struct vcpu_svm *svm) int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { - if (skip_emulated_instruction(&svm->vcpu) == EMULATE_USER_EXIT) + if (!skip_emulated_instruction(&svm->vcpu)) return 0; } if (int_type != SVM_EXITINTINFO_TYPE_SOFT) int_vec = -1; - - return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, - has_error_code, error_code) != EMULATE_USER_EXIT; + has_error_code, error_code); } static int cpuid_interception(struct vcpu_svm *svm) @@ -3916,7 +3913,7 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0); kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); return kvm_skip_emulated_instruction(&svm->vcpu); @@ -3924,13 +3921,12 @@ static int invlpg_interception(struct vcpu_svm *svm) static int emulate_on_interception(struct vcpu_svm *svm) { - return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0); } static int rsm_interception(struct vcpu_svm *svm) { - return kvm_emulate_instruction_from_buffer(&svm->vcpu, - rsm_ins_bytes, 2) == EMULATE_DONE; + return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2); } static int rdpmc_interception(struct vcpu_svm *svm) @@ -4719,7 +4715,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ - ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + ret = kvm_emulate_instruction(&svm->vcpu, 0); } return ret; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9c92b2993f5a..e71dc36850cb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1516,7 +1516,7 @@ static int __skip_emulated_instruction(struct kvm_vcpu *vcpu) /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); - return EMULATE_DONE; + return 1; } static inline void skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -4468,7 +4468,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); @@ -4545,8 +4545,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); return 1; } - return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP) != - EMULATE_USER_EXIT; + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); } /* @@ -4643,7 +4642,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string) - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -4717,7 +4716,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); } static int handle_cr(struct kvm_vcpu *vcpu) @@ -4933,7 +4932,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -5000,7 +4999,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } } - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) @@ -5077,7 +5076,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) */ return kvm_task_switch(vcpu, tss_selector, type == INTR_TYPE_SOFT_INTR ? idt_index : -1, - reason, has_error_code, error_code) != EMULATE_USER_EXIT; + reason, has_error_code, error_code); } static int handle_ept_violation(struct kvm_vcpu *vcpu) @@ -5149,8 +5148,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) return kvm_skip_emulated_instruction(vcpu); else - return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == - EMULATE_DONE; + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP); } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); @@ -5169,7 +5167,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - enum emulation_result err; bool intr_window_requested; unsigned count = 130; @@ -5190,14 +5187,9 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; - err = kvm_emulate_instruction(vcpu, 0); - - if (err == EMULATE_USER_EXIT) + if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (WARN_ON_ONCE(err == EMULATE_FAIL)) - return 1; - if (vmx->emulation_required && !vmx->rmode.vm86_active && vcpu->arch.exception.pending) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92b6690d0512..a83b269126a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5451,7 +5451,7 @@ int handle_ud(struct kvm_vcpu *vcpu) emul_type = EMULTYPE_TRAP_UD_FORCED; } - return kvm_emulate_instruction(vcpu, emul_type) != EMULATE_USER_EXIT; + return kvm_emulate_instruction(vcpu, emul_type); } EXPORT_SYMBOL_GPL(handle_ud); @@ -6302,14 +6302,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) if (emulation_type & EMULTYPE_VMWARE_GP) { kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return EMULATE_DONE; + return 1; } if (emulation_type & EMULTYPE_SKIP) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - return EMULATE_USER_EXIT; + return 0; } kvm_queue_exception(vcpu, UD_VECTOR); @@ -6318,10 +6318,10 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - return EMULATE_USER_EXIT; + return 0; } - return EMULATE_DONE; + return 1; } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, @@ -6485,10 +6485,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - return EMULATE_USER_EXIT; + return 0; } kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); - return EMULATE_DONE; + return 1; } int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -6497,7 +6497,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) int r; r = kvm_x86_ops->skip_emulated_instruction(vcpu); - if (unlikely(r != EMULATE_DONE)) + if (unlikely(!r)) return 0; /* @@ -6510,7 +6510,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) */ if (unlikely(rflags & X86_EFLAGS_TF)) r = kvm_vcpu_do_singlestep(vcpu); - return r == EMULATE_DONE; + return r; } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); @@ -6529,7 +6529,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; + *r = 0; return true; } } @@ -6545,7 +6545,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); - *r = EMULATE_DONE; + *r = 1; return true; } } @@ -6632,11 +6632,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if ((emulation_type & EMULTYPE_TRAP_UD) || (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { kvm_queue_exception(vcpu, UD_VECTOR); - return EMULATE_DONE; + return 1; } if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) - return EMULATE_DONE; + return 1; if (ctxt->have_exception) { /* * #UD should result in just EMULATION_FAILED, and trap-like @@ -6645,7 +6645,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || exception_type(ctxt->exception.vector) == EXCPT_TRAP); inject_emulated_exception(vcpu); - return EMULATE_DONE; + return 1; } return handle_emulation_failure(vcpu, emulation_type); } @@ -6654,7 +6654,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if ((emulation_type & EMULTYPE_VMWARE_GP) && !is_vmware_backdoor_opcode(ctxt)) { kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return EMULATE_DONE; + return 1; } if (emulation_type & EMULTYPE_SKIP) { @@ -6662,11 +6662,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); kvm_x86_ops->set_interrupt_shadow(vcpu, 0); - return EMULATE_DONE; + return 1; } if (retry_instruction(ctxt, cr2, emulation_type)) - return EMULATE_DONE; + return 1; /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ @@ -6682,18 +6682,18 @@ restart: r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) - return EMULATE_DONE; + return 1; if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) - return EMULATE_DONE; + return 1; return handle_emulation_failure(vcpu, emulation_type); } if (ctxt->have_exception) { - r = EMULATE_DONE; + r = 1; if (inject_emulated_exception(vcpu)) return r; } else if (vcpu->arch.pio.count) { @@ -6704,18 +6704,18 @@ restart: writeback = false; vcpu->arch.complete_userspace_io = complete_emulated_pio; } - r = EMULATE_USER_EXIT; + r = 0; } else if (vcpu->mmio_needed) { ++vcpu->stat.mmio_exits; if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_USER_EXIT; + r = 0; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else - r = EMULATE_DONE; + r = 1; if (writeback) { unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -6724,7 +6724,7 @@ restart: if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE && ctxt->tf) + if (r && ctxt->tf) r = kvm_vcpu_do_singlestep(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -8319,12 +8319,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu) static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { int r; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) - return 0; - return 1; + return r; } static int complete_emulated_pio(struct kvm_vcpu *vcpu) @@ -8696,13 +8695,13 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - return EMULATE_USER_EXIT; + return 0; } kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - return EMULATE_DONE; + return 1; } EXPORT_SYMBOL_GPL(kvm_task_switch); From 1957aa63be5395307f8b80458f011871b34617ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:39 -0700 Subject: [PATCH 0901/2880] KVM: VMX: Handle single-step #DB for EMULTYPE_SKIP on EPT misconfig VMX's EPT misconfig flow to handle fast-MMIO path falls back to decoding the instruction to determine the instruction length when running as a guest (Hyper-V doesn't fill VMCS.VM_EXIT_INSTRUCTION_LEN because it's technically not defined for EPT misconfigs). Rather than implement the slow skip in VMX's generic skip_emulated_instruction(), handle_ept_misconfig() directly calls kvm_emulate_instruction() with EMULTYPE_SKIP, which intentionally doesn't do single-step detection, and so handle_ept_misconfig() misses a single-step #DB. Rework the EPT misconfig fallback case to route it through kvm_skip_emulated_instruction() so that single-step #DBs and interrupt shadow updates are handled automatically. I.e. make VMX's slow skip logic match SVM's and have the SVM flow not intentionally avoid the shadow update. Alternatively, the handle_ept_misconfig() could manually handle single- step detection, but that results in EMULTYPE_SKIP having split logic for the interrupt shadow vs. single-step #DBs, and split emulator logic is largely what led to this mess in the first place. Modifying SVM to mirror VMX flow isn't really an option as SVM's case isn't limited to a specific exit reason, i.e. handling the slow skip in skip_emulated_instruction() is mandatory for all intents and purposes. Drop VMX's skip_emulated_instruction() wrapper since it can now fail, and instead WARN if it fails unexpectedly, e.g. if exit_reason somehow becomes corrupted. Cc: Vitaly Kuznetsov Fixes: d391f12070672 ("x86/kvm/vmx: do not use vm-exit instruction length for fast MMIO when running nested") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 17 +++++++------- arch/x86/kvm/vmx/vmx.c | 52 ++++++++++++++++++------------------------ arch/x86/kvm/x86.c | 6 ++++- 3 files changed, 36 insertions(+), 39 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cb7ad362e9b9..ce75296b1b10 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -777,14 +777,15 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) svm->next_rip = svm->vmcb->control.next_rip; } - if (!svm->next_rip) - return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP); - - if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) - printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", - __func__, kvm_rip_read(vcpu), svm->next_rip); - - kvm_rip_write(vcpu, svm->next_rip); + if (!svm->next_rip) { + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + return 0; + } else { + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) + pr_err("%s: ip 0x%lx next 0x%llx\n", + __func__, kvm_rip_read(vcpu), svm->next_rip); + kvm_rip_write(vcpu, svm->next_rip); + } svm_set_interrupt_shadow(vcpu, 0); return 1; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e71dc36850cb..ef98311ad153 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1501,17 +1501,27 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -/* - * Returns an int to be compatible with SVM implementation (which can fail). - * Do not use directly, use skip_emulated_instruction() instead. - */ -static int __skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; - rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_rip_write(vcpu, rip); + /* + * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on + * undefined behavior: Intel's SDM doesn't mandate the VMCS field be + * set when EPT misconfig occurs. In practice, real hardware updates + * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors + * (namely Hyper-V) don't set it due to it being undefined behavior, + * i.e. we end up advancing IP with some random value. + */ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || + to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) { + rip = kvm_rip_read(vcpu); + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_rip_write(vcpu, rip); + } else { + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + return 0; + } /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); @@ -1519,11 +1529,6 @@ static int __skip_emulated_instruction(struct kvm_vcpu *vcpu) return 1; } -static inline void skip_emulated_instruction(struct kvm_vcpu *vcpu) -{ - (void)__skip_emulated_instruction(vcpu); -} - static void vmx_clear_hlt(struct kvm_vcpu *vcpu) { /* @@ -4587,7 +4592,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; if (is_icebp(intr_info)) - skip_emulated_instruction(vcpu); + WARN_ON(!skip_emulated_instruction(vcpu)); kvm_queue_exception(vcpu, DB_VECTOR); return 1; @@ -5068,7 +5073,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && type != INTR_TYPE_EXT_INTR && type != INTR_TYPE_NMI_INTR)) - skip_emulated_instruction(vcpu); + WARN_ON(!skip_emulated_instruction(vcpu)); /* * TODO: What about debug traps on tss switch? @@ -5135,20 +5140,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { trace_kvm_fast_mmio(gpa); - /* - * Doing kvm_skip_emulated_instruction() depends on undefined - * behavior: Intel's manual doesn't mandate - * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG - * occurs and while on real hardware it was observed to be set, - * other hypervisors (namely Hyper-V) don't set it, we end up - * advancing IP with some random value. Disable fast mmio when - * running nested and keep it for real hardware in hope that - * VM_EXIT_INSTRUCTION_LEN will always be set correctly. - */ - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) - return kvm_skip_emulated_instruction(vcpu); - else - return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP); + return kvm_skip_emulated_instruction(vcpu); } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); @@ -7722,7 +7714,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = __skip_emulated_instruction, + .skip_emulated_instruction = skip_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a83b269126a0..c38d247dbffb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6657,11 +6657,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return 1; } + /* + * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks + * for kvm_skip_emulated_instruction(). The caller is responsible for + * updating interruptibility state and injecting single-step #DBs. + */ if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); - kvm_x86_ops->set_interrupt_shadow(vcpu, 0); return 1; } From 41577ab8bd72f8813cfd74cd01da3ae6a643878e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Aug 2019 14:40:40 -0700 Subject: [PATCH 0902/2880] KVM: x86: Add comments to document various emulation types Document the intended usage of each emulation type as each exists to handle an edge case of one kind or another and can be easily misinterpreted at first glance. Cc: Liran Alon Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 26f85a24b5e7..78f0a919173e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1315,6 +1315,36 @@ extern u64 kvm_default_tsc_scaling_ratio; extern u64 kvm_mce_cap_supported; +/* + * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing + * userspace I/O) to indicate that the emulation context + * should be resued as is, i.e. skip initialization of + * emulation context, instruction fetch and decode. + * + * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. + * Indicates that only select instructions (tagged with + * EmulateOnUD) should be emulated (to minimize the emulator + * attack surface). See also EMULTYPE_TRAP_UD_FORCED. + * + * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to + * decode the instruction length. For use *only* by + * kvm_x86_ops->skip_emulated_instruction() implementations. + * + * EMULTYPE_ALLOW_RETRY - Set when the emulator should resume the guest to + * retry native execution under certain conditions. + * + * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was + * triggered by KVM's magic "force emulation" prefix, + * which is opt in via module param (off by default). + * Bypasses EmulateOnUD restriction despite emulating + * due to an intercepted #UD (see EMULTYPE_TRAP_UD). + * Used to test the full emulator from userspace. + * + * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware + * backdoor emulation, which is opt in via module param. + * VMware backoor emulation handles select instructions + * and reinjects the #GP for all other cases. + */ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) From e69e72faa3a0709dd23df6a4ca060a15e99168a1 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 16 Jul 2019 14:55:49 +0800 Subject: [PATCH 0903/2880] KVM: x86: Add support for user wait instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UMONITOR, UMWAIT and TPAUSE are a set of user wait instructions. This patch adds support for user wait instructions in KVM. Availability of the user wait instructions is indicated by the presence of the CPUID feature flag WAITPKG CPUID.0x07.0x0:ECX[5]. User wait instructions may be executed at any privilege level, and use 32bit IA32_UMWAIT_CONTROL MSR to set the maximum time. The behavior of user wait instructions in VMX non-root operation is determined first by the setting of the "enable user wait and pause" secondary processor-based VM-execution control bit 26. If the VM-execution control is 0, UMONITOR/UMWAIT/TPAUSE cause an invalid-opcode exception (#UD). If the VM-execution control is 1, treatment is based on the setting of the “RDTSC exiting†VM-execution control. Because KVM never enables RDTSC exiting, if the instruction causes a delay, the amount of time delayed is called here the physical delay. The physical delay is first computed by determining the virtual delay. If IA32_UMWAIT_CONTROL[31:2] is zero, the virtual delay is the value in EDX:EAX minus the value that RDTSC would return; if IA32_UMWAIT_CONTROL[31:2] is not zero, the virtual delay is the minimum of that difference and AND(IA32_UMWAIT_CONTROL,FFFFFFFCH). Because umwait and tpause can put a (psysical) CPU into a power saving state, by default we dont't expose it to kvm and enable it only when guest CPUID has it. Detailed information about user wait instructions can be found in the latest Intel 64 and IA-32 Architectures Software Developer's Manual. Co-developed-by: Jingqi Liu Signed-off-by: Jingqi Liu Signed-off-by: Tao Xu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/vmx/capabilities.h | 6 ++++++ arch/x86/kvm/vmx/nested.c | 1 + arch/x86/kvm/vmx/vmx.c | 18 ++++++++++++++++++ 5 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index b15e6465870f..fdad81626829 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -69,6 +69,7 @@ #define SECONDARY_EXEC_PT_USE_GPA 0x01000000 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 +#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE 0x04000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dd5985eb61b4..4e5a835989d1 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -360,7 +360,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index d6664ee3d127..7aa69716d516 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -247,6 +247,12 @@ static inline bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static inline bool vmx_waitpkg_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; +} + static inline bool cpu_has_vmx_tsc_scaling(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 75ed0a63abbe..c5b7ba795f95 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2089,6 +2089,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ef98311ad153..bb55f54e29b1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2323,6 +2323,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | @@ -4059,6 +4060,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } + if (vmx_waitpkg_supported()) { + bool waitpkg_enabled = + guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG); + + if (!waitpkg_enabled) + exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + + if (nested) { + if (waitpkg_enabled) + vmx->nested.msrs.secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + else + vmx->nested.msrs.secondary_ctls_high &= + ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + } + } + vmx->secondary_exec_control = exec_control; } From 6e3ba4abcea5681eebbfc10f1b56c9fbe80b6685 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 16 Jul 2019 14:55:50 +0800 Subject: [PATCH 0904/2880] KVM: vmx: Emulate MSR IA32_UMWAIT_CONTROL UMWAIT and TPAUSE instructions use 32bit IA32_UMWAIT_CONTROL at MSR index E1H to determines the maximum time in TSC-quanta that the processor can reside in either C0.1 or C0.2. This patch emulates MSR IA32_UMWAIT_CONTROL in guest and differentiate IA32_UMWAIT_CONTROL between host and guest. The variable mwait_control_cached in arch/x86/kernel/cpu/umwait.c caches the MSR value, so this patch uses it to avoid frequently rdmsr of IA32_UMWAIT_CONTROL. Co-developed-by: Jingqi Liu Signed-off-by: Jingqi Liu Signed-off-by: Tao Xu Signed-off-by: Paolo Bonzini --- arch/x86/kernel/cpu/umwait.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.h | 9 +++++++++ arch/x86/kvm/x86.c | 2 ++ 4 files changed, 53 insertions(+) diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 32b4dc9030aa..c222f283b456 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -17,6 +17,12 @@ */ static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); +u32 get_umwait_control_msr(void) +{ + return umwait_control_cached; +} +EXPORT_SYMBOL_GPL(get_umwait_control_msr); + /* * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by * hardware or BIOS before kernel boot. diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bb55f54e29b1..52dd2241b749 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1733,6 +1733,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); + case MSR_IA32_UMWAIT_CONTROL: + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) + return 1; + + msr_info->data = vmx->msr_ia32_umwait_control; + break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -1906,6 +1912,16 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; + case MSR_IA32_UMWAIT_CONTROL: + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) + return 1; + + /* The reserved bit 1 and non-32 bit [63:32] should be zero */ + if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) + return 1; + + vmx->msr_ia32_umwait_control = data; + break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -4211,6 +4227,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; + vmx->msr_ia32_umwait_control = 0; + vcpu->arch.microcode_version = 0x100000000ULL; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); vmx->hv_deadline_tsc = -1; @@ -6384,6 +6402,23 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } +static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx) +{ + u32 host_umwait_control; + + if (!vmx_has_waitpkg(vmx)) + return; + + host_umwait_control = get_umwait_control_msr(); + + if (vmx->msr_ia32_umwait_control != host_umwait_control) + add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL, + vmx->msr_ia32_umwait_control, + host_umwait_control, false); + else + clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL); +} + static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6478,6 +6513,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); + atomic_switch_umwait_control_msr(vmx); if (enable_preemption_timer) vmx_update_hv_timer(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 64d5a4890aa9..bee16687dc0b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -14,6 +14,8 @@ extern const u32 vmx_msr_index[]; extern u64 host_efer; +extern u32 get_umwait_control_msr(void); + #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 #define MSR_TYPE_RW 3 @@ -211,6 +213,7 @@ struct vcpu_vmx { #endif u64 spec_ctrl; + u32 msr_ia32_umwait_control; u32 secondary_exec_control; @@ -497,6 +500,12 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } +static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) +{ + return vmx->secondary_exec_control & + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c38d247dbffb..977b36348bed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1145,6 +1145,8 @@ static u32 msrs_to_save[] = { MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, From dd9212a885ca4a95443905c7c3781122a4d664e8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Sep 2019 15:13:24 -0500 Subject: [PATCH 0905/2880] drm/amdgpu/display: fix 64 bit divide Use proper helper for 32 bit. Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c index 36277bca0326..b1e657e137a9 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c @@ -197,7 +197,9 @@ void dce11_pplib_apply_display_requirements( */ if (ASICREV_IS_VEGA20_P(dc->ctx->asic_id.hw_internal_rev) && (context->stream_count >= 2)) { pp_display_cfg->min_memory_clock_khz = max(pp_display_cfg->min_memory_clock_khz, - (uint32_t) (dc->bw_vbios->high_yclk.value / memory_type_multiplier / 10000)); + (uint32_t) div64_s64( + div64_s64(dc->bw_vbios->high_yclk.value, + memory_type_multiplier), 10000)); } else { pp_display_cfg->min_memory_clock_khz = context->bw_ctx.bw.dce.yclk_khz / memory_type_multiplier; From bf653b78f9608d66db174eabcbda7454c8fde6d5 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 16 Jul 2019 14:55:51 +0800 Subject: [PATCH 0906/2880] KVM: vmx: Introduce handle_unexpected_vmexit and handle WAITPKG vmexit As the latest Intel 64 and IA-32 Architectures Software Developer's Manual, UMWAIT and TPAUSE instructions cause a VM exit if the RDTSC exiting and enable user wait and pause VM-execution controls are both 1. Because KVM never enable RDTSC exiting, the vm-exit for UMWAIT and TPAUSE should never happen. Considering EXIT_REASON_XSAVES and EXIT_REASON_XRSTORS is also unexpected VM-exit for KVM. Introduce a common exit helper handle_unexpected_vmexit() to handle these unexpected VM-exit. Suggested-by: Sean Christopherson Co-developed-by: Jingqi Liu Signed-off-by: Jingqi Liu Signed-off-by: Tao Xu Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/vmx.h | 6 +++++- arch/x86/kvm/vmx/nested.c | 4 ++++ arch/x86/kvm/vmx/vmx.c | 28 ++++++++++++---------------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index f01950aa7fae..3eb8411ab60e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -86,6 +86,8 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_UMWAIT 67 +#define EXIT_REASON_TPAUSE 68 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -144,7 +146,9 @@ { EXIT_REASON_RDSEED, "RDSEED" }, \ { EXIT_REASON_PML_FULL, "PML_FULL" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" } + { EXIT_REASON_XRSTORS, "XRSTORS" }, \ + { EXIT_REASON_UMWAIT, "UMWAIT" }, \ + { EXIT_REASON_TPAUSE, "TPAUSE" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c5b7ba795f95..77548ef1ad3b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5470,6 +5470,10 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_ENCLS: /* SGX is never exposed to L1 */ return false; + case EXIT_REASON_UMWAIT: + case EXIT_REASON_TPAUSE: + return nested_cpu_has2(vmcs12, + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); default: return true; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 52dd2241b749..a7c9922e3905 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5007,20 +5007,6 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) return 1; } -static int handle_xsaves(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - -static int handle_xrstors(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -5514,6 +5500,14 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } +static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu) +{ + kvm_skip_emulated_instruction(vcpu); + WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x", + vmcs_read32(VM_EXIT_REASON)); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5565,13 +5559,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = handle_invalid_op, [EXIT_REASON_RDSEED] = handle_invalid_op, - [EXIT_REASON_XSAVES] = handle_xsaves, - [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_XSAVES] = handle_unexpected_vmexit, + [EXIT_REASON_XRSTORS] = handle_unexpected_vmexit, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, + [EXIT_REASON_UMWAIT] = handle_unexpected_vmexit, + [EXIT_REASON_TPAUSE] = handle_unexpected_vmexit, }; static const int kvm_vmx_max_exit_handlers = From d0f5a86a34072c46b7fe1b8f8768e666d9d6f87f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 17 Sep 2019 16:16:26 +0800 Subject: [PATCH 0907/2880] KVM: LAPIC: Tune lapic_timer_advance_ns smoothly Filter out drastic fluctuation and random fluctuation, remove timer_advance_adjust_done altogether, the adjustment would be continuous. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 28 ++++++++++++++-------------- arch/x86/kvm/lapic.h | 1 - 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8675458c2205..3a3a6854dcca 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -65,7 +65,9 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 +static bool lapic_timer_advance_dynamic __read_mostly; +#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 +#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000 #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -1485,26 +1487,25 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 ns; + /* Do not adjust for tiny fluctuations or large random spikes. */ + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX || + abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN) + return; + /* too early */ if (advance_expire_delta < 0) { ns = -advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns -= min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } else { /* too late */ ns = advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns += min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } - if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - apic->lapic_timer.timer_advance_adjust_done = true; - if (unlikely(timer_advance_ns > 5000)) { + if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX)) timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; - apic->lapic_timer.timer_advance_adjust_done = false; - } apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ -1524,7 +1525,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) + if (lapic_timer_advance_dynamic) adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } @@ -2302,13 +2303,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; - apic->lapic_timer.timer_advance_adjust_done = false; + lapic_timer_advance_dynamic = true; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; - apic->lapic_timer.timer_advance_adjust_done = true; + lapic_timer_advance_dynamic = false; } - /* * APIC is created enabled. This will prevent kvm_lapic_set_base from * thinking that APIC state has changed. diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 50053d2b8b7b..2aad7e226fc0 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -35,7 +35,6 @@ struct kvm_timer { s64 advance_expire_delta; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; - bool timer_advance_adjust_done; }; struct kvm_lapic { From fac026dac0bc4dc3bb2e72309fa47ff3c4dd7c16 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:03 -0700 Subject: [PATCH 0908/2880] KVM: x86/mmu: Treat invalid shadow pages as obsolete Treat invalid shadow pages as obsolete to fix a bug where an obsolete and invalid page with a non-zero root count could become non-obsolete due to mmu_valid_gen wrapping. The bug is largely theoretical with the current code base, as an unsigned long will effectively never wrap on 64-bit KVM, and userspace would have to deliberately stall a vCPU in order to keep an obsolete invalid page on the active list while simultaneously modifying memslots billions of times to trigger a wrap. The obvious alternative is to use a 64-bit value for mmu_valid_gen, but it's actually desirable to go in the opposite direction, i.e. using a smaller 8-bit value to reduce KVM's memory footprint by 8 bytes per shadow page, and relying on proper treatment of invalid pages instead of preventing the generation from wrapping. Note, "Fixes" points at a commit that was at one point reverted, but has since been restored. Fixes: 5304b8d37c2a5 ("KVM: MMU: fast invalidate all pages") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 84e1a9c661dd..151c4b94696f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2252,7 +2252,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ @@ -2311,7 +2311,8 @@ static void mmu_audit_disable(void) { } static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); + return sp->role.invalid || + unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, From 92f58b5c0181596d9f1e317b49ada2e728fb76eb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:04 -0700 Subject: [PATCH 0909/2880] KVM: x86/mmu: Use fast invalidate mechanism to zap MMIO sptes Use the fast invalidate mechasim to zap MMIO sptes on a MMIO generation wrap. The fast invalidate flow was reintroduced to fix a livelock bug in kvm_mmu_zap_all() that can occur if kvm_mmu_zap_all() is invoked when the guest has live vCPUs. I.e. using kvm_mmu_zap_all() to handle the MMIO generation wrap is theoretically susceptible to the livelock bug. This effectively reverts commit 4771450c345dc ("Revert "KVM: MMU: drop kvm_mmu_zap_mmio_sptes""), i.e. restores the behavior of commit a8eca9dcc656a ("KVM: MMU: drop kvm_mmu_zap_mmio_sptes"). Note, this actually fixes commit 571c5af06e303 ("KVM: x86/mmu: Voluntarily reschedule as needed when zapping MMIO sptes"), but there is no need to incrementally revert back to using fast invalidate, e.g. doing so doesn't provide any bisection or stability benefits. Fixes: 571c5af06e303 ("KVM: x86/mmu: Voluntarily reschedule as needed when zapping MMIO sptes") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 151c4b94696f..6319f1e208f6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -403,8 +403,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mask |= (gpa & shadow_nonpresent_or_rsvd_mask) << shadow_nonpresent_or_rsvd_mask_len; - page_header(__pa(sptep))->mmio_cached = true; - trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -5948,7 +5946,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) +void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -5957,14 +5955,10 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (mmio_only && !sp->mmio_cached) - continue; if (sp->role.invalid && sp->root_count) continue; - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { - WARN_ON_ONCE(mmio_only); + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; - } if (cond_resched_lock(&kvm->mmu_lock)) goto restart; } @@ -5973,11 +5967,6 @@ restart: spin_unlock(&kvm->mmu_lock); } -void kvm_mmu_zap_all(struct kvm *kvm) -{ - return __kvm_mmu_zap_all(kvm, false); -} - void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); @@ -5999,7 +5988,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - __kvm_mmu_zap_all(kvm, true); + kvm_mmu_zap_all_fast(kvm); } } From dd6223c76205c5eec16d12d8ea7cd9090e537357 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:05 -0700 Subject: [PATCH 0910/2880] KVM: x86/mmu: Revert "Revert "KVM: MMU: show mmu_valid_gen in shadow page related tracepoints"" Now that the fast invalidate mechanism has been reintroduced, restore tracing of the generation number in shadow page tracepoints. This reverts commit b59c4830ca185ba0e9f9e046fb1cd10a4a92627a. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmutrace.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index d8001b4bca05..e9832b5ec53c 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -8,16 +8,18 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(unsigned long, mmu_valid_gen) \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->mmu_valid_gen = sp->mmu_valid_gen; \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -29,8 +31,9 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \ + trace_seq_printf(p, "sp gen %lx gfn %llx l%u %u-byte q%u%s %s%s"\ " %snxe %sad root %u %s%c", \ + __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.gpte_is_8_bytes ? 8 : 4, \ role.quadrant, \ From 14a3c4f498edac65ce2ff1f05d849cd72e886d78 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:06 -0700 Subject: [PATCH 0911/2880] KVM: x86/mmu: Revert "Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages"" Now that the fast invalidate mechanism has been reintroduced, restore the tracepoint associated with said mechanism. Note, the name of the tracepoint deviates from the original tracepoint so as to match KVM's current nomenclature. This reverts commit 42560fb1f3c6c7f730897b7fa7a478bc37e0be50. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/mmutrace.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6319f1e208f6..2ab0ac88d86e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5743,6 +5743,7 @@ restart: static void kvm_mmu_zap_all_fast(struct kvm *kvm) { spin_lock(&kvm->mmu_lock); + trace_kvm_mmu_zap_all_fast(kvm); kvm->arch.mmu_valid_gen++; kvm_zap_obsolete_pages(kvm); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index e9832b5ec53c..1a063ba76281 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -282,6 +282,27 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + kvm_mmu_zap_all_fast, + TP_PROTO(struct kvm *kvm), + TP_ARGS(kvm), + + TP_STRUCT__entry( + __field(unsigned long, mmu_valid_gen) + __field(unsigned int, mmu_used_pages) + ), + + TP_fast_assign( + __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; + __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; + ), + + TP_printk("kvm-mmu-valid-gen %lx used_pages %x", + __entry->mmu_valid_gen, __entry->mmu_used_pages + ) +); + + TRACE_EVENT( check_mmio_spte, TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), From fbb158cb88b6e4f8279707a1842ee0332f64b1a3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:07 -0700 Subject: [PATCH 0912/2880] KVM: x86/mmu: Revert "Revert "KVM: MMU: zap pages in batch"" Now that the fast invalidate mechanism has been reintroduced, restore the performance tweaks for fast invalidation that existed prior to its removal. Paraphrashing the original changelog: Zap at least 10 shadow pages before releasing mmu_lock to reduce the overhead associated with re-acquiring the lock. Note: "10" is an arbitrary number, speculated to be high enough so that a vCPU isn't stuck zapping obsolete pages for an extended period, but small enough so that other vCPUs aren't starved waiting for mmu_lock. This reverts commit 43d2b14b105fb00b8864c7b0ee7043cc1cc4a969. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2ab0ac88d86e..3acc4b046d47 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5671,12 +5671,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) return ret; } - +#define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); - int ign; + int nr_zapped, batch = 0; restart: list_for_each_entry_safe_reverse(sp, node, @@ -5689,28 +5689,6 @@ restart: break; /* - * Do not repeatedly zap a root page to avoid unnecessary - * KVM_REQ_MMU_RELOAD, otherwise we may not be able to - * progress: - * vcpu 0 vcpu 1 - * call vcpu_enter_guest(): - * 1): handle KVM_REQ_MMU_RELOAD - * and require mmu-lock to - * load mmu - * repeat: - * 1): zap root page and - * send KVM_REQ_MMU_RELOAD - * - * 2): if (cond_resched_lock(mmu-lock)) - * - * 2): hold mmu-lock and load mmu - * - * 3): see KVM_REQ_MMU_RELOAD bit - * on vcpu->requests is set - * then return 1 to call - * vcpu_enter_guest() again. - * goto repeat; - * * Since we are reversely walking the list and the invalid * list will be moved to the head, skip the invalid page * can help us to avoid the infinity list walking. @@ -5718,14 +5696,19 @@ restart: if (sp->role.invalid) continue; - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (batch >= BATCH_ZAP_PAGES && + (need_resched() || spin_needbreak(&kvm->mmu_lock))) { + batch = 0; kvm_mmu_commit_zap_page(kvm, &invalid_list); cond_resched_lock(&kvm->mmu_lock); goto restart; } - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, + &nr_zapped)) { + batch += nr_zapped; goto restart; + } } kvm_mmu_commit_zap_page(kvm, &invalid_list); From 4506ecf4855e53c3e5a2b40312a240696863582c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:08 -0700 Subject: [PATCH 0913/2880] KVM: x86/mmu: Revert "Revert "KVM: MMU: collapse TLB flushes when zap all pages"" Now that the fast invalidate mechanism has been reintroduced, restore the performance tweaks for fast invalidation that existed prior to its removal. Paraphrashing the original changelog: Reload the mmu on all vCPUs after updating the generation number so that obsolete pages are not used by any vCPUs. This allows collapsing all TLB flushes during obsolete page zapping into a single flush, as there is no need to flush when dropping mmu_lock (to reschedule). Note: a remote TLB flush is still needed before freeing the pages as other vCPUs may be doing a lockless shadow page walk. Opportunstically improve the comments restored by the revert (the code itself is a true revert). This reverts commit f34d251d66ba263c077ed9d2bbd1874339a4c887. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3acc4b046d47..652f2ce14583 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5696,11 +5696,15 @@ restart: if (sp->role.invalid) continue; + /* + * No need to flush the TLB since we're only zapping shadow + * pages with an obsolete generation number and all vCPUS have + * loaded a new root, i.e. the shadow pages being zapped cannot + * be in active use by the guest. + */ if (batch >= BATCH_ZAP_PAGES && - (need_resched() || spin_needbreak(&kvm->mmu_lock))) { + cond_resched_lock(&kvm->mmu_lock)) { batch = 0; - kvm_mmu_commit_zap_page(kvm, &invalid_list); - cond_resched_lock(&kvm->mmu_lock); goto restart; } @@ -5711,6 +5715,11 @@ restart: } } + /* + * Trigger a remote TLB flush before freeing the page tables to ensure + * KVM is not in the middle of a lockless shadow page table walk, which + * may reference the pages. + */ kvm_mmu_commit_zap_page(kvm, &invalid_list); } @@ -5729,6 +5738,16 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) trace_kvm_mmu_zap_all_fast(kvm); kvm->arch.mmu_valid_gen++; + /* + * Notify all vcpus to reload its shadow page table and flush TLB. + * Then all vcpus will switch to new shadow page table with the new + * mmu_valid_gen. + * + * Note: we need to do this under the protection of mmu_lock, + * otherwise, vcpu would purge shadow page but miss tlb flush. + */ + kvm_reload_remote_mmus(kvm); + kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } From 31741eb11a43066a3da92996bcfccfb42e248d44 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:09 -0700 Subject: [PATCH 0914/2880] KVM: x86/mmu: Revert "Revert "KVM: MMU: reclaim the zapped-obsolete page first"" Now that the fast invalidate mechanism has been reintroduced, restore the performance tweaks for fast invalidation that existed prior to its removal. Paraphrashing the original changelog: Introduce a per-VM list to track obsolete shadow pages, i.e. pages which have been deleted from the mmu cache but haven't yet been freed. When page reclaiming is needed, zap/free the deleted pages first. This reverts commit 52d5dedc79bdcbac2976159a172069618cf31be5. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 78f0a919173e..53e0b956ed3c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -865,6 +865,7 @@ struct kvm_arch { * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; + struct list_head zapped_obsolete_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; From 10605204e91f46bdec98bef81923263d245515a0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:10 -0700 Subject: [PATCH 0915/2880] KVM: x86/mmu: Revert "KVM: x86/mmu: Remove is_obsolete() call" Now that the fast invalidate mechanism has been reintroduced, restore the performance tweaks for fast invalidation that existed prior to its removal. Paraphrasing the original changelog (commit 5ff0568374ed2 was itself a partial revert): Don't force reloading the remote mmu when zapping an obsolete page, as a MMU_RELOAD request has already been issued by kvm_mmu_zap_all_fast() immediately after incrementing mmu_valid_gen, i.e. after marking pages obsolete. This reverts commit 5ff0568374ed2e585376a3832857ade5daccd381. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 29 +++++++++++++++++++++++------ arch/x86/kvm/x86.c | 1 + 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 652f2ce14583..e552fd5cd33d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2752,7 +2752,12 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - if (!sp->role.invalid) + /* + * Obsolete pages cannot be used on any vCPUs, see the comment + * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also + * treats invalid shadow pages as being obsolete. + */ + if (!is_obsolete_sp(kvm, sp)) kvm_reload_remote_mmus(kvm); } @@ -5675,7 +5680,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); int nr_zapped, batch = 0; restart: @@ -5708,8 +5712,8 @@ restart: goto restart; } - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, - &nr_zapped)) { + if (__kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { batch += nr_zapped; goto restart; } @@ -5720,7 +5724,7 @@ restart: * KVM is not in the middle of a lockless shadow page table walk, which * may reference the pages. */ - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); } /* @@ -5752,6 +5756,11 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); } +static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) +{ + return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); +} + static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node) @@ -6022,16 +6031,24 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway. */ - if (!kvm->arch.n_used_mmu_pages) + if (!kvm->arch.n_used_mmu_pages && + !kvm_has_zapped_obsolete_pages(kvm)) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); + if (kvm_has_zapped_obsolete_pages(kvm)) { + kvm_mmu_commit_zap_page(kvm, + &kvm->arch.zapped_obsolete_pages); + goto unlock; + } + if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); +unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 977b36348bed..c9a3d8efe1c2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9425,6 +9425,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); From ca333add6933ad9732ba2c6671f133d7367ad96c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:11 -0700 Subject: [PATCH 0916/2880] KVM: x86/mmu: Explicitly track only a single invalid mmu generation Toggle mmu_valid_gen between '0' and '1' instead of blindly incrementing the generation. Because slots_lock is held for the entire duration of zapping obsolete pages, it's impossible for there to be multiple invalid generations associated with shadow pages at any given time. Toggling between the two generations (valid vs. invalid) allows changing mmu_valid_gen from an unsigned long to a u8, which reduces the size of struct kvm_mmu_page from 160 to 152 bytes on 64-bit KVM, i.e. reduces KVM's memory footprint by 8 bytes per shadow page. Set sp->mmu_valid_gen before it is added to active_mmu_pages. Functionally this has no effect as kvm_mmu_alloc_page() has a single caller that sets sp->mmu_valid_gen soon thereafter, but visually it is jarring to see a shadow page being added to the list without its mmu_valid_gen first being set. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/mmu.c | 14 ++++++++++++-- arch/x86/kvm/mmutrace.h | 16 ++++++++-------- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 53e0b956ed3c..c5ed92482e2c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -320,6 +320,7 @@ struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; bool unsync; + u8 mmu_valid_gen; bool mmio_cached; /* @@ -335,7 +336,6 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - unsigned long mmu_valid_gen; DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 @@ -859,7 +859,7 @@ struct kvm_arch { unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; - unsigned long mmu_valid_gen; + u8 mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e552fd5cd33d..5f0864000360 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2101,6 +2101,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct * depends on valid pages being added to the head of the list. See * comments in kvm_zap_obsolete_pages(). */ + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; @@ -2537,7 +2538,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PT_PAGE_TABLE_LEVEL && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); @@ -5738,9 +5738,19 @@ restart: */ static void kvm_mmu_zap_all_fast(struct kvm *kvm) { + lockdep_assert_held(&kvm->slots_lock); + spin_lock(&kvm->mmu_lock); trace_kvm_mmu_zap_all_fast(kvm); - kvm->arch.mmu_valid_gen++; + + /* + * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is + * held for the entire duration of zapping obsolete pages, it's + * impossible for there to be multiple invalid generations associated + * with *valid* shadow pages at any given time, i.e. there is exactly + * one valid generation and (at most) one invalid generation. + */ + kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; /* * Notify all vcpus to reload its shadow page table and flush TLB. diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 1a063ba76281..7ca8831c7d1a 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -8,11 +8,11 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(unsigned long, mmu_valid_gen) \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(__u8, mmu_valid_gen) \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) #define KVM_MMU_PAGE_ASSIGN(sp) \ @@ -31,7 +31,7 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx l%u %u-byte q%u%s %s%s"\ + trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \ " %snxe %sad root %u %s%c", \ __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ @@ -288,7 +288,7 @@ TRACE_EVENT( TP_ARGS(kvm), TP_STRUCT__entry( - __field(unsigned long, mmu_valid_gen) + __field(__u8, mmu_valid_gen) __field(unsigned int, mmu_used_pages) ), @@ -297,7 +297,7 @@ TRACE_EVENT( __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; ), - TP_printk("kvm-mmu-valid-gen %lx used_pages %x", + TP_printk("kvm-mmu-valid-gen %u used_pages %x", __entry->mmu_valid_gen, __entry->mmu_used_pages ) ); From 9a5c034c9abaef81ad9df0221638785a088942b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Sep 2019 19:46:12 -0700 Subject: [PATCH 0917/2880] KVM: x86/mmu: Skip invalid pages during zapping iff root_count is zero Do not skip invalid shadow pages when zapping obsolete pages if the pages' root_count has reached zero, in which case the page can be immediately zapped and freed. Update the comment accordingly. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5f0864000360..5269aa057dfa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5693,11 +5693,12 @@ restart: break; /* - * Since we are reversely walking the list and the invalid - * list will be moved to the head, skip the invalid page - * can help us to avoid the infinity list walking. + * Skip invalid pages with a non-zero root count, zapping pages + * with a non-zero root count will never succeed, i.e. the page + * will get thrown back on active_mmu_pages and we'll get stuck + * in an infinite loop. */ - if (sp->role.invalid) + if (sp->role.invalid && sp->root_count) continue; /* From 9f7fec0ba89108b9385f1b9fb167861224912a4a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Sep 2019 13:08:52 +0100 Subject: [PATCH 0918/2880] Btrfs: fix selftests failure due to uninitialized i_mode in test inodes Some of the self tests create a test inode, setup some extents and then do calls to btrfs_get_extent() to test that the corresponding extent maps exist and are correct. However btrfs_get_extent(), since the 5.2 merge window, now errors out when it finds a regular or prealloc extent for an inode that does not correspond to a regular file (its ->i_mode is not S_IFREG). This causes the self tests to fail sometimes, specially when KASAN, slub_debug and page poisoning are enabled: $ modprobe btrfs modprobe: ERROR: could not insert 'btrfs': Invalid argument $ dmesg [ 9414.691648] Btrfs loaded, crc32c=crc32c-intel, debug=on, assert=on, integrity-checker=on, ref-verify=on [ 9414.692655] BTRFS: selftest: sectorsize: 4096 nodesize: 4096 [ 9414.692658] BTRFS: selftest: running btrfs free space cache tests [ 9414.692918] BTRFS: selftest: running extent only tests [ 9414.693061] BTRFS: selftest: running bitmap only tests [ 9414.693366] BTRFS: selftest: running bitmap and extent tests [ 9414.696455] BTRFS: selftest: running space stealing from bitmap to extent tests [ 9414.697131] BTRFS: selftest: running extent buffer operation tests [ 9414.697133] BTRFS: selftest: running btrfs_split_item tests [ 9414.697564] BTRFS: selftest: running extent I/O tests [ 9414.697583] BTRFS: selftest: running find delalloc tests [ 9415.081125] BTRFS: selftest: running find_first_clear_extent_bit test [ 9415.081278] BTRFS: selftest: running extent buffer bitmap tests [ 9415.124192] BTRFS: selftest: running inode tests [ 9415.124195] BTRFS: selftest: running btrfs_get_extent tests [ 9415.127909] BTRFS: selftest: running hole first btrfs_get_extent test [ 9415.128343] BTRFS critical (device (efault)): regular/prealloc extent found for non-regular inode 256 [ 9415.131428] BTRFS: selftest: fs/btrfs/tests/inode-tests.c:904 expected a real extent, got 0 This happens because the test inodes are created without ever initializing the i_mode field of the inode, and neither VFS's new_inode() nor the btrfs callback btrfs_alloc_inode() initialize the i_mode. Initialization of the i_mode is done through the various callbacks used by the VFS to create new inodes (regular files, directories, symlinks, tmpfiles, etc), which all call btrfs_new_inode() which in turn calls inode_init_owner(), which sets the inode's i_mode. Since the tests only uses new_inode() to create the test inodes, the i_mode was never initialized. This always happens on a VM I used with kasan, slub_debug and many other debug facilities enabled. It also happened to someone who reported this on bugzilla (on a 5.3-rc). Fix this by setting i_mode to S_IFREG at btrfs_new_test_inode(). Fixes: 6bf9e4bd6a2778 ("btrfs: inode: Verify inode mode to avoid NULL pointer dereference") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204397 Signed-off-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tests/btrfs-tests.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b5e80563efaa..99fe9bf3fdac 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -52,7 +52,13 @@ static struct file_system_type test_type = { struct inode *btrfs_new_test_inode(void) { - return new_inode(test_mnt->mnt_sb); + struct inode *inode; + + inode = new_inode(test_mnt->mnt_sb); + if (inode) + inode_init_owner(inode, NULL, S_IFREG); + + return inode; } static int btrfs_init_test_fs(void) From eb5b64f142504a597d67e2109d603055ff765e52 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 13 Sep 2019 14:54:07 +0100 Subject: [PATCH 0919/2880] btrfs: adjust dirty_metadata_bytes after writeback failure of extent buffer Before, if a eb failed to write out, we would end up triggering a BUG_ON(). As of f4340622e0226 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up"), we no longer BUG_ON(), so we should make life consistent and add back the unwritten bytes to dirty_metadata_bytes. Fixes: f4340622e022 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up") CC: stable@vger.kernel.org # 5.2+ Reviewed-by: Filipe Manana Signed-off-by: Dennis Zhou Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dc5e6939856..67066ece4de3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3728,11 +3728,20 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_fs_info *fs_info; SetPageError(page); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; + /* + * If we error out, we should add back the dirty_metadata_bytes + * to make it consistent. + */ + fs_info = eb->fs_info; + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + eb->len, fs_info->dirty_metadata_batch); + /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. From 0607eb1d452d45c5ac4c745a9e9e0d95152ea9d0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Sep 2019 17:42:28 +0100 Subject: [PATCH 0920/2880] Btrfs: fix missing error return if writeback for extent buffer never started If lock_extent_buffer_for_io() fails, it returns a negative value, but its caller btree_write_cache_pages() ignores such error. This means that a call to flush_write_bio(), from lock_extent_buffer_for_io(), might have failed. We should make btree_write_cache_pages() notice such error values and stop immediatelly, making sure filemap_fdatawrite_range() returns an error to the transaction commit path. A failure from flush_write_bio() should also result in the endio callback end_bio_extent_buffer_writepage() being invoked, which sets the BTRFS_FS_*_ERR bits appropriately, so that there's no risk a transaction or log commit doesn't catch a writeback failure. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 67066ece4de3..380ced84d4b1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3978,6 +3978,10 @@ retry: if (!ret) { free_extent_buffer(eb); continue; + } else if (ret < 0) { + done = 1; + free_extent_buffer(eb); + break; } ret = write_one_eb(eb, wbc, &epd); From daa5de5415849b9a53056ec1e1e88fe4c5c9aa2b Mon Sep 17 00:00:00 2001 From: yangerkun Date: Tue, 24 Sep 2019 20:53:34 +0800 Subject: [PATCH 0921/2880] io_uring: compare cached_cq_tail with cq.head in_io_uring_poll After 75b28af("io_uring: allocate the two rings together"), we compare sq.head with cached_cq_tail to determine does there any cq invalid. Actually, we should use cq.head. Fixes: 75b28affdd6a ("io_uring: allocate the two rings together") Signed-off-by: yangerkun Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ca7570aca430..9b84232e5cc4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3455,7 +3455,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != ctx->rings->sq_ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; - if (READ_ONCE(ctx->rings->sq.head) != ctx->cached_cq_tail) + if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; From d5880c7a8620290a6c90ced7a0e8bd0ad9419601 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 13 Sep 2019 18:17:11 +0300 Subject: [PATCH 0922/2880] fuse: fix missing unlock_page in fuse_writepage() unlock_page() was missing in case of an already in-flight write against the same page. Signed-off-by: Vasily Averin Fixes: ff17be086477 ("fuse: writepage: skip already in flight") Cc: # v3.13 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a2ea347c4d2c..8c7578b95d2c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1861,6 +1861,7 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) WARN_ON(wbc->sync_mode == WB_SYNC_ALL); redirty_page_for_writepage(wbc, page); + unlock_page(page); return 0; } From 30c6a23d34cbe19162240e9f9c2c122ba807e58c Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Mon, 16 Sep 2019 16:56:41 -0700 Subject: [PATCH 0923/2880] fuse: on 64-bit store time in d_fsdata directly Implements the optimization noted in commit f75fdf22b0a8 ("fuse: don't use ->d_time"), as the additional memory can be significant. (In particular, on SLAB configurations this 8-byte alloc becomes 32 bytes). Per-dentry, this can consume significant memory. Reviewed-by: Shakeel Butt Signed-off-by: Khazhismel Kumykov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ba0a175d7578..58557d4817e9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -24,11 +24,34 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } +#if BITS_PER_LONG >= 64 +static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_fsdata = (void *) time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return (u64)entry->d_fsdata; +} + +#else union fuse_dentry { u64 time; struct rcu_head rcu; }; +static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) +{ + ((union fuse_dentry *) dentry->d_fsdata)->time = time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return ((union fuse_dentry *) entry->d_fsdata)->time; +} +#endif + static void fuse_dentry_settime(struct dentry *dentry, u64 time) { struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); @@ -47,12 +70,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) spin_unlock(&dentry->d_lock); } - ((union fuse_dentry *) dentry->d_fsdata)->time = time; -} - -static inline u64 fuse_dentry_time(const struct dentry *entry) -{ - return ((union fuse_dentry *) entry->d_fsdata)->time; + __fuse_dentry_settime(dentry, time); } /* @@ -258,6 +276,7 @@ invalid: goto out; } +#if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); @@ -270,6 +289,7 @@ static void fuse_dentry_release(struct dentry *dentry) kfree_rcu(fd, rcu); } +#endif static int fuse_dentry_delete(const struct dentry *dentry) { @@ -279,13 +299,17 @@ static int fuse_dentry_delete(const struct dentry *dentry) const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; const struct dentry_operations fuse_root_dentry_operations = { +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; int fuse_valid_type(int m) From dc69e98c241e1456e37d73b862f7b8b8900ba50f Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 17 Sep 2019 12:35:33 -0700 Subject: [PATCH 0924/2880] fuse: kmemcg account fs data account per-file, dentry, and inode data blockdev/superblock and temporary per-request data was left alone, as this usually isn't accounted Reviewed-by: Shakeel Butt Signed-off-by: Khazhismel Kumykov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 3 ++- fs/fuse/file.c | 5 +++-- fs/fuse/inode.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 58557d4817e9..d572c900bb0f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -279,7 +279,8 @@ invalid: #if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8c7578b95d2c..0f0225686aee 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -63,12 +63,13 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; ff->fc = fc; - ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL); + ff->release_args = kzalloc(sizeof(*ff->release_args), + GFP_KERNEL_ACCOUNT); if (!ff->release_args) { kfree(ff); return NULL; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 10d193b24fb8..51cb471f4dc3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -66,7 +66,7 @@ static struct file_system_type fuseblk_fs_type; struct fuse_forget_link *fuse_alloc_forget(void) { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); } static struct inode *fuse_alloc_inode(struct super_block *sb) From 0ed4059302a72ad0d70c874b3f4fd5489da711c6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Sep 2019 21:58:16 +0200 Subject: [PATCH 0925/2880] fuse: unexport fuse_put_request This function has been made static, which now causes a compile-time warning: WARNING: "fuse_put_request" [vmlinux] is a static EXPORT_SYMBOL_GPL Remove the unneeded export. Fixes: 66abc3599c3c ("fuse: unexport request ops") Signed-off-by: Arnd Bergmann Reviewed-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b2e18861c59c..04fe6788e405 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -175,7 +175,6 @@ static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) fuse_request_free(req); } } -EXPORT_SYMBOL_GPL(fuse_put_request); unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) { From e5854b1cdf6cb48a20e01e3bdad0476a4c60a077 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 22 Sep 2019 06:19:36 -0700 Subject: [PATCH 0926/2880] fuse: fix beyond-end-of-page access in fuse_parse_cache() With DEBUG_PAGEALLOC on, the following triggers. BUG: unable to handle page fault for address: ffff88859367c000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 3001067 P4D 3001067 PUD 406d3a8067 PMD 406d30c067 PTE 800ffffa6c983060 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 38 PID: 3110657 Comm: python2.7 RIP: 0010:fuse_readdir+0x88f/0xe7a [fuse] Code: 49 8b 4d 08 49 39 4e 60 0f 84 44 04 00 00 48 8b 43 08 43 8d 1c 3c 4d 01 7e 68 49 89 dc 48 03 5c 24 38 49 89 46 60 8b 44 24 30 <8b> 4b 10 44 29 e0 48 89 ca 48 83 c1 1f 48 83 e1 f8 83 f8 17 49 89 RSP: 0018:ffffc90035edbde0 EFLAGS: 00010286 RAX: 0000000000001000 RBX: ffff88859367bff0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88859367bfed RDI: 0000000000920907 RBP: ffffc90035edbe90 R08: 000000000000014b R09: 0000000000000004 R10: ffff88859367b000 R11: 0000000000000000 R12: 0000000000000ff0 R13: ffffc90035edbee0 R14: ffff889fb8546180 R15: 0000000000000020 FS: 00007f80b5f4a740(0000) GS:ffff889fffa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff88859367c000 CR3: 0000001c170c2001 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: iterate_dir+0x122/0x180 __x64_sys_getdents+0xa6/0x140 do_syscall_64+0x42/0x100 entry_SYSCALL_64_after_hwframe+0x44/0xa9 It's in fuse_parse_cache(). %rbx (ffff88859367bff0) is fuse_dirent pointer - addr + offset. FUSE_DIRENT_SIZE() is trying to dereference namelen off of it but that derefs into the next page which is disabled by pagealloc debug causing a PF. This is caused by dirent->namelen being accessed before ensuring that there's enough bytes in the page for the dirent. Fix it by pushing down reclen calculation. Signed-off-by: Tejun Heo Fixes: 5d7bc7e8680c ("fuse: allow using readdir cache") Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: Miklos Szeredi --- fs/fuse/readdir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index fae025db06f4..5c38b9d84c6e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -386,11 +386,13 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, for (;;) { struct fuse_dirent *dirent = addr + offset; unsigned int nbytes = size - offset; - size_t reclen = FUSE_DIRENT_SIZE(dirent); + size_t reclen; if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) break; + reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */ + if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) return FOUND_ERR; if (WARN_ON(reclen > nbytes)) From 9ad09b1976c562061636ff1e01bfc3a57aebe56b Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 14 Aug 2019 15:59:09 +0800 Subject: [PATCH 0927/2880] fuse: fix memleak in cuse_channel_open If cuse_send_init fails, need to fuse_conn_put cc->fc. cuse_channel_open->fuse_conn_init->refcount_set(&fc->count, 1) ->fuse_dev_alloc->fuse_conn_get ->fuse_dev_free->fuse_conn_put Fixes: cc080e9e9be1 ("fuse: introduce per-instance fuse_dev structure") Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 45762bb7a934..00015d851382 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -521,6 +521,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) rc = cuse_send_init(cc); if (rc) { fuse_dev_free(fud); + fuse_conn_put(&cc->fc); return rc; } file->private_data = fud; From 5addcd5dbd8c2d2bcf719a2eb41a9b43bf9a7935 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 13:52:31 +0800 Subject: [PATCH 0928/2880] fuse: Make fuse_args_to_req static Fix sparse warning: fs/fuse/dev.c:468:6: warning: symbol 'fuse_args_to_req' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Fixes: 68583165f962 ("fuse: add pages to fuse_args") Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 04fe6788e405..dadd617d826c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -464,7 +464,7 @@ static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } -void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) +static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) { req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; From a06dcd625d6181747fac7f4c140b5a4c397a778c Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 12 Sep 2019 09:55:03 -0700 Subject: [PATCH 0929/2880] kvm: x86: Add "significant index" flag to a few CPUID leaves According to the Intel SDM, volume 2, "CPUID," the index is significant (or partially significant) for CPUID leaves 0FH, 10H, 12H, 17H, 18H, and 1FH. Add the corresponding flag to these CPUID leaves in do_host_cpuid(). Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Steve Rutherford Fixes: a87f2d3a6eadab ("KVM: x86: Add Intel CPUID.1F cpuid emulation support") Reviewed-by: Krish Sadhukhan Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 4e5a835989d1..63316036f85a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -304,7 +304,13 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, case 7: case 0xb: case 0xd: + case 0xf: + case 0x10: + case 0x12: case 0x14: + case 0x17: + case 0x18: + case 0x1f: case 0x8000001d: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; break; From 0cb8410b90e78948984f35f2c4d50c2c0b7ee675 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 19 Sep 2019 15:59:17 -0700 Subject: [PATCH 0930/2880] kvm: svm: Intercept RDPRU The RDPRU instruction gives the guest read access to the IA32_APERF MSR and the IA32_MPERF MSR. According to volume 3 of the APM, "When virtualization is enabled, this instruction can be intercepted by the Hypervisor. The intercept bit is at VMCB byte offset 10h, bit 14." Since we don't enumerate the instruction in KVM_SUPPORTED_CPUID, intercept it and synthesize #UD. Signed-off-by: Jim Mattson Reviewed-by: Drew Schmitt Reviewed-by: Jacob Xu Reviewed-by: Peter Shier Reviewed-by: Krish Sadhukhan Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 1 + arch/x86/include/uapi/asm/svm.h | 1 + arch/x86/kvm/svm.c | 8 ++++++++ 3 files changed, 10 insertions(+) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index dec9c1e84c78..6ece8561ba66 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -52,6 +52,7 @@ enum { INTERCEPT_MWAIT, INTERCEPT_MWAIT_COND, INTERCEPT_XSETBV, + INTERCEPT_RDPRU, }; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index a9731f8a480f..2e8a30f06c74 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -75,6 +75,7 @@ #define SVM_EXIT_MWAIT 0x08b #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d +#define SVM_EXIT_RDPRU 0x08e #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce75296b1b10..f8ecb6df5106 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1540,6 +1540,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); set_intercept(svm, INTERCEPT_XSETBV); + set_intercept(svm, INTERCEPT_RDPRU); set_intercept(svm, INTERCEPT_RSM); if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { @@ -3832,6 +3833,12 @@ static int xsetbv_interception(struct vcpu_svm *svm) return 1; } +static int rdpru_interception(struct vcpu_svm *svm) +{ + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; @@ -4783,6 +4790,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MONITOR] = monitor_interception, [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, + [SVM_EXIT_RDPRU] = rdpru_interception, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, From f0b5105af6e0ca781fcc9aaf9277635ea7df2d36 Mon Sep 17 00:00:00 2001 From: Marc Orr Date: Tue, 17 Sep 2019 11:50:57 -0700 Subject: [PATCH 0931/2880] kvm: nvmx: limit atomic switch MSRs Allowing an unlimited number of MSRs to be specified via the VMX load/store MSR lists (e.g., vm-entry MSR load list) is bad for two reasons. First, a guest can specify an unreasonable number of MSRs, forcing KVM to process all of them in software. Second, the SDM bounds the number of MSRs allowed to be packed into the atomic switch MSR lists. Quoting the "Miscellaneous Data" section in the "VMX Capability Reporting Facility" appendix: "Bits 27:25 is used to compute the recommended maximum number of MSRs that should appear in the VM-exit MSR-store list, the VM-exit MSR-load list, or the VM-entry MSR-load list. Specifically, if the value bits 27:25 of IA32_VMX_MISC is N, then 512 * (N + 1) is the recommended maximum number of MSRs to be included in each list. If the limit is exceeded, undefined processor behavior may result (including a machine check during the VMX transition)." Because KVM needs to protect itself and can't model "undefined processor behavior", arbitrarily force a VM-entry to fail due to MSR loading when the MSR load list is too large. Similarly, trigger an abort during a VM exit that encounters an MSR load list or MSR store list that is too large. The MSR list size is intentionally not pre-checked so as to maintain compatibility with hardware inasmuch as possible. Test these new checks with the kvm-unit-test "x86: nvmx: test max atomic switch MSRs". Suggested-by: Jim Mattson Reviewed-by: Jim Mattson Reviewed-by: Peter Shier Signed-off-by: Marc Orr Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx/nested.c | 43 ++++++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index fdad81626829..1835767aa335 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -111,6 +111,7 @@ #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 #define VMX_MISC_ZERO_LEN_INS 0x40000000 +#define VMX_MISC_MSR_LIST_MULTIPLIER 512 /* VMFUNC functions */ #define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 77548ef1ad3b..70d59d9304f2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -198,6 +198,16 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); } +static inline bool vmx_control_verify(u32 control, u32 low, u32 high) +{ + return fixed_bits_valid(control, low, high); +} + +static inline u64 vmx_control_msr(u32 low, u32 high) +{ + return low | ((u64)high << 32); +} + static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); @@ -866,16 +876,34 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, return 0; } +static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; +} + /* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. + * + * One of the failure modes for MSR load/store is when a list exceeds the + * virtual hardware's capacity. To maintain compatibility with hardware inasmuch + * as possible, process all valid entries before failing rather than precheck + * for a capacity violation. */ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u32 i; struct vmx_msr_entry e; + u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { + if (unlikely(i >= max_msr_list_size)) + goto fail; + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { pr_debug_ratelimited( @@ -906,8 +934,12 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u64 data; u32 i; struct vmx_msr_entry e; + u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { + if (unlikely(i >= max_msr_list_size)) + return -EINVAL; + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, 2 * sizeof(u32))) { @@ -1013,17 +1045,6 @@ static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; } - -static inline bool vmx_control_verify(u32 control, u32 low, u32 high) -{ - return fixed_bits_valid(control, low, high); -} - -static inline u64 vmx_control_msr(u32 low, u32 high) -{ - return low | ((u64)high << 32); -} - static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) { superset &= mask; From 3d66b89c30f9220a72e92847768fc8ba4d027d88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Sep 2019 12:57:04 -0700 Subject: [PATCH 0932/2880] net: sched: fix possible crash in tcf_action_destroy() If the allocation done in tcf_exts_init() failed, we end up with a NULL pointer in exts->actions. kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8198 Comm: syz-executor.3 Not tainted 5.3.0-rc8+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:tcf_action_destroy+0x71/0x160 net/sched/act_api.c:705 Code: c3 08 44 89 ee e8 4f cb bb fb 41 83 fd 20 0f 84 c9 00 00 00 e8 c0 c9 bb fb 48 89 d8 48 b9 00 00 00 00 00 fc ff df 48 c1 e8 03 <80> 3c 08 00 0f 85 c0 00 00 00 4c 8b 33 4d 85 f6 0f 84 9d 00 00 00 RSP: 0018:ffff888096e16ff0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: dffffc0000000000 RDX: 0000000000040000 RSI: ffffffff85b6ab30 RDI: 0000000000000000 RBP: ffff888096e17020 R08: ffff8880993f6140 R09: fffffbfff11cae67 R10: fffffbfff11cae66 R11: ffffffff88e57333 R12: 0000000000000000 R13: 0000000000000000 R14: ffff888096e177a0 R15: 0000000000000001 FS: 00007f62bc84a700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000758040 CR3: 0000000088b64000 CR4: 00000000001426e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcf_exts_destroy+0x38/0xb0 net/sched/cls_api.c:3030 tcindex_set_parms+0xf7f/0x1e50 net/sched/cls_tcindex.c:488 tcindex_change+0x230/0x318 net/sched/cls_tcindex.c:519 tc_new_tfilter+0xa4b/0x1c70 net/sched/cls_api.c:2152 rtnetlink_rcv_msg+0x838/0xb00 net/core/rtnetlink.c:5214 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5241 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] Fixes: 90b73b77d08e ("net: sched: change action API to use array of pointers to actions") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Vlad Buslov Cc: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 32577c248968..64584a1df425 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2894,8 +2894,10 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); - kfree(exts->actions); + if (exts->actions) { + tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); + kfree(exts->actions); + } exts->nr_actions = 0; #endif } From b91ee4aa2a2199ba4d4650706c272985a5a32d80 Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:45 +0200 Subject: [PATCH 0933/2880] mISDN: enforce CAP_NET_RAW for raw sockets When creating a raw AF_ISDN socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- drivers/isdn/mISDN/socket.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index c6ba37df4b9d..dff4132b3702 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -754,6 +754,8 @@ base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; + if (!capable(CAP_NET_RAW)) + return -EPERM; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) From 6cc03e8aa36c51f3b26a0d21a3c4ce2809c842ac Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:46 +0200 Subject: [PATCH 0934/2880] appletalk: enforce CAP_NET_RAW for raw sockets When creating a raw AF_APPLETALK socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 4072e9d394d6..b41375d4d295 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1023,6 +1023,11 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol, */ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) goto out; + + rc = -EPERM; + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out; + rc = -ENOMEM; sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern); if (!sk) From 0614e2b73768b502fc32a75349823356d98aae2c Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:47 +0200 Subject: [PATCH 0935/2880] ax25: enforce CAP_NET_RAW for raw sockets When creating a raw AF_AX25 socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index ca5207767dc2..bb222b882b67 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -855,6 +855,8 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, break; case SOCK_RAW: + if (!capable(CAP_NET_RAW)) + return -EPERM; break; default: return -ESOCKTNOSUPPORT; From e69dbd4619e7674c1679cba49afd9dd9ac347eef Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:48 +0200 Subject: [PATCH 0936/2880] ieee802154: enforce CAP_NET_RAW for raw sockets When creating a raw AF_IEEE802154 socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Acked-by: Stefan Schmidt Signed-off-by: David S. Miller --- net/ieee802154/socket.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index badc5cfe4dc6..d93d4531aa9b 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1008,6 +1008,9 @@ static int ieee802154_create(struct net *net, struct socket *sock, switch (sock->type) { case SOCK_RAW: + rc = -EPERM; + if (!capable(CAP_NET_RAW)) + goto out; proto = &ieee802154_raw_prot; ops = &ieee802154_raw_ops; break; From 3a359798b176183ef09efb7a3dc59abad1cc7104 Mon Sep 17 00:00:00 2001 From: Ori Nimron Date: Fri, 20 Sep 2019 09:35:49 +0200 Subject: [PATCH 0937/2880] nfc: enforce CAP_NET_RAW for raw sockets When creating a raw AF_NFC socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Ori Nimron Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/nfc/llcp_sock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 9b8742947aff..8dfea26536c9 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -1004,10 +1004,13 @@ static int llcp_sock_create(struct net *net, struct socket *sock, sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - if (sock->type == SOCK_RAW) + if (sock->type == SOCK_RAW) { + if (!capable(CAP_NET_RAW)) + return -EPERM; sock->ops = &llcp_rawsock_ops; - else + } else { sock->ops = &llcp_sock_ops; + } sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) From 13fc1d271a2e3ab8a02071e711add01fab9271f6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2019 10:49:54 +0100 Subject: [PATCH 0938/2880] Btrfs: fix race setting up and completing qgroup rescan workers There is a race between setting up a qgroup rescan worker and completing a qgroup rescan worker that can lead to callers of the qgroup rescan wait ioctl to either not wait for the rescan worker to complete or to hang forever due to missing wake ups. The following diagram shows a sequence of steps that illustrates the race. CPU 1 CPU 2 CPU 3 btrfs_ioctl_quota_rescan() btrfs_qgroup_rescan() qgroup_rescan_init() mutex_lock(&fs_info->qgroup_rescan_lock) spin_lock(&fs_info->qgroup_lock) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN init_completion( &fs_info->qgroup_rescan_completion) fs_info->qgroup_rescan_running = true mutex_unlock(&fs_info->qgroup_rescan_lock) spin_unlock(&fs_info->qgroup_lock) btrfs_init_work() --> starts the worker btrfs_qgroup_rescan_worker() mutex_lock(&fs_info->qgroup_rescan_lock) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN mutex_unlock(&fs_info->qgroup_rescan_lock) starts transaction, updates qgroup status item, etc btrfs_ioctl_quota_rescan() btrfs_qgroup_rescan() qgroup_rescan_init() mutex_lock(&fs_info->qgroup_rescan_lock) spin_lock(&fs_info->qgroup_lock) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN init_completion( &fs_info->qgroup_rescan_completion) fs_info->qgroup_rescan_running = true mutex_unlock(&fs_info->qgroup_rescan_lock) spin_unlock(&fs_info->qgroup_lock) btrfs_init_work() --> starts another worker mutex_lock(&fs_info->qgroup_rescan_lock) fs_info->qgroup_rescan_running = false mutex_unlock(&fs_info->qgroup_rescan_lock) complete_all(&fs_info->qgroup_rescan_completion) Before the rescan worker started by the task at CPU 3 completes, if another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at fs_info->qgroup_flags, which is expected and correct behaviour. However if other task calls btrfs_ioctl_quota_rescan_wait() before the rescan worker started by the task at CPU 3 completes, it will return immediately without waiting for the new rescan worker to complete, because fs_info->qgroup_rescan_running is set to false by CPU 2. This race is making test case btrfs/171 (from fstests) to fail often: btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad) --- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100 +++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100 @@ -1,2 +1,3 @@ QA output created by 171 +ERROR: quota rescan failed: Operation now in progress Silence is golden ... (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff) That is because the test calls the btrfs-progs commands "qgroup quota rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs" commands 'qgroup assign' and 'qgroup remove' often call the rescan start ioctl after calling the qgroup assign ioctl, btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait for a rescan worker to complete. Another problem the race can cause is missing wake ups for waiters, since the call to complete_all() happens outside a critical section and after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence diagram above, if we have a waiter for the first rescan task (executed by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and before it calls complete_all() against fs_info->qgroup_rescan_completion, the task at CPU 3 calls init_completion() against fs_info->qgroup_rescan_completion which re-initilizes its wait queue to an empty queue, therefore causing the rescan worker at CPU 2 to call complete_all() against an empty queue, never waking up the task waiting for that rescan worker. Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting fs_info->qgroup_rescan_running to false in the same critical section, delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the call to complete_all() in that same critical section. This gives the protection needed to avoid rescan wait ioctl callers not waiting for a running rescan worker and the lost wake ups problem, since setting that rescan flag and boolean as well as initializing the wait queue is done already in a critical section delimited by that mutex (at qgroup_rescan_init()). Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion") Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8d3bd799ac7d..52701c1be109 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3166,9 +3166,6 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (!btrfs_fs_closing(fs_info)) - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -3184,16 +3181,30 @@ out: trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", err); - goto done; } - ret = update_qgroup_status_item(trans); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", err); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (!btrfs_fs_closing(fs_info)) + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + if (trans) { + ret = update_qgroup_status_item(trans); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d", + err); + } } + fs_info->qgroup_rescan_running = false; + complete_all(&fs_info->qgroup_rescan_completion); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (!trans) + return; + btrfs_end_transaction(trans); if (btrfs_fs_closing(fs_info)) { @@ -3204,12 +3215,6 @@ out: } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } - -done: - mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = false; - mutex_unlock(&fs_info->qgroup_rescan_lock); - complete_all(&fs_info->qgroup_rescan_completion); } /* From 9d4d0d06bbf9f7e576b0ebbb2f77672d0fc7f503 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 22 Sep 2019 15:36:03 +0200 Subject: [PATCH 0939/2880] mt76: mt7615: fix mt7615 firmware path definitions mt7615 patch/n9/cr4 firmwares are available in mediatek folder in linux-firmware repository. Because of this mt7615 won't work on regular distributions like Ubuntu. Fix path definitions. Moreover remove useless firmware name pointers and use definitions directly Fixes: 04b8e65922f6 ("mt76: add mac80211 driver for MT7615 PCIe-based chipsets") Cc: stable@vger.kernel.org Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt7615/mcu.c | 11 ++++------- drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h | 6 +++--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c index 275d5eaed3b7..842cd81704db 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c @@ -333,7 +333,6 @@ static int mt7615_driver_own(struct mt7615_dev *dev) static int mt7615_load_patch(struct mt7615_dev *dev) { - const char *firmware = MT7615_ROM_PATCH; const struct mt7615_patch_hdr *hdr; const struct firmware *fw = NULL; int len, ret, sem; @@ -349,7 +348,7 @@ static int mt7615_load_patch(struct mt7615_dev *dev) return -EAGAIN; } - ret = request_firmware(&fw, firmware, dev->mt76.dev); + ret = request_firmware(&fw, MT7615_ROM_PATCH, dev->mt76.dev); if (ret) goto out; @@ -447,13 +446,11 @@ mt7615_mcu_send_ram_firmware(struct mt7615_dev *dev, static int mt7615_load_ram(struct mt7615_dev *dev) { - const struct firmware *fw; const struct mt7615_fw_trailer *hdr; - const char *n9_firmware = MT7615_FIRMWARE_N9; - const char *cr4_firmware = MT7615_FIRMWARE_CR4; + const struct firmware *fw; int ret; - ret = request_firmware(&fw, n9_firmware, dev->mt76.dev); + ret = request_firmware(&fw, MT7615_FIRMWARE_N9, dev->mt76.dev); if (ret) return ret; @@ -482,7 +479,7 @@ static int mt7615_load_ram(struct mt7615_dev *dev) release_firmware(fw); - ret = request_firmware(&fw, cr4_firmware, dev->mt76.dev); + ret = request_firmware(&fw, MT7615_FIRMWARE_CR4, dev->mt76.dev); if (ret) return ret; diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h index cef3fd43cb00..7963e302d705 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h @@ -26,9 +26,9 @@ #define MT7615_RX_RING_SIZE 1024 #define MT7615_RX_MCU_RING_SIZE 512 -#define MT7615_FIRMWARE_CR4 "mt7615_cr4.bin" -#define MT7615_FIRMWARE_N9 "mt7615_n9.bin" -#define MT7615_ROM_PATCH "mt7615_rom_patch.bin" +#define MT7615_FIRMWARE_CR4 "mediatek/mt7615_cr4.bin" +#define MT7615_FIRMWARE_N9 "mediatek/mt7615_n9.bin" +#define MT7615_ROM_PATCH "mediatek/mt7615_rom_patch.bin" #define MT7615_EEPROM_SIZE 1024 #define MT7615_TOKEN_SIZE 4096 From fddbfeece9c7882cc47754c7da460fe427e3e85b Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Tue, 24 Sep 2019 13:30:57 +0300 Subject: [PATCH 0940/2880] iwlwifi: fw: don't send GEO_TX_POWER_LIMIT command to FW version 36 The intention was to have the GEO_TX_POWER_LIMIT command in FW version 36 as well, but not all 8000 family got this feature enabled. The 8000 family is the only one using version 36, so skip this version entirely. If we try to send this command to the firmwares that do not support it, we get a BAD_COMMAND response from the firmware. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=204151. Cc: stable@vger.kernel.org # 4.19+ Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index 014eca6596e2..32a5e4e5461f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -889,11 +889,13 @@ static bool iwl_mvm_sar_geo_support(struct iwl_mvm *mvm) * firmware versions. Unfortunately, we don't have a TLV API * flag to rely on, so rely on the major version which is in * the first byte of ucode_ver. This was implemented - * initially on version 38 and then backported to 36, 29 and - * 17. + * initially on version 38 and then backported to29 and 17. + * The intention was to have it in 36 as well, but not all + * 8000 family got this feature enabled. The 8000 family is + * the only one using version 36, so skip this version + * entirely. */ return IWL_UCODE_SERIAL(mvm->fw->ucode_ver) >= 38 || - IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 36 || IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 29 || IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 17; } From 02a07046834e64970f3bcd87a422ac2b0adb80de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 20 Sep 2019 16:08:21 +0200 Subject: [PATCH 0941/2880] arcnet: provide a buffer big enough to actually receive packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct archdr is only big enough to hold the header of various types of arcnet packets. So to provide enough space to hold the data read from hardware provide a buffer large enough to hold a packet with maximal size. The problem was noticed by the stack protector which makes the kernel oops. Signed-off-by: Uwe Kleine-König Acked-by: Michael Grzeschik Signed-off-by: David S. Miller --- drivers/net/arcnet/arcnet.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index 8459115d9d4e..553776cc1d29 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -1063,31 +1063,34 @@ EXPORT_SYMBOL(arcnet_interrupt); static void arcnet_rx(struct net_device *dev, int bufnum) { struct arcnet_local *lp = netdev_priv(dev); - struct archdr pkt; + union { + struct archdr pkt; + char buf[512]; + } rxdata; struct arc_rfc1201 *soft; int length, ofs; - soft = &pkt.soft.rfc1201; + soft = &rxdata.pkt.soft.rfc1201; - lp->hw.copy_from_card(dev, bufnum, 0, &pkt, ARC_HDR_SIZE); - if (pkt.hard.offset[0]) { - ofs = pkt.hard.offset[0]; + lp->hw.copy_from_card(dev, bufnum, 0, &rxdata.pkt, ARC_HDR_SIZE); + if (rxdata.pkt.hard.offset[0]) { + ofs = rxdata.pkt.hard.offset[0]; length = 256 - ofs; } else { - ofs = pkt.hard.offset[1]; + ofs = rxdata.pkt.hard.offset[1]; length = 512 - ofs; } /* get the full header, if possible */ - if (sizeof(pkt.soft) <= length) { - lp->hw.copy_from_card(dev, bufnum, ofs, soft, sizeof(pkt.soft)); + if (sizeof(rxdata.pkt.soft) <= length) { + lp->hw.copy_from_card(dev, bufnum, ofs, soft, sizeof(rxdata.pkt.soft)); } else { - memset(&pkt.soft, 0, sizeof(pkt.soft)); + memset(&rxdata.pkt.soft, 0, sizeof(rxdata.pkt.soft)); lp->hw.copy_from_card(dev, bufnum, ofs, soft, length); } arc_printk(D_DURING, dev, "Buffer #%d: received packet from %02Xh to %02Xh (%d+4 bytes)\n", - bufnum, pkt.hard.source, pkt.hard.dest, length); + bufnum, rxdata.pkt.hard.source, rxdata.pkt.hard.dest, length); dev->stats.rx_packets++; dev->stats.rx_bytes += length + ARC_HDR_SIZE; @@ -1096,13 +1099,13 @@ static void arcnet_rx(struct net_device *dev, int bufnum) if (arc_proto_map[soft->proto]->is_ip) { if (BUGLVL(D_PROTO)) { struct ArcProto - *oldp = arc_proto_map[lp->default_proto[pkt.hard.source]], + *oldp = arc_proto_map[lp->default_proto[rxdata.pkt.hard.source]], *newp = arc_proto_map[soft->proto]; if (oldp != newp) { arc_printk(D_PROTO, dev, "got protocol %02Xh; encap for host %02Xh is now '%c' (was '%c')\n", - soft->proto, pkt.hard.source, + soft->proto, rxdata.pkt.hard.source, newp->suffix, oldp->suffix); } } @@ -1111,10 +1114,10 @@ static void arcnet_rx(struct net_device *dev, int bufnum) lp->default_proto[0] = soft->proto; /* in striking contrast, the following isn't a hack. */ - lp->default_proto[pkt.hard.source] = soft->proto; + lp->default_proto[rxdata.pkt.hard.source] = soft->proto; } /* call the protocol-specific receiver. */ - arc_proto_map[soft->proto]->rx(dev, bufnum, &pkt, length); + arc_proto_map[soft->proto]->rx(dev, bufnum, &rxdata.pkt, length); } static void null_rx(struct net_device *dev, int bufnum, From 5aafeb74b5bb65b34cc87c7623f9fa163a34fa3b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 20 Sep 2019 18:18:26 +0200 Subject: [PATCH 0942/2880] skge: fix checksum byte order Running old skge driver on PowerPC causes checksum errors because hardware reported 1's complement checksum is in little-endian byte order. Reported-by: Benoit Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/skge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c index 0a2ec387a482..095f6c71b4fa 100644 --- a/drivers/net/ethernet/marvell/skge.c +++ b/drivers/net/ethernet/marvell/skge.c @@ -3108,7 +3108,7 @@ static struct sk_buff *skge_rx_get(struct net_device *dev, skb_put(skb, len); if (dev->features & NETIF_F_RXCSUM) { - skb->csum = csum; + skb->csum = le16_to_cpu(csum); skb->ip_summed = CHECKSUM_COMPLETE; } From 6f4ff81a4602dcfba436c6e2307d61ce9e9f652c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 23 Sep 2019 16:53:37 -0700 Subject: [PATCH 0943/2880] xfs: log proper length of superblock xfs_trans_log_buf takes first byte, last byte as args. In this case, it should be from 0 to sizeof() - 1. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_sb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a08dd8f40346..ac6cdca63e15 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -928,7 +928,7 @@ xfs_log_sb( xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); - xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } /* From 4ef5d76b453908f21341e661a9b6f96862f6f589 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 24 Sep 2019 09:24:28 -0700 Subject: [PATCH 0944/2880] ARM: dts: Fix gpio0 flags for am335x-icev2 The ti,no-idle-on-init and ti,no-reset-on-init flags need to be at the interconnect target module level for the modules that have it defined. Otherwise we get the following warnings: dts flag should be at module level for ti,no-idle-on-init dts flag should be at module level for ti,no-reset-on-init Fixes: 87fc89ced3a7 ("ARM: dts: am335x: Move l4 child devices to probe them with ti-sysc") Cc: Lokesh Vutla Reported-by: Suman Anna Reviewed-by: Lokesh Vutla Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am335x-icev2.dts | 2 +- arch/arm/boot/dts/am33xx-l4.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/am335x-icev2.dts b/arch/arm/boot/dts/am335x-icev2.dts index 18f70b35da4c..204bccfcc110 100644 --- a/arch/arm/boot/dts/am335x-icev2.dts +++ b/arch/arm/boot/dts/am335x-icev2.dts @@ -432,7 +432,7 @@ pinctrl-0 = <&mmc0_pins_default>; }; -&gpio0 { +&gpio0_target { /* Do not idle the GPIO used for holding the VTT regulator */ ti,no-reset-on-init; ti,no-idle-on-init; diff --git a/arch/arm/boot/dts/am33xx-l4.dtsi b/arch/arm/boot/dts/am33xx-l4.dtsi index 46849d6ecb3e..1515f4f91499 100644 --- a/arch/arm/boot/dts/am33xx-l4.dtsi +++ b/arch/arm/boot/dts/am33xx-l4.dtsi @@ -127,7 +127,7 @@ ranges = <0x0 0x5000 0x1000>; }; - target-module@7000 { /* 0x44e07000, ap 14 20.0 */ + gpio0_target: target-module@7000 { /* 0x44e07000, ap 14 20.0 */ compatible = "ti,sysc-omap2", "ti,sysc"; ti,hwmods = "gpio1"; reg = <0x7000 0x4>, From 88d32d3983e72f2a7de72a49b701e2529c48e9c1 Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Tue, 24 Sep 2019 08:00:50 -0700 Subject: [PATCH 0945/2880] xfs: avoid unused to_mp() function warning to_mp() was first introduced with the following commit: 'commit 801cc4e17a34c ("xfs: debug mode forced buffered write failure")' But the user of to_mp() was removed by below commit: 'commit f8c47250ba46e ("xfs: convert drop_writes to use the errortag mechanism")' So kernel build with clang throws below warning message: fs/xfs/xfs_sysfs.c:72:1: warning: unused function 'to_mp' [-Wunused-function] to_mp(struct kobject *kobject) Hence to_mp() might be removed safely to get rid of warning message. Signed-off-by: Austin Kim Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_sysfs.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index ddd0bf7a4740..f1bc88f4367c 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -63,19 +63,6 @@ static const struct sysfs_ops xfs_sysfs_ops = { .store = xfs_sysfs_object_store, }; -/* - * xfs_mount kobject. The mp kobject also serves as the per-mount parent object - * that is identified by the fsname under sysfs. - */ - -static inline struct xfs_mount * -to_mp(struct kobject *kobject) -{ - struct xfs_kobj *kobj = to_kobj(kobject); - - return container_of(kobj, struct xfs_mount, m_kobj); -} - static struct attribute *xfs_mp_attrs[] = { NULL, }; From a6f197f889ce0a5fd583674d9fa39259f5c8f72f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:54:40 +0530 Subject: [PATCH 0946/2880] powerpc/book3s64: Export has_transparent_hugepage() related functions. In later patch, we want to use hash_transparent_hugepage() in a kernel module. Export two related functions. Acked-by: Michael Ellerman Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190924042440.27946-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- arch/powerpc/include/asm/book3s/64/radix.h | 8 +++++++- arch/powerpc/mm/book3s64/hash_pgtable.c | 2 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 7 ------- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index e04a839cb5b9..65a6966f1de4 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -254,7 +254,13 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); -extern int radix__has_transparent_hugepage(void); +static inline int radix__has_transparent_hugepage(void) +{ + /* For radix 2M at PMD level means thp */ + if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) + return 1; + return 0; +} #endif extern int __meminit radix__vmemmap_create_mapping(unsigned long start, diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index d1f390ac9cdb..64733b9cb20a 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -406,6 +406,8 @@ int hash__has_transparent_hugepage(void) return 1; } +EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_STRICT_KERNEL_RWX diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index b4ca9e95e678..dc7a38f0a45b 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1057,13 +1057,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } -int radix__has_transparent_hugepage(void) -{ - /* For radix 2M at PMD level means thp */ - if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) - return 1; - return 0; -} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, From f537669978a7abae29c1d3b489f300e4d8f47005 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Sep 2019 21:16:03 +0530 Subject: [PATCH 0947/2880] libnvdimm/dax: Pick the right alignment default when creating dax devices Allow arch to provide the supported alignments and use hugepage alignment only if we support hugepage. Right now we depend on compile time configs whereas this patch switch this to runtime discovery. Architectures like ppc64 can have THP enabled in code, but then can have hugepage size disabled by the hypervisor. This allows us to create dax devices with PAGE_SIZE alignment in this case. Existing dax namespace with alignment larger than PAGE_SIZE will fail to initialize in this specific case. We still allow fsdax namespace initialization. With respect to identifying whether to enable hugepage fault for a dax device, if THP is enabled during compile, we default to taking hugepage fault and in dax fault handler if we find the fault size > alignment we retry with PAGE_SIZE fault size. This also addresses the below failure scenario on ppc64 ndctl create-namespace --mode=devdax | grep align "align":16777216, "align":16777216 cat /sys/devices/ndbus0/region0/dax0.0/supported_alignments 65536 16777216 daxio.static-debug -z -o /dev/dax0.0 Bus error (core dumped) $ dmesg | tail lpar: Failed hash pte insert with error -4 hash-mmu: mm: Hashing failure ! EA=0x7fff17000000 access=0x8000000000000006 current=daxio hash-mmu: trap=0x300 vsid=0x22cb7a3 ssize=1 base psize=2 psize 10 pte=0xc000000501002b86 daxio[3860]: bus error (7) at 7fff17000000 nip 7fff973c007c lr 7fff973bff34 code 2 in libpmem.so.1.0.0[7fff973b0000+20000] daxio[3860]: code: 792945e4 7d494b78 e95f0098 7d494b78 f93f00a0 4800012c e93f0088 f93f0120 daxio[3860]: code: e93f00a0 f93f0128 e93f0120 e95f0128 e93f0088 39290008 f93f0110 The failure was due to guest kernel using wrong page size. The namespaces created with 16M alignment will appear as below on a config with 16M page size disabled. $ ndctl list -Ni [ { "dev":"namespace0.1", "mode":"fsdax", "map":"dev", "size":5351931904, "uuid":"fc6e9667-461a-4718-82b4-69b24570bddb", "align":16777216, "blockdev":"pmem0.1", "supported_alignments":[ 65536 ] }, { "dev":"namespace0.0", "mode":"fsdax", <==== devdax 16M alignment marked disabled. "map":"mem", "size":5368709120, "uuid":"a4bdf81a-f2ee-4bc6-91db-7b87eddd0484", "state":"disabled" } ] Cc: linux-mm@kvack.org Cc: "Kirill A. Shutemov" Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190905154603.10349-8-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- drivers/nvdimm/nd.h | 6 +--- drivers/nvdimm/pfn_devs.c | 75 ++++++++++++++++++++++++++++----------- include/linux/huge_mm.h | 7 +++- 3 files changed, 61 insertions(+), 27 deletions(-) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index e89af4b2d8e9..ee5c04070ef9 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -289,11 +289,7 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region) struct nd_pfn *to_nd_pfn(struct device *dev); #if IS_ENABLED(CONFIG_NVDIMM_PFN) -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE -#else -#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE -#endif +#define MAX_NVDIMM_ALIGN 4 int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns); bool is_nd_pfn(struct device *dev); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index ce9ef18282dd..934cdcaaae97 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -103,39 +103,42 @@ static ssize_t align_show(struct device *dev, return sprintf(buf, "%ld\n", nd_pfn->align); } -static const unsigned long *nd_pfn_supported_alignments(void) +static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments) { - /* - * This needs to be a non-static variable because the *_SIZE - * macros aren't always constants. - */ - const unsigned long supported_alignments[] = { - PAGE_SIZE, -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - HPAGE_PMD_SIZE, -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD - HPAGE_PUD_SIZE, -#endif -#endif - 0, - }; - static unsigned long data[ARRAY_SIZE(supported_alignments)]; - memcpy(data, supported_alignments, sizeof(data)); + alignments[0] = PAGE_SIZE; - return data; + if (has_transparent_hugepage()) { + alignments[1] = HPAGE_PMD_SIZE; + if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + alignments[2] = HPAGE_PUD_SIZE; + } + + return alignments; +} + +/* + * Use pmd mapping if supported as default alignment + */ +static unsigned long nd_pfn_default_alignment(void) +{ + + if (has_transparent_hugepage()) + return HPAGE_PMD_SIZE; + return PAGE_SIZE; } static ssize_t align_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; ssize_t rc; nd_device_lock(dev); nvdimm_bus_lock(dev); rc = nd_size_select_store(dev, buf, &nd_pfn->align, - nd_pfn_supported_alignments()); + nd_pfn_supported_alignments(aligns)); dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); @@ -259,7 +262,10 @@ static DEVICE_ATTR_RO(size); static ssize_t supported_alignments_show(struct device *dev, struct device_attribute *attr, char *buf) { - return nd_size_select_show(0, nd_pfn_supported_alignments(), buf); + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + return nd_size_select_show(0, + nd_pfn_supported_alignments(aligns), buf); } static DEVICE_ATTR_RO(supported_alignments); @@ -302,7 +308,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn, return NULL; nd_pfn->mode = PFN_MODE_NONE; - nd_pfn->align = PFN_DEFAULT_ALIGNMENT; + nd_pfn->align = nd_pfn_default_alignment(); dev = &nd_pfn->dev; device_initialize(&nd_pfn->dev); if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) { @@ -412,6 +418,21 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) return 0; } +static bool nd_supported_alignment(unsigned long align) +{ + int i; + unsigned long supported[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + if (align == 0) + return false; + + nd_pfn_supported_alignments(supported); + for (i = 0; supported[i]; i++) + if (align == supported[i]) + return true; + return false; +} + /** * nd_pfn_validate - read and validate info-block * @nd_pfn: fsdax namespace runtime state / properties @@ -496,6 +517,18 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) return -EOPNOTSUPP; } + /* + * Check whether the we support the alignment. For Dax if the + * superblock alignment is not matching, we won't initialize + * the device. + */ + if (!nd_supported_alignment(align) && + !memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) { + dev_err(&nd_pfn->dev, "init failed, alignment mismatch: " + "%ld:%ld\n", nd_pfn->align, align); + return -EOPNOTSUPP; + } + if (!nd_pfn->uuid) { /* * When probing a namepace via nd_pfn_probe() the uuid diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 45ede62aa85b..376a81ff2c96 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -108,7 +108,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; - + /* + * For dax vmas, try to always use hugepage mappings. If the kernel does + * not support hugepages, fsdax mappings will fallback to PAGE_SIZE + * mappings, and device-dax namespaces, that try to guarantee a given + * mapping size, will fail to enable + */ if (vma_is_dax(vma)) return true; From 86aa66687442ef45909ff9814b82b4d2bb892294 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 9 Aug 2019 13:17:26 +0530 Subject: [PATCH 0948/2880] =?UTF-8?q?libnvdimm:=20Fix=20endian=20conversio?= =?UTF-8?q?n=20issues=C2=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nd_label->dpa issue was observed when trying to enable the namespace created with little-endian kernel on a big-endian kernel. That made me run `sparse` on the rest of the code and other changes are the result of that. Fixes: d9b83c756953 ("libnvdimm, btt: rework error clearing") Fixes: 9dedc73a4658 ("libnvdimm/btt: Fix LBA masking during 'free list' population") Reviewed-by: Vishal Verma Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190809074726.27815-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- drivers/nvdimm/btt.c | 8 ++++---- drivers/nvdimm/namespace_devs.c | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index a8d56887ec88..3e9f45aec8d1 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -392,9 +392,9 @@ static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; if (++(arena->freelist[lane].seq) == 4) arena->freelist[lane].seq = 1; - if (ent_e_flag(ent->old_map)) + if (ent_e_flag(le32_to_cpu(ent->old_map))) arena->freelist[lane].has_err = 1; - arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map)); + arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map)); return ret; } @@ -560,8 +560,8 @@ static int btt_freelist_init(struct arena_info *arena) * FIXME: if error clearing fails during init, we want to make * the BTT read-only */ - if (ent_e_flag(log_new.old_map) && - !ent_normal(log_new.old_map)) { + if (ent_e_flag(le32_to_cpu(log_new.old_map)) && + !ent_normal(le32_to_cpu(log_new.old_map))) { arena->freelist[i].has_err = 1; ret = arena_clear_freelist_error(arena, i); if (ret) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 43401325c874..cca0a3ba1d2c 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1987,7 +1987,7 @@ static struct device *create_namespace_pmem(struct nd_region *nd_region, nd_mapping = &nd_region->mapping[i]; label_ent = list_first_entry_or_null(&nd_mapping->labels, typeof(*label_ent), list); - label0 = label_ent ? label_ent->label : 0; + label0 = label_ent ? label_ent->label : NULL; if (!label0) { WARN_ON(1); @@ -2322,8 +2322,9 @@ static struct device **scan_labels(struct nd_region *nd_region) continue; /* skip labels that describe extents outside of the region */ - if (nd_label->dpa < nd_mapping->start || nd_label->dpa > map_end) - continue; + if (__le64_to_cpu(nd_label->dpa) < nd_mapping->start || + __le64_to_cpu(nd_label->dpa) > map_end) + continue; i = add_namespace_resource(nd_region, nd_label, devs, count); if (i < 0) From cf387d9644d8c78721cf9b77af9f67bb5b04da16 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 10 Sep 2019 11:58:25 +0530 Subject: [PATCH 0949/2880] libnvdimm/altmap: Track namespace boundaries in altmap With PFN_MODE_PMEM namespace, the memmap area is allocated from the device area. Some architectures map the memmap area with large page size. On architectures like ppc64, 16MB page for memap mapping can map 262144 pfns. This maps a namespace size of 16G. When populating memmap region with 16MB page from the device area, make sure the allocated space is not used to map resources outside this namespace. Such usage of device area will prevent a namespace destroy. Add resource end pnf in altmap and use that to check if the memmap area allocation can map pfn outside the namespace. On ppc64 in such case we fallback to allocation from memory. This fix kernel crash reported below: [ 132.034989] WARNING: CPU: 13 PID: 13719 at mm/memremap.c:133 devm_memremap_pages_release+0x2d8/0x2e0 [ 133.464754] BUG: Unable to handle kernel data access at 0xc00c00010b204000 [ 133.464760] Faulting instruction address: 0xc00000000007580c [ 133.464766] Oops: Kernel access of bad area, sig: 11 [#1] [ 133.464771] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries ..... [ 133.464901] NIP [c00000000007580c] vmemmap_free+0x2ac/0x3d0 [ 133.464906] LR [c0000000000757f8] vmemmap_free+0x298/0x3d0 [ 133.464910] Call Trace: [ 133.464914] [c000007cbfd0f7b0] [c0000000000757f8] vmemmap_free+0x298/0x3d0 (unreliable) [ 133.464921] [c000007cbfd0f8d0] [c000000000370a44] section_deactivate+0x1a4/0x240 [ 133.464928] [c000007cbfd0f980] [c000000000386270] __remove_pages+0x3a0/0x590 [ 133.464935] [c000007cbfd0fa50] [c000000000074158] arch_remove_memory+0x88/0x160 [ 133.464942] [c000007cbfd0fae0] [c0000000003be8c0] devm_memremap_pages_release+0x150/0x2e0 [ 133.464949] [c000007cbfd0fb70] [c000000000738ea0] devm_action_release+0x30/0x50 [ 133.464955] [c000007cbfd0fb90] [c00000000073a5a4] release_nodes+0x344/0x400 [ 133.464961] [c000007cbfd0fc40] [c00000000073378c] device_release_driver_internal+0x15c/0x250 [ 133.464968] [c000007cbfd0fc80] [c00000000072fd14] unbind_store+0x104/0x110 [ 133.464973] [c000007cbfd0fcd0] [c00000000072ee24] drv_attr_store+0x44/0x70 [ 133.464981] [c000007cbfd0fcf0] [c0000000004a32bc] sysfs_kf_write+0x6c/0xa0 [ 133.464987] [c000007cbfd0fd10] [c0000000004a1dfc] kernfs_fop_write+0x17c/0x250 [ 133.464993] [c000007cbfd0fd60] [c0000000003c348c] __vfs_write+0x3c/0x70 [ 133.464999] [c000007cbfd0fd80] [c0000000003c75d0] vfs_write+0xd0/0x250 djbw: Aneesh notes that this crash can likely be triggered in any kernel that supports 'papr_scm', so flagging that commit for -stable consideration. Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions") Cc: Reported-by: Sachin Sant Signed-off-by: Aneesh Kumar K.V Reviewed-by: Pankaj Gupta Tested-by: Santosh Sivaraj Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20190910062826.10041-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- arch/powerpc/mm/init_64.c | 17 ++++++++++++++++- drivers/nvdimm/pfn_devs.c | 2 ++ include/linux/memremap.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a44f6281ca3a..4e08246acd79 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -172,6 +172,21 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmemmap_list = vmem_back; } +static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) +{ + unsigned long nr_pfn = page_size / sizeof(struct page); + unsigned long start_pfn = page_to_pfn((struct page *)start); + + if ((start_pfn + nr_pfn) > altmap->end_pfn) + return true; + + if (start_pfn < altmap->base_pfn) + return true; + + return false; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { @@ -194,7 +209,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, * fail due to alignment issues when using 16MB hugepages, so * fall back to system memory if the altmap allocation fail. */ - if (altmap) { + if (altmap && !altmap_cross_boundary(altmap, start, page_size)) { p = altmap_alloc_block_buf(page_size, altmap); if (!p) pr_debug("altmap block allocation failed, falling back to system memory"); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 934cdcaaae97..80c7992bc538 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -672,9 +672,11 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) struct nd_namespace_common *ndns = nd_pfn->ndns; struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); resource_size_t base = nsio->res.start + start_pad; + resource_size_t end = nsio->res.end - end_trunc; struct vmem_altmap __altmap = { .base_pfn = init_altmap_base(base), .reserve = init_altmap_reserve(base), + .end_pfn = PHYS_PFN(end), }; memcpy(res, &nsio->res, sizeof(*res)); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8a5b2a19945..c70996fe48c8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,6 +17,7 @@ struct device; */ struct vmem_altmap { const unsigned long base_pfn; + const unsigned long end_pfn; const unsigned long reserve; unsigned long free; unsigned long align; From 59f08896f058a92f03a0041b397a1a227c5e8529 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 17 Sep 2019 21:21:49 -0700 Subject: [PATCH 0950/2880] libnvdimm/nfit_test: Fix acpi_handle redefinition After commit 62974fc389b3 ("libnvdimm: Enable unit test infrastructure compile checks"), clang warns: In file included from ../drivers/nvdimm/../../tools/testing/nvdimm/test/iomap.c:15: ../drivers/nvdimm/../../tools/testing/nvdimm/test/nfit_test.h:206:15: warning: redefinition of typedef 'acpi_handle' is a C11 feature [-Wtypedef-redefinition] typedef void *acpi_handle; ^ ../include/acpi/actypes.h:424:15: note: previous definition is here typedef void *acpi_handle; /* Actually a ptr to a NS Node */ ^ 1 warning generated. The include chain: iomap.c -> linux/acpi.h -> acpi/acpi.h -> acpi/actypes.h nfit_test.h Avoid this by including linux/acpi.h in nfit_test.h, which allows us to remove both the typedef and the forward declaration of acpi_object. Link: https://github.com/ClangBuiltLinux/linux/issues/660 Signed-off-by: Nathan Chancellor Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20190918042148.77553-1-natechancellor@gmail.com Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/nfit_test.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index 448d686da8b1..0bf5640f1f07 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -4,6 +4,7 @@ */ #ifndef __NFIT_TEST_H__ #define __NFIT_TEST_H__ +#include #include #include #include @@ -202,9 +203,6 @@ struct nd_intel_lss { __u32 status; } __packed; -union acpi_object; -typedef void *acpi_handle; - typedef struct nfit_test_resource *(*nfit_test_lookup_fn)(resource_size_t); typedef union acpi_object *(*nfit_test_evaluate_dsm_fn)(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, From c42adf87e4e7ed77f6ffe288dc90f980d07d68df Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 19 Sep 2019 14:03:55 +0530 Subject: [PATCH 0951/2880] libnvdimm/region: Initialize bad block for volatile namespaces We do check for a bad block during namespace init and that use region bad block list. We need to initialize the bad block for volatile regions for this to work. We also observe a lockdep warning as below because the lock is not initialized correctly since we skip bad block init for volatile regions. INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc1-15699-g3dee241c937e #149 Call Trace: [c0000000f95cb250] [c00000000147dd84] dump_stack+0xe8/0x164 (unreliable) [c0000000f95cb2a0] [c00000000022ccd8] register_lock_class+0x308/0xa60 [c0000000f95cb3a0] [c000000000229cc0] __lock_acquire+0x170/0x1ff0 [c0000000f95cb4c0] [c00000000022c740] lock_acquire+0x220/0x270 [c0000000f95cb580] [c000000000a93230] badblocks_check+0xc0/0x290 [c0000000f95cb5f0] [c000000000d97540] nd_pfn_validate+0x5c0/0x7f0 [c0000000f95cb6d0] [c000000000d98300] nd_dax_probe+0xd0/0x1f0 [c0000000f95cb760] [c000000000d9b66c] nd_pmem_probe+0x10c/0x160 [c0000000f95cb790] [c000000000d7f5ec] nvdimm_bus_probe+0x10c/0x240 [c0000000f95cb820] [c000000000d0f844] really_probe+0x254/0x4e0 [c0000000f95cb8b0] [c000000000d0fdfc] driver_probe_device+0x16c/0x1e0 [c0000000f95cb930] [c000000000d10238] device_driver_attach+0x68/0xa0 [c0000000f95cb970] [c000000000d1040c] __driver_attach+0x19c/0x1c0 [c0000000f95cb9f0] [c000000000d0c4c4] bus_for_each_dev+0x94/0x130 [c0000000f95cba50] [c000000000d0f014] driver_attach+0x34/0x50 [c0000000f95cba70] [c000000000d0e208] bus_add_driver+0x178/0x2f0 [c0000000f95cbb00] [c000000000d117c8] driver_register+0x108/0x170 [c0000000f95cbb70] [c000000000d7edb0] __nd_driver_register+0xe0/0x100 [c0000000f95cbbd0] [c000000001a6baa4] nd_pmem_driver_init+0x34/0x48 [c0000000f95cbbf0] [c0000000000106f4] do_one_initcall+0x1d4/0x4b0 [c0000000f95cbcd0] [c0000000019f499c] kernel_init_freeable+0x544/0x65c [c0000000f95cbdb0] [c000000000010d6c] kernel_init+0x2c/0x180 [c0000000f95cbe20] [c00000000000b954] ret_from_kernel_thread+0x5c/0x68 Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190919083355.26340-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 2 +- drivers/nvdimm/region.c | 4 ++-- drivers/nvdimm/region_devs.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 75a58a6e9615..d47412dcdf38 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -180,7 +180,7 @@ static int nvdimm_clear_badblocks_region(struct device *dev, void *data) sector_t sector; /* make sure device is a region */ - if (!is_nd_pmem(dev)) + if (!is_memory(dev)) return 0; nd_region = to_nd_region(dev); diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 37bf8719a2a4..0f6978e72e7c 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -34,7 +34,7 @@ static int nd_region_probe(struct device *dev) if (rc) return rc; - if (is_nd_pmem(&nd_region->dev)) { + if (is_memory(&nd_region->dev)) { struct resource ndr_res; if (devm_init_badblocks(dev, &nd_region->bb)) @@ -123,7 +123,7 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event) struct nd_region *nd_region = to_nd_region(dev); struct resource res; - if (is_nd_pmem(&nd_region->dev)) { + if (is_memory(&nd_region->dev)) { res.start = nd_region->ndr_start; res.end = nd_region->ndr_start + nd_region->ndr_size - 1; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 3fd6b59abd33..ab91890f2486 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -632,11 +632,11 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) if (!is_memory(dev) && a == &dev_attr_dax_seed.attr) return 0; - if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) + if (!is_memory(dev) && a == &dev_attr_badblocks.attr) return 0; if (a == &dev_attr_resource.attr) { - if (is_nd_pmem(dev)) + if (is_memory(dev)) return 0400; else return 0; From 674f31a352da5e9f621f757b9a89262f486533a0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 24 Sep 2019 10:34:49 -0700 Subject: [PATCH 0952/2880] libnvdimm: prevent nvdimm from requesting key when security is disabled Current implementation attempts to request keys from the keyring even when security is not enabled. Change behavior so when security is disabled it will skip key request. Error messages seen when no keys are installed and libnvdimm is loaded: request-key[4598]: Cannot find command to construct key 661489677 request-key[4606]: Cannot find command to construct key 34713726 Cc: stable@vger.kernel.org Fixes: 4c6926a23b76 ("acpi/nfit, libnvdimm: Add unlock of nvdimm support for Intel DIMMs") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/156934642272.30222.5230162488753445916.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dan Williams --- drivers/nvdimm/security.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c index 9e45b207ff01..89b85970912d 100644 --- a/drivers/nvdimm/security.c +++ b/drivers/nvdimm/security.c @@ -177,6 +177,10 @@ static int __nvdimm_security_unlock(struct nvdimm *nvdimm) || !nvdimm->sec.flags) return -EIO; + /* No need to go further if security is disabled */ + if (test_bit(NVDIMM_SECURITY_DISABLED, &nvdimm->sec.flags)) + return 0; + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { dev_dbg(dev, "Security operation in progress.\n"); return -EBUSY; From 4c806b897d6075bfa5067e524fb058c57ab64e7b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 17:13:27 +0530 Subject: [PATCH 0953/2880] libnvdimm/region: Enable MAP_SYNC for volatile regions Some environments want to use a host tmpfs/ramdisk to back guest pmem. While the data is not persisted relative to the host it *is* persisted relative to guest crashes / reboots. The guest is free to use dax and MAP_SYNC to keep filesystem metadata consistent with dax accesses without requiring guest fsync(). The guest can also observe that the region is volatile and skip cache flushing as global visibility is enough to "persist" data relative to the host staying alive over guest reset events. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20190924114327.14700-1-aneesh.kumar@linux.ibm.com [djbw: reword the changelog] Signed-off-by: Dan Williams --- drivers/nvdimm/region_devs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index ab91890f2486..ef423ba1a711 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -1168,6 +1168,9 @@ EXPORT_SYMBOL_GPL(nvdimm_has_cache); bool is_nvdimm_sync(struct nd_region *nd_region) { + if (is_nd_volatile(&nd_region->dev)) + return true; + return is_nd_pmem(&nd_region->dev) && !test_bit(ND_REGION_ASYNC, &nd_region->flags); } From 697d7150502e3b4f6eb536b42ef189e897d94914 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 23 Sep 2019 15:56:25 -0500 Subject: [PATCH 0954/2880] drm/amdgpu/display: include slab.h in dcn21_resource.c It's apparently needed in some configurations. Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c index 3ca5139f1273..de182185fe1f 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c @@ -23,6 +23,8 @@ * */ +#include + #include "dm_services.h" #include "dc.h" From 1e94b43813a2757f6ee471584ca07cdbc1a51db9 Mon Sep 17 00:00:00 2001 From: "Tianci.Yin" Date: Thu, 12 Sep 2019 17:40:22 +0800 Subject: [PATCH 0955/2880] drm/amdgpu/gfx10: add support for wks firmware loading load different cp firmware according to the DID and RID Reviewed-by: Feifei Xu Signed-off-by: Tianci.Yin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index db28823891ac..638c821611ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -70,6 +70,11 @@ MODULE_FIRMWARE("amdgpu/navi10_mec.bin"); MODULE_FIRMWARE("amdgpu/navi10_mec2.bin"); MODULE_FIRMWARE("amdgpu/navi10_rlc.bin"); +MODULE_FIRMWARE("amdgpu/navi14_ce_wks.bin"); +MODULE_FIRMWARE("amdgpu/navi14_pfp_wks.bin"); +MODULE_FIRMWARE("amdgpu/navi14_me_wks.bin"); +MODULE_FIRMWARE("amdgpu/navi14_mec_wks.bin"); +MODULE_FIRMWARE("amdgpu/navi14_mec2_wks.bin"); MODULE_FIRMWARE("amdgpu/navi14_ce.bin"); MODULE_FIRMWARE("amdgpu/navi14_pfp.bin"); MODULE_FIRMWARE("amdgpu/navi14_me.bin"); @@ -594,7 +599,8 @@ static void gfx_v10_0_check_gfxoff_flag(struct amdgpu_device *adev) static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) { const char *chip_name; - char fw_name[30]; + char fw_name[40]; + char wks[10]; int err; struct amdgpu_firmware_info *info = NULL; const struct common_firmware_header *header = NULL; @@ -607,12 +613,16 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) DRM_DEBUG("\n"); + memset(wks, 0, sizeof(wks)); switch (adev->asic_type) { case CHIP_NAVI10: chip_name = "navi10"; break; case CHIP_NAVI14: chip_name = "navi14"; + if (!(adev->pdev->device == 0x7340 && + adev->pdev->revision != 0x00)) + snprintf(wks, sizeof(wks), "_wks"); break; case CHIP_NAVI12: chip_name = "navi12"; @@ -621,7 +631,7 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) BUG(); } - snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_pfp.bin", chip_name); + snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_pfp%s.bin", chip_name, wks); err = request_firmware(&adev->gfx.pfp_fw, fw_name, adev->dev); if (err) goto out; @@ -632,7 +642,7 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) adev->gfx.pfp_fw_version = le32_to_cpu(cp_hdr->header.ucode_version); adev->gfx.pfp_feature_version = le32_to_cpu(cp_hdr->ucode_feature_version); - snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_me.bin", chip_name); + snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_me%s.bin", chip_name, wks); err = request_firmware(&adev->gfx.me_fw, fw_name, adev->dev); if (err) goto out; @@ -643,7 +653,7 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) adev->gfx.me_fw_version = le32_to_cpu(cp_hdr->header.ucode_version); adev->gfx.me_feature_version = le32_to_cpu(cp_hdr->ucode_feature_version); - snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_ce.bin", chip_name); + snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_ce%s.bin", chip_name, wks); err = request_firmware(&adev->gfx.ce_fw, fw_name, adev->dev); if (err) goto out; @@ -708,7 +718,7 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) if (adev->gfx.rlc.is_rlc_v2_1) gfx_v10_0_init_rlc_ext_microcode(adev); - snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec.bin", chip_name); + snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec%s.bin", chip_name, wks); err = request_firmware(&adev->gfx.mec_fw, fw_name, adev->dev); if (err) goto out; @@ -719,7 +729,7 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) adev->gfx.mec_fw_version = le32_to_cpu(cp_hdr->header.ucode_version); adev->gfx.mec_feature_version = le32_to_cpu(cp_hdr->ucode_feature_version); - snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec2.bin", chip_name); + snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec2%s.bin", chip_name, wks); err = request_firmware(&adev->gfx.mec2_fw, fw_name, adev->dev); if (!err) { err = amdgpu_ucode_validate(adev->gfx.mec2_fw); From 722e6f500ac72d0d43955710738a24fd957607a4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 24 Sep 2019 11:45:34 -0700 Subject: [PATCH 0956/2880] ia64: Fix some warnings introduced in merge window Fix arch/ia64/kernel/irq_ia64.c:586:1: warning: no return statement in function returning non-void [-Wreturn-type] arch/ia64/mm/contig.c:111:6: warning: unused variable 'rc' [-Wunused-variable] arch/ia64/mm/discontig.c:189:39: warning: unused variable 'rc' [-Wunused-variable] Signed-off-by: Tony Luck Signed-off-by: Linus Torvalds --- arch/ia64/kernel/irq_ia64.c | 1 + arch/ia64/mm/contig.c | 1 - arch/ia64/mm/discontig.c | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index f10208478131..8e91c86e8072 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -583,6 +583,7 @@ void ia64_process_pending_intr(void) static irqreturn_t dummy_handler (int irq, void *dev_id) { BUG(); + return IRQ_NONE; } static struct irqaction ipi_irqaction = { diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index db09a693f094..5b00dc3898e1 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -108,7 +108,6 @@ setup_per_cpu_areas(void) struct pcpu_group_info *gi; unsigned int cpu; ssize_t static_size, reserved_size, dyn_size; - int rc; ai = pcpu_alloc_alloc_info(1, num_possible_cpus()); if (!ai) diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 219fc640414b..4f33f6e7e206 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -186,7 +186,7 @@ void __init setup_per_cpu_areas(void) unsigned long base_offset; unsigned int cpu; ssize_t static_size, reserved_size, dyn_size; - int node, prev_node, unit, nr_units, rc; + int node, prev_node, unit, nr_units; ai = pcpu_alloc_alloc_info(MAX_NUMNODES, nr_cpu_ids); if (!ai) From c128e575514ce93dced349417d136304a33b6f99 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 22 Sep 2019 15:07:49 -0400 Subject: [PATCH 0957/2880] NFS: Optimise the default readahead size In the years since the max readahead size was fixed in NFS, a number of things have happened: - Users can now set the value directly using /sys/class/bdi - NFS max supported block sizes have increased by several orders of magnitude from 64K to 1MB. - Disk access latencies are orders of magnitude faster due to SSD + NVME. In particular note that if the server is advertising 1MB as the optimal read size, as that will set the readahead size to 15MB. Let's therefore adjust down, and try to default to VM_READAHEAD_PAGES. However let's inform the VM about our preferred block size so that it can choose to round up in cases where that makes sense. Reported-by: Alkis Georgopoulos Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/internal.h | 8 -------- fs/nfs/super.c | 9 ++++++++- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a2346a2f8361..4b946e6a052f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -16,14 +16,6 @@ extern const struct export_operations nfs_export_ops; struct nfs_string; -/* Maximum number of readahead requests - * FIXME: this should really be a sysctl so that users may tune it to suit - * their needs. People that do NFS over a slow network, might for - * instance want to reduce it to something closer to 1 for improved - * interactive response. - */ -#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) - static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 703f595dce90..c96194e28692 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2627,6 +2627,13 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); +static void nfs_set_readahead(struct backing_dev_info *bdi, + unsigned long iomax_pages) +{ + bdi->ra_pages = VM_READAHEAD_PAGES; + bdi->io_pages = iomax_pages; +} + struct dentry *nfs_fs_mount_common(struct nfs_server *server, int flags, const char *dev_name, struct nfs_mount_info *mount_info, @@ -2669,7 +2676,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, mntroot = ERR_PTR(error); goto error_splat_super; } - s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; + nfs_set_readahead(s->s_bdi, server->rpages); server->super = s; } From a8fd0feeca35cb8f9ddd950191f4aeb777f52f89 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 10 Sep 2019 17:14:30 -0400 Subject: [PATCH 0958/2880] pNFS/filelayout: enable LAYOUTGET on OPEN Add the flag to the filelayout driver to add LAYOUTGET to the OPEN compound. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3cb073c50fa6..c9b605f6c9cb 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTGET_ON_OPEN, .max_layoutget_response = 4096, /* 1 page or so... */ .alloc_layout_hdr = filelayout_alloc_layout_hdr, .free_layout_hdr = filelayout_free_layout_hdr, From 93cad5f789951eaa27c3392b15294b4e51253944 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:22:54 +0530 Subject: [PATCH 0959/2880] selftests/powerpc: Add test case for tlbie vs mtpidr ordering issue Signed-off-by: Aneesh Kumar K.V [mpe: Some minor fixes to make it build] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190924035254.24612-4-aneesh.kumar@linux.ibm.com --- tools/testing/selftests/powerpc/mm/Makefile | 2 + .../testing/selftests/powerpc/mm/tlbie_test.c | 734 ++++++++++++++++++ 2 files changed, 736 insertions(+) create mode 100644 tools/testing/selftests/powerpc/mm/tlbie_test.c diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index f1fbc15800c4..ed1565809d2b 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -4,6 +4,7 @@ noarg: TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \ large_vm_fork_separation +TEST_GEN_PROGS_EXTENDED := tlbie_test TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. @@ -19,3 +20,4 @@ $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64 $(OUTPUT)/tempfile: dd if=/dev/zero of=$@ bs=64k count=1 +$(OUTPUT)/tlbie_test: LDLIBS += -lpthread diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c b/tools/testing/selftests/powerpc/mm/tlbie_test.c new file mode 100644 index 000000000000..9868a5ddd847 --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c @@ -0,0 +1,734 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2019, Nick Piggin, Gautham R. Shenoy, Aneesh Kumar K.V, IBM Corp. + */ + +/* + * + * Test tlbie/mtpidr race. We have 4 threads doing flush/load/compare/store + * sequence in a loop. The same threads also rung a context switch task + * that does sched_yield() in loop. + * + * The snapshot thread mark the mmap area PROT_READ in between, make a copy + * and copy it back to the original area. This helps us to detect if any + * store continued to happen after we marked the memory PROT_READ. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void dcbf(volatile unsigned int *addr) +{ + __asm__ __volatile__ ("dcbf %y0; sync" : : "Z"(*(unsigned char *)addr) : "memory"); +} + +static void err_msg(char *msg) +{ + + time_t now; + time(&now); + printf("=================================\n"); + printf(" Error: %s\n", msg); + printf(" %s", ctime(&now)); + printf("=================================\n"); + exit(1); +} + +static char *map1; +static char *map2; +static pid_t rim_process_pid; + +/* + * A "rim-sequence" is defined to be the sequence of the following + * operations performed on a memory word: + * 1) FLUSH the contents of that word. + * 2) LOAD the contents of that word. + * 3) COMPARE the contents of that word with the content that was + * previously stored at that word + * 4) STORE new content into that word. + * + * The threads in this test that perform the rim-sequence are termed + * as rim_threads. + */ + +/* + * A "corruption" is defined to be the failed COMPARE operation in a + * rim-sequence. + * + * A rim_thread that detects a corruption informs about it to all the + * other rim_threads, and the mem_snapshot thread. + */ +static volatile unsigned int corruption_found; + +/* + * This defines the maximum number of rim_threads in this test. + * + * The THREAD_ID_BITS denote the number of bits required + * to represent the thread_ids [0..MAX_THREADS - 1]. + * We are being a bit paranoid here and set it to 8 bits, + * though 6 bits suffice. + * + */ +#define MAX_THREADS 64 +#define THREAD_ID_BITS 8 +#define THREAD_ID_MASK ((1 << THREAD_ID_BITS) - 1) +static unsigned int rim_thread_ids[MAX_THREADS]; +static pthread_t rim_threads[MAX_THREADS]; + + +/* + * Each rim_thread works on an exclusive "chunk" of size + * RIM_CHUNK_SIZE. + * + * The ith rim_thread works on the ith chunk. + * + * The ith chunk begins at + * map1 + (i * RIM_CHUNK_SIZE) + */ +#define RIM_CHUNK_SIZE 1024 +#define BITS_PER_BYTE 8 +#define WORD_SIZE (sizeof(unsigned int)) +#define WORD_BITS (WORD_SIZE * BITS_PER_BYTE) +#define WORDS_PER_CHUNK (RIM_CHUNK_SIZE/WORD_SIZE) + +static inline char *compute_chunk_start_addr(unsigned int thread_id) +{ + char *chunk_start; + + chunk_start = (char *)((unsigned long)map1 + + (thread_id * RIM_CHUNK_SIZE)); + + return chunk_start; +} + +/* + * The "word-offset" of a word-aligned address inside a chunk, is + * defined to be the number of words that precede the address in that + * chunk. + * + * WORD_OFFSET_BITS denote the number of bits required to represent + * the word-offsets of all the word-aligned addresses of a chunk. + */ +#define WORD_OFFSET_BITS (__builtin_ctz(WORDS_PER_CHUNK)) +#define WORD_OFFSET_MASK ((1 << WORD_OFFSET_BITS) - 1) + +static inline unsigned int compute_word_offset(char *start, unsigned int *addr) +{ + unsigned int delta_bytes, ret; + delta_bytes = (unsigned long)addr - (unsigned long)start; + + ret = delta_bytes/WORD_SIZE; + + return ret; +} + +/* + * A "sweep" is defined to be the sequential execution of the + * rim-sequence by a rim_thread on its chunk one word at a time, + * starting from the first word of its chunk and ending with the last + * word of its chunk. + * + * Each sweep of a rim_thread is uniquely identified by a sweep_id. + * SWEEP_ID_BITS denote the number of bits required to represent + * the sweep_ids of rim_threads. + * + * As to why SWEEP_ID_BITS are computed as a function of THREAD_ID_BITS, + * WORD_OFFSET_BITS, and WORD_BITS, see the "store-pattern" below. + */ +#define SWEEP_ID_BITS (WORD_BITS - (THREAD_ID_BITS + WORD_OFFSET_BITS)) +#define SWEEP_ID_MASK ((1 << SWEEP_ID_BITS) - 1) + +/* + * A "store-pattern" is the word-pattern that is stored into a word + * location in the 4)STORE step of the rim-sequence. + * + * In the store-pattern, we shall encode: + * + * - The thread-id of the rim_thread performing the store + * (The most significant THREAD_ID_BITS) + * + * - The word-offset of the address into which the store is being + * performed (The next WORD_OFFSET_BITS) + * + * - The sweep_id of the current sweep in which the store is + * being performed. (The lower SWEEP_ID_BITS) + * + * Store Pattern: 32 bits + * |------------------|--------------------|---------------------------------| + * | Thread id | Word offset | sweep_id | + * |------------------|--------------------|---------------------------------| + * THREAD_ID_BITS WORD_OFFSET_BITS SWEEP_ID_BITS + * + * In the store pattern, the (Thread-id + Word-offset) uniquely identify the + * address to which the store is being performed i.e, + * address == map1 + + * (Thread-id * RIM_CHUNK_SIZE) + (Word-offset * WORD_SIZE) + * + * And the sweep_id in the store pattern identifies the time when the + * store was performed by the rim_thread. + * + * We shall use this property in the 3)COMPARE step of the + * rim-sequence. + */ +#define SWEEP_ID_SHIFT 0 +#define WORD_OFFSET_SHIFT (SWEEP_ID_BITS) +#define THREAD_ID_SHIFT (WORD_OFFSET_BITS + SWEEP_ID_BITS) + +/* + * Compute the store pattern for a given thread with id @tid, at + * location @addr in the sweep identified by @sweep_id + */ +static inline unsigned int compute_store_pattern(unsigned int tid, + unsigned int *addr, + unsigned int sweep_id) +{ + unsigned int ret = 0; + char *start = compute_chunk_start_addr(tid); + unsigned int word_offset = compute_word_offset(start, addr); + + ret += (tid & THREAD_ID_MASK) << THREAD_ID_SHIFT; + ret += (word_offset & WORD_OFFSET_MASK) << WORD_OFFSET_SHIFT; + ret += (sweep_id & SWEEP_ID_MASK) << SWEEP_ID_SHIFT; + return ret; +} + +/* Extract the thread-id from the given store-pattern */ +static inline unsigned int extract_tid(unsigned int pattern) +{ + unsigned int ret; + + ret = (pattern >> THREAD_ID_SHIFT) & THREAD_ID_MASK; + return ret; +} + +/* Extract the word-offset from the given store-pattern */ +static inline unsigned int extract_word_offset(unsigned int pattern) +{ + unsigned int ret; + + ret = (pattern >> WORD_OFFSET_SHIFT) & WORD_OFFSET_MASK; + + return ret; +} + +/* Extract the sweep-id from the given store-pattern */ +static inline unsigned int extract_sweep_id(unsigned int pattern) + +{ + unsigned int ret; + + ret = (pattern >> SWEEP_ID_SHIFT) & SWEEP_ID_MASK; + + return ret; +} + +/************************************************************ + * * + * Logging the output of the verification * + * * + ************************************************************/ +#define LOGDIR_NAME_SIZE 100 +static char logdir[LOGDIR_NAME_SIZE]; + +static FILE *fp[MAX_THREADS]; +static const char logfilename[] ="Thread-%02d-Chunk"; + +static inline void start_verification_log(unsigned int tid, + unsigned int *addr, + unsigned int cur_sweep_id, + unsigned int prev_sweep_id) +{ + FILE *f; + char logfile[30]; + char path[LOGDIR_NAME_SIZE + 30]; + char separator[2] = "/"; + char *chunk_start = compute_chunk_start_addr(tid); + unsigned int size = RIM_CHUNK_SIZE; + + sprintf(logfile, logfilename, tid); + strcpy(path, logdir); + strcat(path, separator); + strcat(path, logfile); + f = fopen(path, "w"); + + if (!f) { + err_msg("Unable to create logfile\n"); + } + + fp[tid] = f; + + fprintf(f, "----------------------------------------------------------\n"); + fprintf(f, "PID = %d\n", rim_process_pid); + fprintf(f, "Thread id = %02d\n", tid); + fprintf(f, "Chunk Start Addr = 0x%016lx\n", (unsigned long)chunk_start); + fprintf(f, "Chunk Size = %d\n", size); + fprintf(f, "Next Store Addr = 0x%016lx\n", (unsigned long)addr); + fprintf(f, "Current sweep-id = 0x%08x\n", cur_sweep_id); + fprintf(f, "Previous sweep-id = 0x%08x\n", prev_sweep_id); + fprintf(f, "----------------------------------------------------------\n"); +} + +static inline void log_anamoly(unsigned int tid, unsigned int *addr, + unsigned int expected, unsigned int observed) +{ + FILE *f = fp[tid]; + + fprintf(f, "Thread %02d: Addr 0x%lx: Expected 0x%x, Observed 0x%x\n", + tid, (unsigned long)addr, expected, observed); + fprintf(f, "Thread %02d: Expected Thread id = %02d\n", tid, extract_tid(expected)); + fprintf(f, "Thread %02d: Observed Thread id = %02d\n", tid, extract_tid(observed)); + fprintf(f, "Thread %02d: Expected Word offset = %03d\n", tid, extract_word_offset(expected)); + fprintf(f, "Thread %02d: Observed Word offset = %03d\n", tid, extract_word_offset(observed)); + fprintf(f, "Thread %02d: Expected sweep-id = 0x%x\n", tid, extract_sweep_id(expected)); + fprintf(f, "Thread %02d: Observed sweep-id = 0x%x\n", tid, extract_sweep_id(observed)); + fprintf(f, "----------------------------------------------------------\n"); +} + +static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies) +{ + FILE *f = fp[tid]; + char logfile[30]; + char path[LOGDIR_NAME_SIZE + 30]; + char separator[] = "/"; + + fclose(f); + + if (nr_anamolies == 0) { + remove(path); + return; + } + + sprintf(logfile, logfilename, tid); + strcpy(path, logdir); + strcat(path, separator); + strcat(path, logfile); + + printf("Thread %02d chunk has %d corrupted words. For details check %s\n", + tid, nr_anamolies, path); +} + +/* + * When a COMPARE step of a rim-sequence fails, the rim_thread informs + * everyone else via the shared_memory pointed to by + * corruption_found variable. On seeing this, every thread verifies the + * content of its chunk as follows. + * + * Suppose a thread identified with @tid was about to store (but not + * yet stored) to @next_store_addr in its current sweep identified + * @cur_sweep_id. Let @prev_sweep_id indicate the previous sweep_id. + * + * This implies that for all the addresses @addr < @next_store_addr, + * Thread @tid has already performed a store as part of its current + * sweep. Hence we expect the content of such @addr to be: + * |-------------------------------------------------| + * | tid | word_offset(addr) | cur_sweep_id | + * |-------------------------------------------------| + * + * Since Thread @tid is yet to perform stores on address + * @next_store_addr and above, we expect the content of such an + * address @addr to be: + * |-------------------------------------------------| + * | tid | word_offset(addr) | prev_sweep_id | + * |-------------------------------------------------| + * + * The verifier function @verify_chunk does this verification and logs + * any anamolies that it finds. + */ +static void verify_chunk(unsigned int tid, unsigned int *next_store_addr, + unsigned int cur_sweep_id, + unsigned int prev_sweep_id) +{ + unsigned int *iter_ptr; + unsigned int size = RIM_CHUNK_SIZE; + unsigned int expected; + unsigned int observed; + char *chunk_start = compute_chunk_start_addr(tid); + + int nr_anamolies = 0; + + start_verification_log(tid, next_store_addr, + cur_sweep_id, prev_sweep_id); + + for (iter_ptr = (unsigned int *)chunk_start; + (unsigned long)iter_ptr < (unsigned long)chunk_start + size; + iter_ptr++) { + unsigned int expected_sweep_id; + + if (iter_ptr < next_store_addr) { + expected_sweep_id = cur_sweep_id; + } else { + expected_sweep_id = prev_sweep_id; + } + + expected = compute_store_pattern(tid, iter_ptr, expected_sweep_id); + + dcbf((volatile unsigned int*)iter_ptr); //Flush before reading + observed = *iter_ptr; + + if (observed != expected) { + nr_anamolies++; + log_anamoly(tid, iter_ptr, expected, observed); + } + } + + end_verification_log(tid, nr_anamolies); +} + +static void set_pthread_cpu(pthread_t th, int cpu) +{ + cpu_set_t run_cpu_mask; + struct sched_param param; + + CPU_ZERO(&run_cpu_mask); + CPU_SET(cpu, &run_cpu_mask); + pthread_setaffinity_np(th, sizeof(cpu_set_t), &run_cpu_mask); + + param.sched_priority = 1; + if (0 && sched_setscheduler(0, SCHED_FIFO, ¶m) == -1) { + /* haven't reproduced with this setting, it kills random preemption which may be a factor */ + fprintf(stderr, "could not set SCHED_FIFO, run as root?\n"); + } +} + +static void set_mycpu(int cpu) +{ + cpu_set_t run_cpu_mask; + struct sched_param param; + + CPU_ZERO(&run_cpu_mask); + CPU_SET(cpu, &run_cpu_mask); + sched_setaffinity(0, sizeof(cpu_set_t), &run_cpu_mask); + + param.sched_priority = 1; + if (0 && sched_setscheduler(0, SCHED_FIFO, ¶m) == -1) { + fprintf(stderr, "could not set SCHED_FIFO, run as root?\n"); + } +} + +static volatile int segv_wait; + +static void segv_handler(int signo, siginfo_t *info, void *extra) +{ + while (segv_wait) { + sched_yield(); + } + +} + +static void set_segv_handler(void) +{ + struct sigaction sa; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = segv_handler; + + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } +} + +int timeout = 0; +/* + * This function is executed by every rim_thread. + * + * This function performs sweeps over the exclusive chunks of the + * rim_threads executing the rim-sequence one word at a time. + */ +static void *rim_fn(void *arg) +{ + unsigned int tid = *((unsigned int *)arg); + + int size = RIM_CHUNK_SIZE; + char *chunk_start = compute_chunk_start_addr(tid); + + unsigned int prev_sweep_id; + unsigned int cur_sweep_id = 0; + + /* word access */ + unsigned int pattern = cur_sweep_id; + unsigned int *pattern_ptr = &pattern; + unsigned int *w_ptr, read_data; + + set_segv_handler(); + + /* + * Let us initialize the chunk: + * + * Each word-aligned address addr in the chunk, + * is initialized to : + * |-------------------------------------------------| + * | tid | word_offset(addr) | 0 | + * |-------------------------------------------------| + */ + for (w_ptr = (unsigned int *)chunk_start; + (unsigned long)w_ptr < (unsigned long)(chunk_start) + size; + w_ptr++) { + + *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id); + *w_ptr = *pattern_ptr; + } + + while (!corruption_found && !timeout) { + prev_sweep_id = cur_sweep_id; + cur_sweep_id = cur_sweep_id + 1; + + for (w_ptr = (unsigned int *)chunk_start; + (unsigned long)w_ptr < (unsigned long)(chunk_start) + size; + w_ptr++) { + unsigned int old_pattern; + + /* + * Compute the pattern that we would have + * stored at this location in the previous + * sweep. + */ + old_pattern = compute_store_pattern(tid, w_ptr, prev_sweep_id); + + /* + * FLUSH:Ensure that we flush the contents of + * the cache before loading + */ + dcbf((volatile unsigned int*)w_ptr); //Flush + + /* LOAD: Read the value */ + read_data = *w_ptr; //Load + + /* + * COMPARE: Is it the same as what we had stored + * in the previous sweep ? It better be! + */ + if (read_data != old_pattern) { + /* No it isn't! Tell everyone */ + corruption_found = 1; + } + + /* + * Before performing a store, let us check if + * any rim_thread has found a corruption. + */ + if (corruption_found || timeout) { + /* + * Yes. Someone (including us!) has found + * a corruption :( + * + * Let us verify that our chunk is + * correct. + */ + /* But first, let us allow the dust to settle down! */ + verify_chunk(tid, w_ptr, cur_sweep_id, prev_sweep_id); + + return 0; + } + + /* + * Compute the new pattern that we are going + * to write to this location + */ + *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id); + + /* + * STORE: Now let us write this pattern into + * the location + */ + *w_ptr = *pattern_ptr; + } + } + + return NULL; +} + + +static unsigned long start_cpu = 0; +static unsigned long nrthreads = 4; + +static pthread_t mem_snapshot_thread; + +static void *mem_snapshot_fn(void *arg) +{ + int page_size = getpagesize(); + size_t size = page_size; + void *tmp = malloc(size); + + while (!corruption_found && !timeout) { + /* Stop memory migration once corruption is found */ + segv_wait = 1; + + mprotect(map1, size, PROT_READ); + + /* + * Load from the working alias (map1). Loading from map2 + * also fails. + */ + memcpy(tmp, map1, size); + + /* + * Stores must go via map2 which has write permissions, but + * the corrupted data tends to be seen in the snapshot buffer, + * so corruption does not appear to be introduced at the + * copy-back via map2 alias here. + */ + memcpy(map2, tmp, size); + /* + * Before releasing other threads, must ensure the copy + * back to + */ + asm volatile("sync" ::: "memory"); + mprotect(map1, size, PROT_READ|PROT_WRITE); + asm volatile("sync" ::: "memory"); + segv_wait = 0; + + usleep(1); /* This value makes a big difference */ + } + + return 0; +} + +void alrm_sighandler(int sig) +{ + timeout = 1; +} + +int main(int argc, char *argv[]) +{ + int c; + int page_size = getpagesize(); + time_t now; + int i, dir_error; + pthread_attr_t attr; + key_t shm_key = (key_t) getpid(); + int shmid, run_time = 20 * 60; + struct sigaction sa_alrm; + + snprintf(logdir, LOGDIR_NAME_SIZE, + "/tmp/logdir-%u", (unsigned int)getpid()); + while ((c = getopt(argc, argv, "r:hn:l:t:")) != -1) { + switch(c) { + case 'r': + start_cpu = strtoul(optarg, NULL, 10); + break; + case 'h': + printf("%s [-r ] [-n ] [-l ] [-t ]\n", argv[0]); + exit(0); + break; + case 'n': + nrthreads = strtoul(optarg, NULL, 10); + break; + case 'l': + strncpy(logdir, optarg, LOGDIR_NAME_SIZE); + break; + case 't': + run_time = strtoul(optarg, NULL, 10); + break; + default: + printf("invalid option\n"); + exit(0); + break; + } + } + + if (nrthreads > MAX_THREADS) + nrthreads = MAX_THREADS; + + shmid = shmget(shm_key, page_size, IPC_CREAT|0666); + if (shmid < 0) { + err_msg("Failed shmget\n"); + } + + map1 = shmat(shmid, NULL, 0); + if (map1 == (void *) -1) { + err_msg("Failed shmat"); + } + + map2 = shmat(shmid, NULL, 0); + if (map2 == (void *) -1) { + err_msg("Failed shmat"); + } + + dir_error = mkdir(logdir, 0755); + + if (dir_error) { + err_msg("Failed mkdir"); + } + + printf("start_cpu list:%lu\n", start_cpu); + printf("number of worker threads:%lu + 1 snapshot thread\n", nrthreads); + printf("Allocated address:0x%016lx + secondary map:0x%016lx\n", (unsigned long)map1, (unsigned long)map2); + printf("logdir at : %s\n", logdir); + printf("Timeout: %d seconds\n", run_time); + + time(&now); + printf("=================================\n"); + printf(" Starting Test\n"); + printf(" %s", ctime(&now)); + printf("=================================\n"); + + for (i = 0; i < nrthreads; i++) { + if (1 && !fork()) { + prctl(PR_SET_PDEATHSIG, SIGKILL); + set_mycpu(start_cpu + i); + for (;;) + sched_yield(); + exit(0); + } + } + + + sa_alrm.sa_handler = &alrm_sighandler; + sigemptyset(&sa_alrm.sa_mask); + sa_alrm.sa_flags = 0; + + if (sigaction(SIGALRM, &sa_alrm, 0) == -1) { + err_msg("Failed signal handler registration\n"); + } + + alarm(run_time); + + pthread_attr_init(&attr); + for (i = 0; i < nrthreads; i++) { + rim_thread_ids[i] = i; + pthread_create(&rim_threads[i], &attr, rim_fn, &rim_thread_ids[i]); + set_pthread_cpu(rim_threads[i], start_cpu + i); + } + + pthread_create(&mem_snapshot_thread, &attr, mem_snapshot_fn, map1); + set_pthread_cpu(mem_snapshot_thread, start_cpu + i); + + + pthread_join(mem_snapshot_thread, NULL); + for (i = 0; i < nrthreads; i++) { + pthread_join(rim_threads[i], NULL); + } + + if (!timeout) { + time(&now); + printf("=================================\n"); + printf(" Data Corruption Detected\n"); + printf(" %s", ctime(&now)); + printf(" See logfiles in %s\n", logdir); + printf("=================================\n"); + return 1; + } + return 0; +} From 4111cdef0e871c0f7c8874b19ee68a3671f7d63e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 3 Sep 2019 18:04:51 +0530 Subject: [PATCH 0960/2880] powerpc/nvdimm: Use HCALL error as the return value This simplifies the error handling and also enable us to switch to H_SCM_QUERY hcall in a later patch on H_OVERLAP error. We also do some kernel print formatting fixup in this patch. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903123452.28620-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 26 ++++++++++------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index a5ac371a3f06..3bef4d298ac6 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -66,28 +66,22 @@ static int drc_pmem_bind(struct papr_scm_priv *p) } while (rc == H_BUSY); if (rc) { - /* H_OVERLAP needs a separate error path */ - if (rc == H_OVERLAP) - return -EBUSY; - dev_err(&p->pdev->dev, "bind err: %lld\n", rc); - return -ENXIO; + return rc; } p->bound_addr = saved; - - dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res); - - return 0; + dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res); + return rc; } -static int drc_pmem_unbind(struct papr_scm_priv *p) +static void drc_pmem_unbind(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; uint64_t token = 0; int64_t rc; - dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index); + dev_dbg(&p->pdev->dev, "unbind drc 0x%x\n", p->drc_index); /* NB: unbind has the same retry requirements as drc_pmem_bind() */ do { @@ -110,10 +104,10 @@ static int drc_pmem_unbind(struct papr_scm_priv *p) if (rc) dev_err(&p->pdev->dev, "unbind error: %lld\n", rc); else - dev_dbg(&p->pdev->dev, "unbind drc %x complete\n", + dev_dbg(&p->pdev->dev, "unbind drc 0x%x complete\n", p->drc_index); - return rc == H_SUCCESS ? 0 : -ENXIO; + return; } static int papr_scm_meta_get(struct papr_scm_priv *p, @@ -436,14 +430,16 @@ static int papr_scm_probe(struct platform_device *pdev) rc = drc_pmem_bind(p); /* If phyp says drc memory still bound then force unbound and retry */ - if (rc == -EBUSY) { + if (rc == H_OVERLAP) { dev_warn(&pdev->dev, "Retrying bind after unbinding\n"); drc_pmem_unbind(p); rc = drc_pmem_bind(p); } - if (rc) + if (rc != H_SUCCESS) { + rc = -ENXIO; goto err; + } /* setup the resource for the newly bound range */ p->res.start = p->bound_addr; From faa6d21153fd11e139dd880044521389b34a24f2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 3 Sep 2019 18:04:52 +0530 Subject: [PATCH 0961/2880] powerpc/nvdimm: use H_SCM_QUERY hcall on H_OVERLAP error Right now we force an unbind of SCM memory at drcindex on H_OVERLAP error. This really slows down operations like kexec where we get the H_OVERLAP error because we don't go through a full hypervisor re init. H_OVERLAP error for a H_SCM_BIND_MEM hcall indicates that SCM memory at drc index is already bound. Since we don't specify a logical memory address for bind hcall, we can use the H_SCM_QUERY hcall to query the already bound logical address. Boot time difference with and without patch is: [ 5.583617] IOMMU table initialized, virtual merging enabled [ 5.603041] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Retrying bind after unbinding [ 301.514221] papr_scm ibm,persistent-memory:ibm,pmemory@44108001: Retrying bind after unbinding [ 340.057238] hv-24x7: read 1530 catalog entries, created 537 event attrs (0 failures), 275 descs after fix [ 5.101572] IOMMU table initialized, virtual merging enabled [ 5.116984] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Querying SCM details [ 5.117223] papr_scm ibm,persistent-memory:ibm,pmemory@44108001: Querying SCM details [ 5.120530] hv-24x7: read 1530 catalog entries, created 537 event attrs (0 failures), 275 descs Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903123452.28620-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 48 +++++++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 3bef4d298ac6..61883291defc 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -65,10 +65,8 @@ static int drc_pmem_bind(struct papr_scm_priv *p) cond_resched(); } while (rc == H_BUSY); - if (rc) { - dev_err(&p->pdev->dev, "bind err: %lld\n", rc); + if (rc) return rc; - } p->bound_addr = saved; dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res); @@ -110,6 +108,42 @@ static void drc_pmem_unbind(struct papr_scm_priv *p) return; } +static int drc_pmem_query_n_bind(struct papr_scm_priv *p) +{ + unsigned long start_addr; + unsigned long end_addr; + unsigned long ret[PLPAR_HCALL_BUFSIZE]; + int64_t rc; + + + rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret, + p->drc_index, 0); + if (rc) + goto err_out; + start_addr = ret[0]; + + /* Make sure the full region is bound. */ + rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret, + p->drc_index, p->blocks - 1); + if (rc) + goto err_out; + end_addr = ret[0]; + + if ((end_addr - start_addr) != ((p->blocks - 1) * p->block_size)) + goto err_out; + + p->bound_addr = start_addr; + dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res); + return rc; + +err_out: + dev_info(&p->pdev->dev, + "Failed to query, trying an unbind followed by bind"); + drc_pmem_unbind(p); + return drc_pmem_bind(p); +} + + static int papr_scm_meta_get(struct papr_scm_priv *p, struct nd_cmd_get_config_data_hdr *hdr) { @@ -430,13 +464,11 @@ static int papr_scm_probe(struct platform_device *pdev) rc = drc_pmem_bind(p); /* If phyp says drc memory still bound then force unbound and retry */ - if (rc == H_OVERLAP) { - dev_warn(&pdev->dev, "Retrying bind after unbinding\n"); - drc_pmem_unbind(p); - rc = drc_pmem_bind(p); - } + if (rc == H_OVERLAP) + rc = drc_pmem_query_n_bind(p); if (rc != H_SUCCESS) { + dev_err(&p->pdev->dev, "bind err: %d\n", rc); rc = -ENXIO; goto err; } From 07bfa4415ab607e459b69bd86aa7e7602ce10b4f Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Mon, 23 Sep 2019 15:32:53 -0700 Subject: [PATCH 0962/2880] fat: work around race with userspace's read via blockdev while mounting If userspace reads the buffer via blockdev while mounting, sb_getblk()+modify can race with buffer read via blockdev. For example, FS userspace bh = sb_getblk() modify bh->b_data read ll_rw_block(bh) fill bh->b_data by on-disk data /* lost modified data by FS */ set_buffer_uptodate(bh) set_buffer_uptodate(bh) Userspace should not use the blockdev while mounting though, the udev seems to be already doing this. Although I think the udev should try to avoid this, workaround the race by small overhead. Link: http://lkml.kernel.org/r/87pnk7l3sw.fsf_-_@mail.parknet.co.jp Signed-off-by: OGAWA Hirofumi Reported-by: Jan Stancek Tested-by: Jan Stancek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 13 +++++++++++-- fs/fat/fatent.c | 3 +++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 1bda2ab6745b..814ad2c2ba80 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1100,8 +1100,11 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, err = -ENOMEM; goto error; } + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[n]); memset(bhs[n]->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bhs[n]); + unlock_buffer(bhs[n]); mark_buffer_dirty_inode(bhs[n], dir); n++; @@ -1158,6 +1161,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); de = (struct msdos_dir_entry *)bhs[0]->b_data; + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[0]); /* filling the new directory slots ("." and ".." entries) */ memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); @@ -1180,6 +1185,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) de[0].size = de[1].size = 0; memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); set_buffer_uptodate(bhs[0]); + unlock_buffer(bhs[0]); mark_buffer_dirty_inode(bhs[0], dir); err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); @@ -1237,11 +1243,14 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, /* fill the directory entry */ copy = min(size, sb->s_blocksize); + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[n]); memcpy(bhs[n]->b_data, slots, copy); + set_buffer_uptodate(bhs[n]); + unlock_buffer(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); slots += copy; size -= copy; - set_buffer_uptodate(bhs[n]); - mark_buffer_dirty_inode(bhs[n], dir); if (!size) break; n++; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 265983635f2b..3647c65a0f48 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -388,8 +388,11 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, err = -ENOMEM; goto error; } + /* Avoid race with userspace read via bdev */ + lock_buffer(c_bh); memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); + unlock_buffer(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); From 6e73fd25e2c7510b7dfadbaf701f31d4bff9c75b Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 23 Sep 2019 15:32:56 -0700 Subject: [PATCH 0963/2880] Revert "mm/z3fold.c: fix race between migration and destruction" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the original commit applied, z3fold_zpool_destroy() may get blocked on wait_event() for indefinite time. Revert this commit for the time being to get rid of this problem since the issue the original commit addresses is less severe. Link: http://lkml.kernel.org/r/20190910123142.7a9c8d2de4d0acbc0977c602@gmail.com Fixes: d776aaa9895eb6eb77 ("mm/z3fold.c: fix race between migration and destruction") Reported-by: Agustín Dall'Alba Signed-off-by: Vitaly Wool Cc: Vlastimil Babka Cc: Vitaly Wool Cc: Shakeel Butt Cc: Jonathan Adams Cc: Henry Burns Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 90 ----------------------------------------------------- 1 file changed, 90 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 75b7962439ff..ed19d98c9dcd 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include @@ -146,8 +145,6 @@ struct z3fold_header { * @release_wq: workqueue for safe page release * @work: work_struct for safe page release * @inode: inode for z3fold pseudo filesystem - * @destroying: bool to stop migration once we start destruction - * @isolated: int to count the number of pages currently in isolation * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -166,11 +163,8 @@ struct z3fold_pool { const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; - struct wait_queue_head isolate_wait; struct work_struct work; struct inode *inode; - bool destroying; - int isolated; }; /* @@ -775,7 +769,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); - init_waitqueue_head(&pool->isolate_wait); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); if (!pool->unbuddied) goto out_pool; @@ -815,15 +808,6 @@ out: return NULL; } -static bool pool_isolated_are_drained(struct z3fold_pool *pool) -{ - bool ret; - - spin_lock(&pool->lock); - ret = pool->isolated == 0; - spin_unlock(&pool->lock); - return ret; -} /** * z3fold_destroy_pool() - destroys an existing z3fold pool * @pool: the z3fold pool to be destroyed @@ -833,22 +817,6 @@ static bool pool_isolated_are_drained(struct z3fold_pool *pool) static void z3fold_destroy_pool(struct z3fold_pool *pool) { kmem_cache_destroy(pool->c_handle); - /* - * We set pool-> destroying under lock to ensure that - * z3fold_page_isolate() sees any changes to destroying. This way we - * avoid the need for any memory barriers. - */ - - spin_lock(&pool->lock); - pool->destroying = true; - spin_unlock(&pool->lock); - - /* - * We need to ensure that no pages are being migrated while we destroy - * these workqueues, as migration can queue work on either of the - * workqueues. - */ - wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool)); /* * We need to destroy pool->compact_wq before pool->release_wq, @@ -1339,28 +1307,6 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) return atomic64_read(&pool->pages_nr); } -/* - * z3fold_dec_isolated() expects to be called while pool->lock is held. - */ -static void z3fold_dec_isolated(struct z3fold_pool *pool) -{ - assert_spin_locked(&pool->lock); - VM_BUG_ON(pool->isolated <= 0); - pool->isolated--; - - /* - * If we have no more isolated pages, we have to see if - * z3fold_destroy_pool() is waiting for a signal. - */ - if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait)) - wake_up_all(&pool->isolate_wait); -} - -static void z3fold_inc_isolated(struct z3fold_pool *pool) -{ - pool->isolated++; -} - static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) { struct z3fold_header *zhdr; @@ -1387,34 +1333,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) spin_lock(&pool->lock); if (!list_empty(&page->lru)) list_del(&page->lru); - /* - * We need to check for destruction while holding pool->lock, as - * otherwise destruction could see 0 isolated pages, and - * proceed. - */ - if (unlikely(pool->destroying)) { - spin_unlock(&pool->lock); - /* - * If this page isn't stale, somebody else holds a - * reference to it. Let't drop our refcount so that they - * can call the release logic. - */ - if (unlikely(kref_put(&zhdr->refcount, - release_z3fold_page_locked))) { - /* - * If we get here we have kref problems, so we - * should freak out. - */ - WARN(1, "Z3fold is experiencing kref problems\n"); - z3fold_page_unlock(zhdr); - return false; - } - z3fold_page_unlock(zhdr); - return false; - } - - - z3fold_inc_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); return true; @@ -1483,10 +1401,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); - spin_lock(&pool->lock); - z3fold_dec_isolated(pool); - spin_unlock(&pool->lock); - page_mapcount_reset(page); put_page(page); return 0; @@ -1506,14 +1420,10 @@ static void z3fold_page_putback(struct page *page) INIT_LIST_HEAD(&page->lru); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); - spin_lock(&pool->lock); - z3fold_dec_isolated(pool); - spin_unlock(&pool->lock); return; } spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); - z3fold_dec_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); } From 710ec38b0f633ab3e2581f07a73442d809e28ab0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Sep 2019 15:32:59 -0700 Subject: [PATCH 0964/2880] mm: add dummy can_do_mlock() helper On kernels without CONFIG_MMU, we get a link error for the siw driver: drivers/infiniband/sw/siw/siw_mem.o: In function `siw_umem_get': siw_mem.c:(.text+0x4c8): undefined reference to `can_do_mlock' This is probably not the only driver that needs the function and could otherwise build correctly without CONFIG_MMU, so add a dummy variant that always returns false. Link: http://lkml.kernel.org/r/20190909204201.931830-1-arnd@arndb.de Fixes: 2251334dcac9 ("rdma/siw: application buffer management") Signed-off-by: Arnd Bergmann Suggested-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Bernard Metzler Cc: "Matthew Wilcox (Oracle)" Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7cf955feb823..6e79b3df1582 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1405,7 +1405,11 @@ extern void pagefault_out_of_memory(void); extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +#ifdef CONFIG_MMU extern bool can_do_mlock(void); +#else +static inline bool can_do_mlock(void) { return false; } +#endif extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); From 3f9d2b5766aea06042630ac60b7316fd0cebf06f Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 23 Sep 2019 15:33:02 -0700 Subject: [PATCH 0965/2880] z3fold: fix retry mechanism in page reclaim z3fold_page_reclaim()'s retry mechanism is broken: on a second iteration it will have zhdr from the first one so that zhdr is no longer in line with struct page. That leads to crashes when the system is stressed. Fix that by moving zhdr assignment up. While at it, protect against using already freed handles by using own local slots structure in z3fold_page_reclaim(). Link: http://lkml.kernel.org/r/20190908162919.830388dc7404d1e2c80f4095@gmail.com Signed-off-by: Vitaly Wool Reported-by: Markus Linnala Reported-by: Chris Murphy Reported-by: Agustin Dall'Alba Cc: "Maciej S. Szmigiero" Cc: Shakeel Butt Cc: Henry Burns Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index ed19d98c9dcd..d637d9ee005c 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -366,9 +366,10 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) * Encodes the handle of a particular buddy within a z3fold page * Pool lock should be held as this function accesses first_num */ -static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) +static unsigned long __encode_handle(struct z3fold_header *zhdr, + struct z3fold_buddy_slots *slots, + enum buddy bud) { - struct z3fold_buddy_slots *slots; unsigned long h = (unsigned long)zhdr; int idx = 0; @@ -385,11 +386,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) if (bud == LAST) h |= (zhdr->last_chunks << BUDDY_SHIFT); - slots = zhdr->slots; slots->slot[idx] = h; return (unsigned long)&slots->slot[idx]; } +static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) +{ + return __encode_handle(zhdr, zhdr->slots, bud); +} + /* Returns the z3fold page where a given handle is stored */ static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) { @@ -624,6 +629,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) } if (unlikely(PageIsolated(page) || + test_bit(PAGE_CLAIMED, &page->private) || test_bit(PAGE_STALE, &page->private))) { z3fold_page_unlock(zhdr); return; @@ -1100,6 +1106,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) struct z3fold_header *zhdr = NULL; struct page *page = NULL; struct list_head *pos; + struct z3fold_buddy_slots slots; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); @@ -1118,16 +1125,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) /* this bit could have been set by free, in which case * we pass over to the next page in the pool. */ - if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) { + page = NULL; continue; + } - if (unlikely(PageIsolated(page))) + if (unlikely(PageIsolated(page))) { + clear_bit(PAGE_CLAIMED, &page->private); + page = NULL; continue; + } + zhdr = page_address(page); if (test_bit(PAGE_HEADLESS, &page->private)) break; - zhdr = page_address(page); if (!z3fold_page_trylock(zhdr)) { + clear_bit(PAGE_CLAIMED, &page->private); zhdr = NULL; continue; /* can't evict at this point */ } @@ -1145,26 +1158,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) if (!test_bit(PAGE_HEADLESS, &page->private)) { /* - * We need encode the handles before unlocking, since - * we can race with free that will set - * (first|last)_chunks to 0 + * We need encode the handles before unlocking, and + * use our local slots structure because z3fold_free + * can zero out zhdr->slots and we can't do much + * about that */ first_handle = 0; last_handle = 0; middle_handle = 0; if (zhdr->first_chunks) - first_handle = encode_handle(zhdr, FIRST); + first_handle = __encode_handle(zhdr, &slots, + FIRST); if (zhdr->middle_chunks) - middle_handle = encode_handle(zhdr, MIDDLE); + middle_handle = __encode_handle(zhdr, &slots, + MIDDLE); if (zhdr->last_chunks) - last_handle = encode_handle(zhdr, LAST); + last_handle = __encode_handle(zhdr, &slots, + LAST); /* * it's safe to unlock here because we hold a * reference to this page */ z3fold_page_unlock(zhdr); } else { - first_handle = encode_handle(zhdr, HEADLESS); + first_handle = __encode_handle(zhdr, &slots, HEADLESS); last_handle = middle_handle = 0; } @@ -1194,9 +1211,9 @@ next: spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); + clear_bit(PAGE_CLAIMED, &page->private); } else { z3fold_page_lock(zhdr); - clear_bit(PAGE_CLAIMED, &page->private); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); @@ -1211,6 +1228,7 @@ next: list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); + clear_bit(PAGE_CLAIMED, &page->private); } /* We started off locked to we need to lock the pool back */ @@ -1315,7 +1333,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); - if (test_bit(PAGE_HEADLESS, &page->private)) + if (test_bit(PAGE_HEADLESS, &page->private) || + test_bit(PAGE_CLAIMED, &page->private)) return false; zhdr = page_address(page); From 6279eb3dd7946c69346a3b98473ed13d3a44adb5 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Mon, 23 Sep 2019 15:33:05 -0700 Subject: [PATCH 0966/2880] kbuild: clean compressed initramfs image Since 9e3596b0c653 ("kbuild: initramfs cleanup, set target from Kconfig") "make clean" leaves behind compressed initramfs images. Example: $ make defconfig $ sed -i 's|CONFIG_INITRAMFS_SOURCE=""|CONFIG_INITRAMFS_SOURCE="/tmp/ir.cpio"|' .config $ make olddefconfig $ make -s $ make -s clean $ git clean -ndxf | grep initramfs Would remove usr/initramfs_data.cpio.gz clean rules do not have CONFIG_* context so they do not know which compression format was used. Thus they don't know which files to delete. Tell clean to delete all possible compression formats. Once patched usr/initramfs_data.cpio.gz and friends are deleted by "make clean". Link: http://lkml.kernel.org/r/20190722063251.55541-1-gthelen@google.com Fixes: 9e3596b0c653 ("kbuild: initramfs cleanup, set target from Kconfig") Signed-off-by: Greg Thelen Cc: Nicholas Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- usr/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/usr/Makefile b/usr/Makefile index 6a89eb019275..e6f7cb2f81db 100644 --- a/usr/Makefile +++ b/usr/Makefile @@ -11,6 +11,9 @@ datafile_y = initramfs_data.cpio$(suffix_y) datafile_d_y = .$(datafile_y).d AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)" +# clean rules do not have CONFIG_INITRAMFS_COMPRESSION. So clean up after all +# possible compression formats. +clean-files += initramfs_data.cpio* # Generate builtin.o based on initramfs_data.o obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o From bbd0f32721e28cc7097fa50afa96178896f9e33b Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 23 Sep 2019 15:33:08 -0700 Subject: [PATCH 0967/2880] ocfs2: use jbd2_inode dirty range scoping 6ba0e7dc64a5 ("jbd2: introduce jbd2_inode dirty range scoping") allow us scoping each of the inode dirty ranges associated with a given transaction, and ext4 already does this way. Now let's also use the newly introduced jbd2_inode dirty range scoping to prevent us from waiting forever when trying to complete a journal transaction in ocfs2. Link: http://lkml.kernel.org/r/1562977611-8412-1-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reviewed-by: Ross Zwisler Reviewed-by: Changwei Ge Cc: "Theodore Ts'o" Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 5 ++++- fs/ocfs2/aops.c | 13 ++++++++++--- fs/ocfs2/file.c | 10 +++++++--- fs/ocfs2/journal.h | 11 +++++++---- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0c335b51043d..f5d2bd15e0ca 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6792,6 +6792,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, struct page *page, int zero, u64 *phys) { int ret, partial = 0; + loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; + loff_t length = to - from; ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); if (ret) @@ -6811,7 +6813,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, if (ret < 0) mlog_errno(ret); else if (ocfs2_should_order_data(inode)) { - ret = ocfs2_jbd2_file_inode(handle, inode); + ret = ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); if (ret < 0) mlog_errno(ret); } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a4c905d6b575..8de1c9d644f6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -942,7 +942,8 @@ static void ocfs2_write_failure(struct inode *inode, if (tmppage && page_has_buffers(tmppage)) { if (ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(wc->w_handle, inode); + ocfs2_jbd2_inode_add_write(wc->w_handle, inode, + user_pos, user_len); block_commit_write(tmppage, from, to); } @@ -2023,8 +2024,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } if (page_has_buffers(tmppage)) { - if (handle && ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(handle, inode); + if (handle && ocfs2_should_order_data(inode)) { + loff_t start_byte = + ((loff_t)tmppage->index << PAGE_SHIFT) + + from; + loff_t length = to - from; + ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); + } block_commit_write(tmppage, from, to); } } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4435df3e5adb..efe9988a5be4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -706,7 +706,9 @@ leave: * Thus, we need to explicitly order the zeroed pages. */ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, - struct buffer_head *di_bh) + struct buffer_head *di_bh, + loff_t start_byte, + loff_t length) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -722,7 +724,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, goto out; } - ret = ocfs2_jbd2_file_inode(handle, inode); + ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret < 0) { mlog_errno(ret); goto out; @@ -761,7 +763,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); + handle = ocfs2_zero_start_ordered_transaction(inode, di_bh, + abs_from, + abs_to - abs_from); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index c0fe6ed08ab1..f37473c20a7b 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -232,8 +232,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * ocfs2_journal_access_*() unless you intend to * manage the checksum by hand. * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. - * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before - * the current handle commits. + * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes + * out before the current handle commits. */ /* You must always start_trans with a number of buffs > 0, but it's @@ -603,9 +603,12 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, return credits; } -static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) +static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, + loff_t start_byte, loff_t length) { - return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); + return jbd2_journal_inode_ranged_write(handle, + &OCFS2_I(inode)->ip_jinode, + start_byte, length); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, From 963abb9aebcdacf5c709a36da9ac0727a33671eb Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 23 Sep 2019 15:33:11 -0700 Subject: [PATCH 0968/2880] jbd2: remove jbd2_journal_inode_add_[write|wait] Since ext4/ocfs2 are using jbd2_inode dirty range scoping APIs now, jbd2_journal_inode_add_[write|wait] are not used any more, remove them. Link: http://lkml.kernel.org/r/1562977611-8412-2-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reviewed-by: Ross Zwisler Acked-by: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Joseph Qi Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd2/journal.c | 2 -- fs/jbd2/transaction.c | 12 ------------ include/linux/jbd2.h | 2 -- 3 files changed, 16 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 953990eb70a9..1c58859aa592 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -89,8 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); -EXPORT_SYMBOL(jbd2_journal_inode_add_write); -EXPORT_SYMBOL(jbd2_journal_inode_add_wait); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index afc06daee5bb..bee8498d7792 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2622,18 +2622,6 @@ done: return 0; } -int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) -{ - return jbd2_journal_file_inode(handle, jinode, - JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX); -} - -int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) -{ - return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0, - LLONG_MAX); -} - int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *jinode, loff_t start_byte, loff_t length) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index df03825ad1a1..603fbc4e2f70 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1410,8 +1410,6 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); -extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode); -extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); From 5e7a3ed9f1a60f17c165e1b73df6d6aebb211266 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 23 Sep 2019 15:33:15 -0700 Subject: [PATCH 0969/2880] ocfs2: further debugfs cleanups There is no need to check return value of debugfs_create functions, but the last sweep through ocfs missed a number of places where this was happening. There is also no need to save the individual dentries for the debugfs files, as everything is can just be removed at once when the directory is removed. By getting rid of the file dentries for the debugfs entries, a bit of local memory can be saved as well. [colin.king@canonical.com: ensure ret is set to zero before returning] Link: http://lkml.kernel.org/r/20190807121929.28918-1-colin.king@canonical.com Link: http://lkml.kernel.org/r/20190731132119.GA12603@kroah.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Colin Ian King Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Jia Guo Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/blockcheck.c | 26 ++++----- fs/ocfs2/cluster/heartbeat.c | 103 +++++++++-------------------------- fs/ocfs2/dlm/dlmcommon.h | 1 - fs/ocfs2/dlm/dlmdebug.c | 55 ++++--------------- fs/ocfs2/dlm/dlmdebug.h | 16 +----- fs/ocfs2/dlm/dlmdomain.c | 7 +-- fs/ocfs2/dlmglue.c | 20 ++----- fs/ocfs2/ocfs2.h | 3 - fs/ocfs2/super.c | 10 +--- 9 files changed, 61 insertions(+), 180 deletions(-) diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 429e6a8359a5..eaf042feaf5e 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -231,14 +231,6 @@ static int blockcheck_u64_get(void *data, u64 *val) } DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); -static struct dentry *blockcheck_debugfs_create(const char *name, - struct dentry *parent, - u64 *value) -{ - return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, - &blockcheck_fops); -} - static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { if (stats) { @@ -250,16 +242,20 @@ static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, struct dentry *parent) { - stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); + struct dentry *dir; - blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir, - &stats->b_check_count); + dir = debugfs_create_dir("blockcheck", parent); + stats->b_debug_dir = dir; - blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir, - &stats->b_failure_count); + debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir, + &stats->b_check_count, &blockcheck_fops); + + debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir, + &stats->b_failure_count, &blockcheck_fops); + + debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir, + &stats->b_recover_count, &blockcheck_fops); - blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir, - &stats->b_recover_count); } #else static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f1b613327ac8..a368350d4c27 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -225,10 +225,6 @@ struct o2hb_region { unsigned int hr_region_num; struct dentry *hr_debug_dir; - struct dentry *hr_debug_livenodes; - struct dentry *hr_debug_regnum; - struct dentry *hr_debug_elapsed_time; - struct dentry *hr_debug_pinned; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; struct o2hb_debug_buf *hr_db_elapsed_time; @@ -1394,21 +1390,20 @@ void o2hb_exit(void) kfree(o2hb_db_failedregions); } -static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, - struct o2hb_debug_buf **db, int db_len, - int type, int size, int len, void *data) +static void o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, int type, + int size, int len, void *data) { *db = kmalloc(db_len, GFP_KERNEL); if (!*db) - return NULL; + return; (*db)->db_type = type; (*db)->db_size = size; (*db)->db_len = len; (*db)->db_data = data; - return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, - &o2hb_debug_fops); + debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops); } static void o2hb_debug_init(void) @@ -1525,11 +1520,7 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slots); - debugfs_remove(reg->hr_debug_livenodes); - debugfs_remove(reg->hr_debug_regnum); - debugfs_remove(reg->hr_debug_elapsed_time); - debugfs_remove(reg->hr_debug_pinned); - debugfs_remove(reg->hr_debug_dir); + debugfs_remove_recursive(reg->hr_debug_dir); kfree(reg->hr_db_livenodes); kfree(reg->hr_db_regnum); kfree(reg->hr_db_elapsed_time); @@ -1988,69 +1979,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } -static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +static void o2hb_debug_region_init(struct o2hb_region *reg, + struct dentry *parent) { - int ret = -ENOMEM; + struct dentry *dir; - reg->hr_debug_dir = - debugfs_create_dir(config_item_name(®->hr_item), dir); - if (!reg->hr_debug_dir) { - mlog_errno(ret); - goto bail; - } + dir = debugfs_create_dir(config_item_name(®->hr_item), parent); + reg->hr_debug_dir = dir; - reg->hr_debug_livenodes = - o2hb_debug_create(O2HB_DEBUG_LIVENODES, - reg->hr_debug_dir, - &(reg->hr_db_livenodes), - sizeof(*(reg->hr_db_livenodes)), - O2HB_DB_TYPE_REGION_LIVENODES, - sizeof(reg->hr_live_node_bitmap), - O2NM_MAX_NODES, reg); - if (!reg->hr_debug_livenodes) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, + reg); - reg->hr_debug_regnum = - o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, - reg->hr_debug_dir, - &(reg->hr_db_regnum), - sizeof(*(reg->hr_db_regnum)), - O2HB_DB_TYPE_REGION_NUMBER, - 0, O2NM_MAX_NODES, reg); - if (!reg->hr_debug_regnum) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); - reg->hr_debug_elapsed_time = - o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, - reg->hr_debug_dir, - &(reg->hr_db_elapsed_time), - sizeof(*(reg->hr_db_elapsed_time)), - O2HB_DB_TYPE_REGION_ELAPSED_TIME, - 0, 0, reg); - if (!reg->hr_debug_elapsed_time) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); - reg->hr_debug_pinned = - o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, - reg->hr_debug_dir, - &(reg->hr_db_pinned), - sizeof(*(reg->hr_db_pinned)), - O2HB_DB_TYPE_REGION_PINNED, - 0, 0, reg); - if (!reg->hr_debug_pinned) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); - ret = 0; -bail: - return ret; } static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, @@ -2106,11 +2061,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (ret) goto unregister_handler; - ret = o2hb_debug_region_init(reg, o2hb_debug_dir); - if (ret) { - config_item_put(®->hr_item); - goto unregister_handler; - } + o2hb_debug_region_init(reg, o2hb_debug_dir); return ®->hr_item; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 69a429b625cc..aaf24548b02a 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -142,7 +142,6 @@ struct dlm_ctxt atomic_t res_tot_count; atomic_t res_cur_count; - struct dlm_debug_ctxt *dlm_debug_ctxt; struct dentry *dlm_debugfs_subroot; /* NOTE: Next three are protected by dlm_domain_lock */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index a4b58ba99927..4d0b452012b2 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -853,67 +853,34 @@ static const struct file_operations debug_state_fops = { /* files in subroot */ void dlm_debug_init(struct dlm_ctxt *dlm) { - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - /* for dumping dlm_ctxt */ - dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_state_fops); + debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_state_fops); /* for dumping lockres */ - dc->debug_lockres_dentry = - debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_lockres_fops); + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops); /* for dumping mles */ - dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_mle_fops); + debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops); /* for dumping lockres on the purge list */ - dc->debug_purgelist_dentry = - debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_purgelist_fops); -} - -void dlm_debug_shutdown(struct dlm_ctxt *dlm) -{ - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - - if (dc) { - debugfs_remove(dc->debug_purgelist_dentry); - debugfs_remove(dc->debug_mle_dentry); - debugfs_remove(dc->debug_lockres_dentry); - debugfs_remove(dc->debug_state_dentry); - kfree(dc); - dc = NULL; - } + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, + &debug_purgelist_fops); } /* subroot - domain dir */ -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { - dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), - GFP_KERNEL); - if (!dlm->dlm_debug_ctxt) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, dlm_debugfs_root); - return 0; } void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { - debugfs_remove(dlm->dlm_debugfs_subroot); + debugfs_remove_recursive(dlm->dlm_debugfs_subroot); } /* debugfs root */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 7d0c7c9013ce..f8fd8680a4b6 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -14,13 +14,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle); #ifdef CONFIG_DEBUG_FS -struct dlm_debug_ctxt { - struct dentry *debug_state_dentry; - struct dentry *debug_lockres_dentry; - struct dentry *debug_mle_dentry; - struct dentry *debug_purgelist_dentry; -}; - struct debug_lockres { int dl_len; char *dl_buf; @@ -29,9 +22,8 @@ struct debug_lockres { }; void dlm_debug_init(struct dlm_ctxt *dlm); -void dlm_debug_shutdown(struct dlm_ctxt *dlm); -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_create_debugfs_root(void); @@ -42,13 +34,9 @@ void dlm_destroy_debugfs_root(void); static inline void dlm_debug_init(struct dlm_ctxt *dlm) { } -static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) +static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { } -static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) -{ - return 0; -} static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7338b5d4647c..ee6f459f9770 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -387,7 +387,6 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1938,7 +1937,6 @@ bail: if (status) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1992,9 +1990,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->key = key; dlm->node_num = o2nm_this_node(); - ret = dlm_create_debugfs_subroot(dlm); - if (ret < 0) - goto leave; + dlm_create_debugfs_subroot(dlm); spin_lock_init(&dlm->spinlock); spin_lock_init(&dlm->master_lock); @@ -2056,6 +2052,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "context init: refcount %u\n", kref_read(&dlm->dlm_refs)); + ret = 0; leave: if (ret < 0 && dlm) { if (dlm->master_hash) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 14207234fa3d..ad594fef2ab0 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3012,8 +3012,6 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) kref_init(&dlm_debug->d_refcnt); INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); - dlm_debug->d_locking_state = NULL; - dlm_debug->d_locking_filter = NULL; dlm_debug->d_filter_secs = 0; out: return dlm_debug; @@ -3282,27 +3280,19 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) { struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - dlm_debug->d_locking_state = debugfs_create_file("locking_state", - S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_dlm_debug_fops); + debugfs_create_file("locking_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); - dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", - 0600, - osb->osb_debug_root, - &dlm_debug->d_filter_secs); + debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, + &dlm_debug->d_filter_secs); } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) { struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - if (dlm_debug) { - debugfs_remove(dlm_debug->d_locking_state); - debugfs_remove(dlm_debug->d_locking_filter); + if (dlm_debug) ocfs2_put_dlm_debug(dlm_debug); - } } int ocfs2_dlm_init(struct ocfs2_super *osb) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fddbbd60f434..9150cfa4df7d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -223,8 +223,6 @@ struct ocfs2_orphan_scan { struct ocfs2_dlm_debug { struct kref d_refcnt; - struct dentry *d_locking_state; - struct dentry *d_locking_filter; u32 d_filter_secs; struct list_head d_lockres_tracking; }; @@ -401,7 +399,6 @@ struct ocfs2_super struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; - struct dentry *osb_ctxt; wait_queue_head_t recovery_event; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8b2f39506648..c81e86c62380 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1080,10 +1080,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_osb_debug_fops); + debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, + osb, &ocfs2_osb_debug_fops); if (ocfs2_meta_ecc(osb)) ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, @@ -1861,8 +1859,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) kset_unregister(osb->osb_dev_kset); - debugfs_remove(osb->osb_ctxt); - /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); @@ -1918,7 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_dlm_shutdown(osb, hangup_needed); ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); - debugfs_remove(osb->osb_debug_root); + debugfs_remove_recursive(osb->osb_debug_root); if (hangup_needed) ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); From 3dd21cdbefa97cf3ec5559f9ab214af4d8ae2096 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Mon, 23 Sep 2019 15:33:18 -0700 Subject: [PATCH 0970/2880] ocfs2: remove unused ocfs2_calc_tree_trunc_credits() ocfs2_calc_tree_trunc_credits() is not called anywhere. Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4014FC2050F@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: guozhonghua Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.h | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index f37473c20a7b..6d8dd3308aaf 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -575,34 +575,6 @@ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) return ocfs2_extent_recs_per_gd(sb); } -static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, - unsigned int clusters_to_del, - struct ocfs2_dinode *fe, - struct ocfs2_extent_list *last_el) -{ - /* for dinode + all headers in this pass + update to next leaf */ - u16 next_free = le16_to_cpu(last_el->l_next_free_rec); - u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); - int credits = 1 + tree_depth + 1; - int i; - - i = next_free - 1; - BUG_ON(i < 0); - - /* We may be deleting metadata blocks, so metadata alloc dinode + - one desc. block for each possible delete. */ - if (tree_depth && next_free == 1 && - ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) - credits += 1 + tree_depth; - - /* update to the truncate log. */ - credits += OCFS2_TRUNCATE_LOG_UPDATE; - - credits += ocfs2_quota_trans_credits(sb); - - return credits; -} - static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { From bf5a52647963cd08b2e999f77b16739d6300ebce Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Mon, 23 Sep 2019 15:33:21 -0700 Subject: [PATCH 0971/2880] ocfs2: remove unused ocfs2_orphan_scan_exit() declaration ocfs2_orphan_scan_exit() is declared but not implemented. Also perform a minor cleanup in ocfs2_link_credits() Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4014FC208AC@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: guozhonghua Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 6d8dd3308aaf..3103ba7f97a2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -144,7 +144,6 @@ static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, void ocfs2_orphan_scan_init(struct ocfs2_super *osb); void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); -void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); void ocfs2_complete_recovery(struct work_struct *work); void ocfs2_wait_for_recovery(struct ocfs2_super *osb); @@ -441,7 +440,7 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, * previous dirblock update in the free list */ static inline int ocfs2_link_credits(struct super_block *sb) { - return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + + return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 + ocfs2_quota_trans_credits(sb); } From 225dcadf8ee869d4429373ef04fdce49ac3fc266 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Sep 2019 15:33:24 -0700 Subject: [PATCH 0972/2880] fs/ocfs2/namei.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/ocfs2/namei.c: In function ocfs2_create_inode_in_orphan: fs/ocfs2/namei.c:2503:23: warning: variable di set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566522588-63786-2-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: zhengbin Signed-off-by: Joseph Qi Reported-by: Hulk Robot Reviewed-by: Joseph Qi Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6f8e1c4fdb9c..8ea51cf27b97 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2486,7 +2486,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct inode *inode = NULL; struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); - struct ocfs2_dinode *di = NULL; handle_t *handle = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; struct buffer_head *parent_di_bh = NULL; @@ -2552,7 +2551,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) { From 236dcc2ae4945e00201dbab9b8f76905849b7515 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Sep 2019 15:33:27 -0700 Subject: [PATCH 0973/2880] fs/ocfs2/file.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/ocfs2/file.c: In function ocfs2_prepare_inode_for_write: fs/ocfs2/file.c:2143:9: warning: variable end set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566522588-63786-3-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: zhengbin Signed-off-by: Joseph Qi Reported-by: Hulk Robot Reviewed-by: Joseph Qi Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index efe9988a5be4..2e982db3e1ae 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2130,7 +2130,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); struct buffer_head *di_bh = NULL; - loff_t end; /* * We start with a read level meta lock and only jump to an ex @@ -2194,8 +2193,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, } } - end = pos + count; - ret = ocfs2_check_range_for_refcount(inode, pos, count); if (ret == 1) { ocfs2_inode_unlock(inode, meta_level); From 77461ba1d17673da47eeb5cccb928e10cf97a01e Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Sep 2019 15:33:30 -0700 Subject: [PATCH 0974/2880] fs/ocfs2/dir.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/ocfs2/dir.c: In function ocfs2_dx_dir_transfer_leaf: fs/ocfs2/dir.c:3653:42: warning: variable new_list set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566522588-63786-4-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: zhengbin Signed-off-by: Joseph Qi Reported-by: Hulk Robot Reviewed-by: Joseph Qi Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 784426dee56c..bdef72c0f099 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3636,7 +3636,7 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, int i, j, num_used; u32 major_hash; struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; - struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry_list *orig_list, *tmp_list; struct ocfs2_dx_entry *dx_entry; tmp_list = &tmp_dx_leaf->dl_list; @@ -3645,7 +3645,6 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; orig_list = &orig_dx_leaf->dl_list; new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; - new_list = &new_dx_leaf->dl_list; num_used = le16_to_cpu(orig_list->de_num_used); From a89bd89fae638965ca5a79a3467d79f926260882 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Mon, 23 Sep 2019 15:33:34 -0700 Subject: [PATCH 0975/2880] ocfs2: delete unnecessary checks before brelse() brelse() tests whether its argument is NULL and then returns immediately. Thus the tests around the shown calls are not needed. This issue was detected by using the Coccinelle software. Link: http://lkml.kernel.org/r/55cde320-394b-f985-56ce-1a2abea782aa@web.de Signed-off-by: Markus Elfring Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 7 ++----- fs/ocfs2/extent_map.c | 3 +-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index ad594fef2ab0..6e774c5ea13b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2508,9 +2508,7 @@ bail: ocfs2_inode_unlock(inode, ex); } - if (local_bh) - brelse(local_bh); - + brelse(local_bh); return status; } @@ -2593,8 +2591,7 @@ int ocfs2_inode_lock_atime(struct inode *inode, *level = 1; if (ocfs2_should_update_atime(inode, vfsmnt)) ocfs2_update_inode_atime(inode, bh); - if (bh) - brelse(bh); + brelse(bh); } else *level = 0; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e66a249fe07c..e3e2d1b2af51 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -590,8 +590,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, *extent_flags = rec->e_flags; } out: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); return ret; } From 0a3775e4f883912944481cf2ef36eb6383a9cc74 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Mon, 23 Sep 2019 15:33:37 -0700 Subject: [PATCH 0976/2880] ocfs2: wait for recovering done after direct unlock request There is a scenario causing ocfs2 umount hang when multiple hosts are rebooting at the same time. NODE1 NODE2 NODE3 send unlock requset to NODE2 dies become recovery master recover NODE2 find NODE2 dead mark resource RECOVERING directly remove lock from grant list calculate usage but RECOVERING marked **miss the window of purging clear RECOVERING To reproduce this issue, crash a host and then umount ocfs2 from another node. To solve this, just let unlock progress wait for recovery done. Link: http://lkml.kernel.org/r/1550124866-20367-1-git-send-email-gechangwei@live.cn Signed-off-by: Changwei Ge Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmunlock.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index e78657742bd8..3883633e82eb 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -90,7 +90,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, enum dlm_status status; int actions = 0; int in_use; - u8 owner; + u8 owner; + int recovery_wait = 0; mlog(0, "master_node = %d, valblk = %d\n", master_node, flags & LKM_VALBLK); @@ -193,9 +194,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, } if (flags & LKM_CANCEL) lock->cancel_pending = 0; - else - lock->unlock_pending = 0; - + else { + if (!lock->unlock_pending) + recovery_wait = 1; + else + lock->unlock_pending = 0; + } } /* get an extra ref on lock. if we are just switching @@ -229,6 +233,17 @@ leave: spin_unlock(&res->spinlock); wake_up(&res->wq); + if (recovery_wait) { + spin_lock(&res->spinlock); + /* Unlock request will directly succeed after owner dies, + * and the lock is already removed from grant list. We have to + * wait for RECOVERING done or we miss the chance to purge it + * since the removement is much faster than RECOVERING proc. + */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING); + spin_unlock(&res->spinlock); + } + /* let the caller's final dlm_lock_put handle the actual kfree */ if (actions & DLM_UNLOCK_FREE_LOCK) { /* this should always be coupled with list removal */ From d7283b39dbf3de4fe54870f07d7975effa3188da Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Mon, 23 Sep 2019 15:33:40 -0700 Subject: [PATCH 0977/2880] ocfs2: checkpoint appending truncate log transaction before flushing Appending truncate log(TA) and and flushing truncate log(TF) are two separated transactions. They can be both committed but not checkpointed. If crash occurs then, both transaction will be replayed with several already released to global bitmap clusters. Then truncate log will be replayed resulting in cluster double free. To reproduce this issue, just crash the host while punching hole to files. Signed-off-by: Changwei Ge Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f5d2bd15e0ca..f9baefc76cf9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5993,6 +5993,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct buffer_head *data_alloc_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; + struct ocfs2_journal *journal = osb->journal; BUG_ON(inode_trylock(tl_inode)); @@ -6013,6 +6014,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } + /* Appending truncate log(TA) and and flushing truncate log(TF) are + * two separated transactions. They can be both committed but not + * checkpointed. If crash occurs then, both two transaction will be + * replayed with several already released to global bitmap clusters. + * Then truncate log will be replayed resulting in cluster double free. + */ + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) { + mlog_errno(status); + goto out; + } + data_alloc_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); From 1c3ce5417b338ba3c697bac6485581ba117d8e52 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 23 Sep 2019 15:33:43 -0700 Subject: [PATCH 0978/2880] ocfs2: fix spelling mistake "ambigous" -> "ambiguous" There is a spelling mistake in a mlog_bug_on_msg message. Fix it. Link: http://lkml.kernel.org/r/831bdff4-064e-038b-f45d-c4d265cbff1e@linux.alibaba.com Signed-off-by: Colin Ian King Acked-by: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7ad9d6590818..7c9dfd50c1c1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -534,7 +534,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, */ mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), - "Inode %llu: system file state is ambigous\n", + "Inode %llu: system file state is ambiguous\n", (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || From 04f768a39d55967246c002aa66b407b3bfdd8269 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 23 Sep 2019 15:33:46 -0700 Subject: [PATCH 0979/2880] mm, slab: extend slab/shrink to shrink all memcg caches Currently, a value of '1" is written to /sys/kernel/slab//shrink file to shrink the slab by flushing out all the per-cpu slabs and free slabs in partial lists. This can be useful to squeeze out a bit more memory under extreme condition as well as making the active object counts in /proc/slabinfo more accurate. This usually applies only to the root caches, as the SLUB_MEMCG_SYSFS_ON option is usually not enabled and "slub_memcg_sysfs=1" not set. Even if memcg sysfs is turned on, it is too cumbersome and impractical to manage all those per-memcg sysfs files in a real production system. So there is no practical way to shrink memcg caches. Fix this by enabling a proper write to the shrink sysfs file of the root cache to scan all the available memcg caches and shrink them as well. For a non-root memcg cache (when SLUB_MEMCG_SYSFS_ON or slub_memcg_sysfs is on), only that cache will be shrunk when written. On a 2-socket 64-core 256-thread arm64 system with 64k page after a parallel kernel build, the the amount of memory occupied by slabs before shrinking slabs were: # grep task_struct /proc/slabinfo task_struct 53137 53192 4288 61 4 : tunables 0 0 0 : slabdata 872 872 0 # grep "^S[lRU]" /proc/meminfo Slab: 3936832 kB SReclaimable: 399104 kB SUnreclaim: 3537728 kB After shrinking slabs (by echoing "1" to all shrink files): # grep "^S[lRU]" /proc/meminfo Slab: 1356288 kB SReclaimable: 263296 kB SUnreclaim: 1092992 kB # grep task_struct /proc/slabinfo task_struct 2764 6832 4288 61 4 : tunables 0 0 0 : slabdata 112 112 0 Link: http://lkml.kernel.org/r/20190723151445.7385-1-longman@redhat.com Signed-off-by: Waiman Long Acked-by: Roman Gushchin Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-kernel-slab | 13 +++++--- mm/slab.h | 1 + mm/slab_common.c | 37 +++++++++++++++++++++ mm/slub.c | 2 +- 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 29601d93a1c2..ed35833ad7f0 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -429,10 +429,15 @@ KernelVersion: 2.6.22 Contact: Pekka Enberg , Christoph Lameter Description: - The shrink file is written when memory should be reclaimed from - a cache. Empty partial slabs are freed and the partial list is - sorted so the slabs with the fewest available objects are used - first. + The shrink file is used to reclaim unused slab cache + memory from a cache. Empty per-cpu or partial slabs + are freed and the partial list is sorted so the slabs + with the fewest available objects are used first. + It only accepts a value of "1" on write for shrinking + the cache. Other input values are considered invalid. + Shrinking slab caches might be expensive and can + adversely impact other running applications. So it + should be used with care. What: /sys/kernel/slab/cache/slab_size Date: May 2007 diff --git a/mm/slab.h b/mm/slab.h index 9057b8056b07..5bf615cb3f99 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -174,6 +174,7 @@ int __kmem_cache_shrink(struct kmem_cache *); void __kmemcg_cache_deactivate(struct kmem_cache *s); void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); void slab_kmem_cache_release(struct kmem_cache *); +void kmem_cache_shrink_all(struct kmem_cache *s); struct seq_file; struct file; diff --git a/mm/slab_common.c b/mm/slab_common.c index 807490fe217a..6491c3a41805 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -981,6 +981,43 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); +/** + * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache + * @s: The cache pointer + */ +void kmem_cache_shrink_all(struct kmem_cache *s) +{ + struct kmem_cache *c; + + if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) { + kmem_cache_shrink(s); + return; + } + + get_online_cpus(); + get_online_mems(); + kasan_cache_shrink(s); + __kmem_cache_shrink(s); + + /* + * We have to take the slab_mutex to protect from the memcg list + * modification. + */ + mutex_lock(&slab_mutex); + for_each_memcg_cache(c, s) { + /* + * Don't need to shrink deactivated memcg caches. + */ + if (s->flags & SLAB_DEACTIVATED) + continue; + kasan_cache_shrink(c); + __kmem_cache_shrink(c); + } + mutex_unlock(&slab_mutex); + put_online_mems(); + put_online_cpus(); +} + bool slab_is_available(void) { return slab_state >= UP; diff --git a/mm/slub.c b/mm/slub.c index 8834563cdb4b..66808d3e97d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5298,7 +5298,7 @@ static ssize_t shrink_store(struct kmem_cache *s, const char *buf, size_t length) { if (buf[0] == '1') - kmem_cache_shrink(s); + kmem_cache_shrink_all(s); else return -EINVAL; return length; From 9adeaa226988b97bc15928e12f40a9863134467c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 23 Sep 2019 15:33:49 -0700 Subject: [PATCH 0980/2880] mm, slab: move memcg_cache_params structure to mm/slab.h The memcg_cache_params structure is only embedded into the kmem_cache of slab and slub allocators as defined in slab_def.h and slub_def.h and used internally by mm code. There is no needed to expose it in a public header. So move it from include/linux/slab.h to mm/slab.h. It is just a refactoring patch with no code change. In fact both the slub_def.h and slab_def.h should be moved into the mm directory as well, but that will probably cause many merge conflicts. Link: http://lkml.kernel.org/r/20190718180827.18758-1-longman@redhat.com Signed-off-by: Waiman Long Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 62 ------------------------------------------- mm/slab.h | 63 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 62 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 56c9c7eed34e..ab2b98ad76e1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -595,68 +595,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) return __kmalloc_node(size, flags, node); } -struct memcg_cache_array { - struct rcu_head rcu; - struct kmem_cache *entries[0]; -}; - -/* - * This is the main placeholder for memcg-related information in kmem caches. - * Both the root cache and the child caches will have it. For the root cache, - * this will hold a dynamically allocated array large enough to hold - * information about the currently limited memcgs in the system. To allow the - * array to be accessed without taking any locks, on relocation we free the old - * version only after a grace period. - * - * Root and child caches hold different metadata. - * - * @root_cache: Common to root and child caches. NULL for root, pointer to - * the root cache for children. - * - * The following fields are specific to root caches. - * - * @memcg_caches: kmemcg ID indexed table of child caches. This table is - * used to index child cachces during allocation and cleared - * early during shutdown. - * - * @root_caches_node: List node for slab_root_caches list. - * - * @children: List of all child caches. While the child caches are also - * reachable through @memcg_caches, a child cache remains on - * this list until it is actually destroyed. - * - * The following fields are specific to child caches. - * - * @memcg: Pointer to the memcg this cache belongs to. - * - * @children_node: List node for @root_cache->children list. - * - * @kmem_caches_node: List node for @memcg->kmem_caches list. - */ -struct memcg_cache_params { - struct kmem_cache *root_cache; - union { - struct { - struct memcg_cache_array __rcu *memcg_caches; - struct list_head __root_caches_node; - struct list_head children; - bool dying; - }; - struct { - struct mem_cgroup *memcg; - struct list_head children_node; - struct list_head kmem_caches_node; - struct percpu_ref refcnt; - - void (*work_fn)(struct kmem_cache *); - union { - struct rcu_head rcu_head; - struct work_struct work; - }; - }; - }; -}; - int memcg_update_all_caches(int num_memcgs); /** diff --git a/mm/slab.h b/mm/slab.h index 5bf615cb3f99..68e455f2b698 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -30,6 +30,69 @@ struct kmem_cache { struct list_head list; /* List of all slab caches on the system */ }; +#else /* !CONFIG_SLOB */ + +struct memcg_cache_array { + struct rcu_head rcu; + struct kmem_cache *entries[0]; +}; + +/* + * This is the main placeholder for memcg-related information in kmem caches. + * Both the root cache and the child caches will have it. For the root cache, + * this will hold a dynamically allocated array large enough to hold + * information about the currently limited memcgs in the system. To allow the + * array to be accessed without taking any locks, on relocation we free the old + * version only after a grace period. + * + * Root and child caches hold different metadata. + * + * @root_cache: Common to root and child caches. NULL for root, pointer to + * the root cache for children. + * + * The following fields are specific to root caches. + * + * @memcg_caches: kmemcg ID indexed table of child caches. This table is + * used to index child cachces during allocation and cleared + * early during shutdown. + * + * @root_caches_node: List node for slab_root_caches list. + * + * @children: List of all child caches. While the child caches are also + * reachable through @memcg_caches, a child cache remains on + * this list until it is actually destroyed. + * + * The following fields are specific to child caches. + * + * @memcg: Pointer to the memcg this cache belongs to. + * + * @children_node: List node for @root_cache->children list. + * + * @kmem_caches_node: List node for @memcg->kmem_caches list. + */ +struct memcg_cache_params { + struct kmem_cache *root_cache; + union { + struct { + struct memcg_cache_array __rcu *memcg_caches; + struct list_head __root_caches_node; + struct list_head children; + bool dying; + }; + struct { + struct mem_cgroup *memcg; + struct list_head children_node; + struct list_head kmem_caches_node; + struct percpu_ref refcnt; + + void (*work_fn)(struct kmem_cache *); + union { + struct rcu_head rcu_head; + struct work_struct work; + }; + }; + }; +}; #endif /* CONFIG_SLOB */ #ifdef CONFIG_SLAB From 9d5f0be0f7556873cd00125dc579a9d9d186b0d0 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:33:52 -0700 Subject: [PATCH 0981/2880] mm/slub.c: fix -Wunused-function compiler warnings tid_to_cpu() and tid_to_event() are only used in note_cmpxchg_failure() when SLUB_DEBUG_CMPXCHG=y, so when SLUB_DEBUG_CMPXCHG=n by default, Clang will complain that those unused functions. Link: http://lkml.kernel.org/r/1568752232-5094-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 66808d3e97d2..17fe1cac11fb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2004,6 +2004,7 @@ static inline unsigned long next_tid(unsigned long tid) return tid + TID_STEP; } +#ifdef SLUB_DEBUG_CMPXCHG static inline unsigned int tid_to_cpu(unsigned long tid) { return tid % TID_STEP; @@ -2013,6 +2014,7 @@ static inline unsigned long tid_to_event(unsigned long tid) { return tid / TID_STEP; } +#endif static inline unsigned int init_tid(int cpu) { From b751c52bb587ae66f773b15204ef7a147467f4c7 Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Mon, 23 Sep 2019 15:33:55 -0700 Subject: [PATCH 0982/2880] kmemleak: increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE default to 16K The current default value (400) is too low on many systems (e.g. some ARM64 platform takes up 1000+ entries). syzbot uses 16000 as default value, and has proved to be enough on beefy configurations, so let's pick that value. This consumes more RAM on boot (each entry is 160 bytes, so in total ~2.5MB of RAM), but the memory would later be freed (early_log is __initdata). Link: http://lkml.kernel.org/r/20190730154027.101525-1-drinkcat@chromium.org Signed-off-by: Nicolas Boichat Suggested-by: Dmitry Vyukov Acked-by: Catalin Marinas Acked-by: Dmitry Vyukov Cc: Masahiro Yamada Cc: Kees Cook Cc: Petr Mladek Cc: Thomas Gleixner Cc: Tetsuo Handa Cc: Joe Lawrence Cc: Uladzislau Rezki Cc: Andy Shevchenko Cc: Stephen Rothwell Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e0e14780a13d..3c88e54da86c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -580,7 +580,7 @@ config DEBUG_KMEMLEAK_EARLY_LOG_SIZE int "Maximum kmemleak early log entries" depends on DEBUG_KMEMLEAK range 200 40000 - default 400 + default 16000 help Kmemleak must track all the memory allocations to avoid reporting false positives. Since memory may be allocated or From dba82d9431770e68c45b03f0ffa2daa8abfb9429 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Sep 2019 15:33:59 -0700 Subject: [PATCH 0983/2880] mm: kmemleak: make the tool tolerant to struct scan_area allocation failures Patch series "mm: kmemleak: Use a memory pool for kmemleak object allocations", v3. Following the discussions on v2 of this patch(set) [1], this series takes slightly different approach: - it implements its own simple memory pool that does not rely on the slab allocator - drops the early log buffer logic entirely since it can now allocate metadata from the memory pool directly before kmemleak is fully initialised - CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE option is renamed to CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE - moves the kmemleak_init() call earlier (mm_init()) - to avoid a separate memory pool for struct scan_area, it makes the tool robust when such allocations fail as scan areas are rather an optimisation [1] http://lkml.kernel.org/r/20190727132334.9184-1-catalin.marinas@arm.com This patch (of 3): Object scan areas are an optimisation aimed to decrease the false positives and slightly improve the scanning time of large objects known to only have a few specific pointers. If a struct scan_area fails to allocate, kmemleak can still function normally by scanning the full object. Introduce an OBJECT_FULL_SCAN flag and mark objects as such when scan_area allocation fails. Link: http://lkml.kernel.org/r/20190812160642.52134-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Michal Hocko Cc: Matthew Wilcox Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f6e602918dac..5ba7fad00fda 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -168,6 +168,8 @@ struct kmemleak_object { #define OBJECT_REPORTED (1 << 1) /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +/* flag set to fully scan the object when scan_area allocation failed */ +#define OBJECT_FULL_SCAN (1 << 3) #define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ @@ -773,12 +775,14 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) } area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); - if (!area) { - pr_warn("Cannot allocate a scan area\n"); - goto out; - } spin_lock_irqsave(&object->lock, flags); + if (!area) { + pr_warn_once("Cannot allocate a scan area, scanning the full object\n"); + /* mark the object for full scan to avoid false positives */ + object->flags |= OBJECT_FULL_SCAN; + goto out_unlock; + } if (size == SIZE_MAX) { size = object->pointer + object->size - ptr; } else if (ptr + size > object->pointer + object->size) { @@ -795,7 +799,6 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) hlist_add_head(&area->node, &object->area_list); out_unlock: spin_unlock_irqrestore(&object->lock, flags); -out: put_object(object); } @@ -1408,7 +1411,8 @@ static void scan_object(struct kmemleak_object *object) if (!(object->flags & OBJECT_ALLOCATED)) /* already freed object */ goto out; - if (hlist_empty(&object->area_list)) { + if (hlist_empty(&object->area_list) || + object->flags & OBJECT_FULL_SCAN) { void *start = (void *)object->pointer; void *end = (void *)(object->pointer + object->size); void *next; From 0647398a8c7bd55e0b7565c5076e86b7c3c204c5 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Sep 2019 15:34:02 -0700 Subject: [PATCH 0984/2880] mm: kmemleak: simple memory allocation pool for kmemleak objects Add a memory pool for struct kmemleak_object in case the normal kmem_cache_alloc() fails under the gfp constraints passed by the caller. The mem_pool[] array size is currently fixed at 16000. We are not using the existing mempool kernel API since this requires the slab allocator to be available (for pool->elements allocation). A subsequent kmemleak patch will replace the static early log buffer with the pool allocation introduced here and this functionality is required to be available before the slab was initialised. Link: http://lkml.kernel.org/r/20190812160642.52134-3-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Matthew Wilcox Cc: Michal Hocko Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5ba7fad00fda..2fb86524d70b 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -180,11 +180,17 @@ struct kmemleak_object { #define HEX_ASCII 1 /* max number of lines to be printed */ #define HEX_MAX_LINES 2 +/* memory pool size */ +#define MEM_POOL_SIZE 16000 /* the list of all allocated objects */ static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); +/* memory pool allocation */ +static struct kmemleak_object mem_pool[MEM_POOL_SIZE]; +static int mem_pool_free_count = ARRAY_SIZE(mem_pool); +static LIST_HEAD(mem_pool_free_list); /* search tree for object boundaries */ static struct rb_root object_tree_root = RB_ROOT; /* rw_lock protecting the access to object_list and object_tree_root */ @@ -451,6 +457,50 @@ static int get_object(struct kmemleak_object *object) return atomic_inc_not_zero(&object->use_count); } +/* + * Memory pool allocation and freeing. kmemleak_lock must not be held. + */ +static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object; + + /* try the slab allocator first */ + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + if (object) + return object; + + /* slab allocation failed, try the memory pool */ + write_lock_irqsave(&kmemleak_lock, flags); + object = list_first_entry_or_null(&mem_pool_free_list, + typeof(*object), object_list); + if (object) + list_del(&object->object_list); + else if (mem_pool_free_count) + object = &mem_pool[--mem_pool_free_count]; + write_unlock_irqrestore(&kmemleak_lock, flags); + + return object; +} + +/* + * Return the object to either the slab allocator or the memory pool. + */ +static void mem_pool_free(struct kmemleak_object *object) +{ + unsigned long flags; + + if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) { + kmem_cache_free(object_cache, object); + return; + } + + /* add the object to the memory pool free list */ + write_lock_irqsave(&kmemleak_lock, flags); + list_add(&object->object_list, &mem_pool_free_list); + write_unlock_irqrestore(&kmemleak_lock, flags); +} + /* * RCU callback to free a kmemleak_object. */ @@ -469,7 +519,7 @@ static void free_object_rcu(struct rcu_head *rcu) hlist_del(&area->node); kmem_cache_free(scan_area_cache, area); } - kmem_cache_free(object_cache, object); + mem_pool_free(object); } /* @@ -552,7 +602,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct rb_node **link, *rb_parent; unsigned long untagged_ptr; - object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + object = mem_pool_alloc(gfp); if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); From c5665868183fec689dbab9fb8505188b2c4f0757 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Sep 2019 15:34:05 -0700 Subject: [PATCH 0985/2880] mm: kmemleak: use the memory pool for early allocations Currently kmemleak uses a static early_log buffer to trace all memory allocation/freeing before the slab allocator is initialised. Such early log is replayed during kmemleak_init() to properly initialise the kmemleak metadata for objects allocated up that point. With a memory pool that does not rely on the slab allocator, it is possible to skip this early log entirely. In order to remove the early logging, consider kmemleak_enabled == 1 by default while the kmem_cache availability is checked directly on the object_cache and scan_area_cache variables. The RCU callback is only invoked after object_cache has been initialised as we wouldn't have any concurrent list traversal before this. In order to reduce the number of callbacks before kmemleak is fully initialised, move the kmemleak_init() call to mm_init(). [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: remove WARN_ON(), per Catalin] Link: http://lkml.kernel.org/r/20190812160642.52134-4-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Matthew Wilcox Cc: Michal Hocko Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 +- lib/Kconfig.debug | 11 +- mm/kmemleak.c | 265 +++++----------------------------------------- 3 files changed, 33 insertions(+), 245 deletions(-) diff --git a/init/main.c b/init/main.c index 653693da8da6..3ca67e8b92fd 100644 --- a/init/main.c +++ b/init/main.c @@ -556,6 +556,7 @@ static void __init mm_init(void) report_meminit(); mem_init(); kmem_cache_init(); + kmemleak_init(); pgtable_init(); debug_objects_mem_init(); vmalloc_init(); @@ -740,7 +741,6 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif - kmemleak_init(); setup_per_cpu_pageset(); numa_policy_init(); acpi_early_init(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3c88e54da86c..c6975cded461 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -576,17 +576,18 @@ config DEBUG_KMEMLEAK In order to access the kmemleak file, debugfs needs to be mounted (usually at /sys/kernel/debug). -config DEBUG_KMEMLEAK_EARLY_LOG_SIZE - int "Maximum kmemleak early log entries" +config DEBUG_KMEMLEAK_MEM_POOL_SIZE + int "Kmemleak memory pool size" depends on DEBUG_KMEMLEAK range 200 40000 default 16000 help Kmemleak must track all the memory allocations to avoid reporting false positives. Since memory may be allocated or - freed before kmemleak is initialised, an early log buffer is - used to store these actions. If kmemleak reports "early log - buffer exceeded", please increase this value. + freed before kmemleak is fully initialised, use a static pool + of metadata objects to track such callbacks. After kmemleak is + fully initialised, this memory pool acts as an emergency one + if slab allocations fail. config DEBUG_KMEMLEAK_TEST tristate "Simple test for the kernel memory leak detector" diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2fb86524d70b..b8bbe9ac5472 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -180,15 +180,13 @@ struct kmemleak_object { #define HEX_ASCII 1 /* max number of lines to be printed */ #define HEX_MAX_LINES 2 -/* memory pool size */ -#define MEM_POOL_SIZE 16000 /* the list of all allocated objects */ static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); /* memory pool allocation */ -static struct kmemleak_object mem_pool[MEM_POOL_SIZE]; +static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE]; static int mem_pool_free_count = ARRAY_SIZE(mem_pool); static LIST_HEAD(mem_pool_free_list); /* search tree for object boundaries */ @@ -201,13 +199,11 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static int kmemleak_enabled; +static int kmemleak_enabled = 1; /* same as above but only for the kmemleak_free() callback */ -static int kmemleak_free_enabled; +static int kmemleak_free_enabled = 1; /* set in the late_initcall if there were no errors */ static int kmemleak_initialized; -/* enables or disables early logging of the memory operations */ -static int kmemleak_early_log = 1; /* set if a kmemleak warning was issued */ static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ @@ -235,49 +231,6 @@ static bool kmemleak_found_leaks; static bool kmemleak_verbose; module_param_named(verbose, kmemleak_verbose, bool, 0600); -/* - * Early object allocation/freeing logging. Kmemleak is initialized after the - * kernel allocator. However, both the kernel allocator and kmemleak may - * allocate memory blocks which need to be tracked. Kmemleak defines an - * arbitrary buffer to hold the allocation/freeing information before it is - * fully initialized. - */ - -/* kmemleak operation type for early logging */ -enum { - KMEMLEAK_ALLOC, - KMEMLEAK_ALLOC_PERCPU, - KMEMLEAK_FREE, - KMEMLEAK_FREE_PART, - KMEMLEAK_FREE_PERCPU, - KMEMLEAK_NOT_LEAK, - KMEMLEAK_IGNORE, - KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN, - KMEMLEAK_SET_EXCESS_REF -}; - -/* - * Structure holding the information passed to kmemleak callbacks during the - * early logging. - */ -struct early_log { - int op_type; /* kmemleak operation type */ - int min_count; /* minimum reference count */ - const void *ptr; /* allocated/freed memory block */ - union { - size_t size; /* memory block size */ - unsigned long excess_ref; /* surplus reference passing */ - }; - unsigned long trace[MAX_TRACE]; /* stack trace */ - unsigned int trace_len; /* stack trace length */ -}; - -/* early logging buffer and current position */ -static struct early_log - early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; -static int crt_early_log __initdata; - static void kmemleak_disable(void); /* @@ -466,9 +419,11 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) struct kmemleak_object *object; /* try the slab allocator first */ - object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); - if (object) - return object; + if (object_cache) { + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + if (object) + return object; + } /* slab allocation failed, try the memory pool */ write_lock_irqsave(&kmemleak_lock, flags); @@ -478,6 +433,8 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) list_del(&object->object_list); else if (mem_pool_free_count) object = &mem_pool[--mem_pool_free_count]; + else + pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n"); write_unlock_irqrestore(&kmemleak_lock, flags); return object; @@ -537,7 +494,15 @@ static void put_object(struct kmemleak_object *object) /* should only get here after delete_object was called */ WARN_ON(object->flags & OBJECT_ALLOCATED); - call_rcu(&object->rcu, free_object_rcu); + /* + * It may be too early for the RCU callbacks, however, there is no + * concurrent object_list traversal when !object_cache and all objects + * came from the memory pool. Free the object directly. + */ + if (object_cache) + call_rcu(&object->rcu, free_object_rcu); + else + free_object_rcu(&object->rcu); } /* @@ -741,9 +706,7 @@ static void delete_object_part(unsigned long ptr, size_t size) /* * Create one or two objects that may result from the memory block * split. Note that partial freeing is only done by free_bootmem() and - * this happens before kmemleak_init() is called. The path below is - * only executed during early log recording in kmemleak_init(), so - * GFP_KERNEL is enough. + * this happens before kmemleak_init() is called. */ start = object->pointer; end = object->pointer + object->size; @@ -815,7 +778,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; - struct kmemleak_scan_area *area; + struct kmemleak_scan_area *area = NULL; object = find_and_get_object(ptr, 1); if (!object) { @@ -824,7 +787,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); + if (scan_area_cache) + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); spin_lock_irqsave(&object->lock, flags); if (!area) { @@ -898,86 +862,6 @@ static void object_no_scan(unsigned long ptr) put_object(object); } -/* - * Log an early kmemleak_* call to the early_log buffer. These calls will be - * processed later once kmemleak is fully initialized. - */ -static void __init log_early(int op_type, const void *ptr, size_t size, - int min_count) -{ - unsigned long flags; - struct early_log *log; - - if (kmemleak_error) { - /* kmemleak stopped recording, just count the requests */ - crt_early_log++; - return; - } - - if (crt_early_log >= ARRAY_SIZE(early_log)) { - crt_early_log++; - kmemleak_disable(); - return; - } - - /* - * There is no need for locking since the kernel is still in UP mode - * at this stage. Disabling the IRQs is enough. - */ - local_irq_save(flags); - log = &early_log[crt_early_log]; - log->op_type = op_type; - log->ptr = ptr; - log->size = size; - log->min_count = min_count; - log->trace_len = __save_stack_trace(log->trace); - crt_early_log++; - local_irq_restore(flags); -} - -/* - * Log an early allocated block and populate the stack trace. - */ -static void early_alloc(struct early_log *log) -{ - struct kmemleak_object *object; - unsigned long flags; - int i; - - if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) - return; - - /* - * RCU locking needed to ensure object is not freed via put_object(). - */ - rcu_read_lock(); - object = create_object((unsigned long)log->ptr, log->size, - log->min_count, GFP_ATOMIC); - if (!object) - goto out; - spin_lock_irqsave(&object->lock, flags); - for (i = 0; i < log->trace_len; i++) - object->trace[i] = log->trace[i]; - object->trace_len = log->trace_len; - spin_unlock_irqrestore(&object->lock, flags); -out: - rcu_read_unlock(); -} - -/* - * Log an early allocated block and populate the stack trace. - */ -static void early_alloc_percpu(struct early_log *log) -{ - unsigned int cpu; - const void __percpu *ptr = log->ptr; - - for_each_possible_cpu(cpu) { - log->ptr = per_cpu_ptr(ptr, cpu); - early_alloc(log); - } -} - /** * kmemleak_alloc - register a newly allocated object * @ptr: pointer to beginning of the object @@ -999,8 +883,6 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); - else if (kmemleak_early_log) - log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -1028,8 +910,6 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), size, 0, gfp); - else if (kmemleak_early_log) - log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -1054,11 +934,6 @@ void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp create_object((unsigned long)area->addr, size, 2, gfp); object_set_excess_ref((unsigned long)area, (unsigned long)area->addr); - } else if (kmemleak_early_log) { - log_early(KMEMLEAK_ALLOC, area->addr, size, 2); - /* reusing early_log.size for storing area->addr */ - log_early(KMEMLEAK_SET_EXCESS_REF, - area, (unsigned long)area->addr, 0); } } EXPORT_SYMBOL_GPL(kmemleak_vmalloc); @@ -1076,8 +951,6 @@ void __ref kmemleak_free(const void *ptr) if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -1096,8 +969,6 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); - else if (kmemleak_early_log) - log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -1118,8 +989,6 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); - else if (kmemleak_early_log) - log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); @@ -1170,8 +1039,6 @@ void __ref kmemleak_not_leak(const void *ptr) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -1190,8 +1057,6 @@ void __ref kmemleak_ignore(const void *ptr) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1212,8 +1077,6 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); - else if (kmemleak_early_log) - log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -1232,8 +1095,6 @@ void __ref kmemleak_no_scan(const void *ptr) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); @@ -2020,7 +1881,6 @@ static void kmemleak_disable(void) /* stop any memory operation tracing */ kmemleak_enabled = 0; - kmemleak_early_log = 0; /* check whether it is too early for a kernel thread */ if (kmemleak_initialized) @@ -2048,20 +1908,11 @@ static int __init kmemleak_boot_config(char *str) } early_param("kmemleak", kmemleak_boot_config); -static void __init print_log_trace(struct early_log *log) -{ - pr_notice("Early log backtrace:\n"); - stack_trace_print(log->trace, log->trace_len, 2); -} - /* * Kmemleak initialization. */ void __init kmemleak_init(void) { - int i; - unsigned long flags; - #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { kmemleak_disable(); @@ -2069,28 +1920,15 @@ void __init kmemleak_init(void) } #endif + if (kmemleak_error) + return; + jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - if (crt_early_log > ARRAY_SIZE(early_log)) - pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", - crt_early_log); - - /* the kernel is still in UP mode, so disabling the IRQs is enough */ - local_irq_save(flags); - kmemleak_early_log = 0; - if (kmemleak_error) { - local_irq_restore(flags); - return; - } else { - kmemleak_enabled = 1; - kmemleak_free_enabled = 1; - } - local_irq_restore(flags); - /* register the data/bss sections */ create_object((unsigned long)_sdata, _edata - _sdata, KMEMLEAK_GREY, GFP_ATOMIC); @@ -2101,57 +1939,6 @@ void __init kmemleak_init(void) create_object((unsigned long)__start_ro_after_init, __end_ro_after_init - __start_ro_after_init, KMEMLEAK_GREY, GFP_ATOMIC); - - /* - * This is the point where tracking allocations is safe. Automatic - * scanning is started during the late initcall. Add the early logged - * callbacks to the kmemleak infrastructure. - */ - for (i = 0; i < crt_early_log; i++) { - struct early_log *log = &early_log[i]; - - switch (log->op_type) { - case KMEMLEAK_ALLOC: - early_alloc(log); - break; - case KMEMLEAK_ALLOC_PERCPU: - early_alloc_percpu(log); - break; - case KMEMLEAK_FREE: - kmemleak_free(log->ptr); - break; - case KMEMLEAK_FREE_PART: - kmemleak_free_part(log->ptr, log->size); - break; - case KMEMLEAK_FREE_PERCPU: - kmemleak_free_percpu(log->ptr); - break; - case KMEMLEAK_NOT_LEAK: - kmemleak_not_leak(log->ptr); - break; - case KMEMLEAK_IGNORE: - kmemleak_ignore(log->ptr); - break; - case KMEMLEAK_SCAN_AREA: - kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); - break; - case KMEMLEAK_NO_SCAN: - kmemleak_no_scan(log->ptr); - break; - case KMEMLEAK_SET_EXCESS_REF: - object_set_excess_ref((unsigned long)log->ptr, - log->excess_ref); - break; - default: - kmemleak_warn("Unknown early log operation: %d\n", - log->op_type); - } - - if (kmemleak_warning) { - print_log_trace(log); - kmemleak_warning = 0; - } - } } /* From 0e965a6bda80f3227dfb74af6ae644e396beaacb Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:34:07 -0700 Subject: [PATCH 0986/2880] mm/kmemleak.c: record the current memory pool size The only way to obtain the current memory pool size for a running kernel is to check the kernel config file which is inconvenient. Record it in the kernel messages. [akpm@linux-foundation.org: s/memory pool size/memory pool/available/, per Catalin] Link: http://lkml.kernel.org/r/1565809631-28933-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index b8bbe9ac5472..03a8d84badad 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1967,7 +1967,8 @@ static int __init kmemleak_late_init(void) mutex_unlock(&scan_mutex); } - pr_info("Kernel memory leak detector initialized\n"); + pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n", + mem_pool_free_count); return 0; } From c59180ae3e5b43d3534748b19a57905712ea5fff Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:34:10 -0700 Subject: [PATCH 0987/2880] mm/kmemleak: increase the max mem pool to 1M There are some machines with slow disk and fast CPUs. When they are under memory pressure, it could take a long time to swap before the OOM kicks in to free up some memory. As the results, it needs a large mem pool for kmemleak or suffering from higher chance of a kmemleak metadata allocation failure. 524288 proves to be the good number for all architectures here. Increase the upper bound to 1M to leave some room for the future. Link: http://lkml.kernel.org/r/1565807572-26041-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c6975cded461..6b1b1703a646 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -579,7 +579,7 @@ config DEBUG_KMEMLEAK config DEBUG_KMEMLEAK_MEM_POOL_SIZE int "Kmemleak memory pool size" depends on DEBUG_KMEMLEAK - range 200 40000 + range 200 1000000 default 16000 help Kmemleak must track all the memory allocations to avoid From ae8f06b31a83e54777514308a63f669a1fed519e Mon Sep 17 00:00:00 2001 From: Walter Wu Date: Mon, 23 Sep 2019 15:34:13 -0700 Subject: [PATCH 0988/2880] kasan: add memory corruption identification for software tag-based mode Add memory corruption identification at bug report for software tag-based mode. The report shows whether it is "use-after-free" or "out-of-bound" error instead of "invalid-access" error. This will make it easier for programmers to see the memory corruption problem. We extend the slab to store five old free pointer tag and free backtrace, we can check if the tagged address is in the slab record and make a good guess if the object is more like "use-after-free" or "out-of-bound". therefore every slab memory corruption can be identified whether it's "use-after-free" or "out-of-bound". [aryabinin@virtuozzo.com: simplify & clenup code] Link: https://lkml.kernel.org/r/3318f9d7-a760-3cc8-b700-f06108ae745f@virtuozzo.com] Link: http://lkml.kernel.org/r/20190821180332.11450-1-aryabinin@virtuozzo.com Signed-off-by: Walter Wu Signed-off-by: Andrey Ryabinin Acked-by: Andrey Konovalov Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 8 ++++++++ mm/kasan/common.c | 22 +++++++++++++++++++-- mm/kasan/kasan.h | 14 +++++++++++++- mm/kasan/report.c | 44 ++++++++++++++++++++++++++++++++---------- mm/kasan/tags_report.c | 24 +++++++++++++++++++++++ 5 files changed, 99 insertions(+), 13 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 7fa97a8b5717..6c9682ce0254 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -134,6 +134,14 @@ config KASAN_S390_4_LEVEL_PAGING to 3TB of RAM with KASan enabled). This options allows to force 4-level paging instead. +config KASAN_SW_TAGS_IDENTIFY + bool "Enable memory corruption identification" + depends on KASAN_SW_TAGS + help + This option enables best-effort identification of bug type + (use-after-free or out-of-bounds) at the cost of increased + memory consumption. + config TEST_KASAN tristate "Module for testing KASAN for bug detection" depends on m && KASAN diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 95d16a42db6b..6b6f1198c72b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -304,7 +304,6 @@ size_t kasan_metadata_size(struct kmem_cache *cache) struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, const void *object) { - BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); return (void *)object + cache->kasan_info.alloc_meta_offset; } @@ -315,6 +314,24 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, return (void *)object + cache->kasan_info.free_meta_offset; } + +static void kasan_set_free_info(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + u8 idx = 0; + + alloc_meta = get_alloc_info(cache, object); + +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + idx = alloc_meta->free_track_idx; + alloc_meta->free_pointer_tag[idx] = tag; + alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; +#endif + + set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); +} + void kasan_poison_slab(struct page *page) { unsigned long i; @@ -452,7 +469,8 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, unlikely(!(cache->flags & SLAB_KASAN))) return false; - set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); + kasan_set_free_info(cache, object, tag); + quarantine_put(get_free_info(cache, object), cache); return IS_ENABLED(CONFIG_KASAN_GENERIC); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 014f19e76247..35cff6bbb716 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -95,9 +95,19 @@ struct kasan_track { depot_stack_handle_t stack; }; +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY +#define KASAN_NR_FREE_STACKS 5 +#else +#define KASAN_NR_FREE_STACKS 1 +#endif + struct kasan_alloc_meta { struct kasan_track alloc_track; - struct kasan_track free_track; + struct kasan_track free_track[KASAN_NR_FREE_STACKS]; +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; + u8 free_track_idx; +#endif }; struct qlist_node { @@ -146,6 +156,8 @@ void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip); +struct page *kasan_addr_to_page(const void *addr); + #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 0e5f965f1882..621782100eaa 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -111,7 +111,7 @@ static void print_track(struct kasan_track *track, const char *prefix) } } -static struct page *addr_to_page(const void *addr) +struct page *kasan_addr_to_page(const void *addr) { if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) @@ -151,15 +151,38 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, (void *)(object_addr + cache->object_size)); } +static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + int i = 0; + + alloc_meta = get_alloc_info(cache, object); + +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { + if (alloc_meta->free_pointer_tag[i] == tag) + break; + } + if (i == KASAN_NR_FREE_STACKS) + i = alloc_meta->free_track_idx; +#endif + + return &alloc_meta->free_track[i]; +} + static void describe_object(struct kmem_cache *cache, void *object, - const void *addr) + const void *addr, u8 tag) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); if (cache->flags & SLAB_KASAN) { + struct kasan_track *free_track; + print_track(&alloc_info->alloc_track, "Allocated"); pr_err("\n"); - print_track(&alloc_info->free_track, "Freed"); + free_track = kasan_get_free_track(cache, object, tag); + print_track(free_track, "Freed"); pr_err("\n"); } @@ -344,9 +367,9 @@ static void print_address_stack_frame(const void *addr) print_decoded_frame_descr(frame_descr); } -static void print_address_description(void *addr) +static void print_address_description(void *addr, u8 tag) { - struct page *page = addr_to_page(addr); + struct page *page = kasan_addr_to_page(addr); dump_stack(); pr_err("\n"); @@ -355,7 +378,7 @@ static void print_address_description(void *addr) struct kmem_cache *cache = page->slab_cache; void *object = nearest_obj(cache, page, addr); - describe_object(cache, object, addr); + describe_object(cache, object, addr, tag); } if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { @@ -435,13 +458,14 @@ static bool report_enabled(void) void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; + u8 tag = get_tag(object); + object = reset_tag(object); start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); - print_tags(get_tag(object), reset_tag(object)); - object = reset_tag(object); + print_tags(tag, object); pr_err("\n"); - print_address_description(object); + print_address_description(object, tag); pr_err("\n"); print_shadow_for_address(object); end_report(&flags); @@ -479,7 +503,7 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon pr_err("\n"); if (addr_has_shadow(untagged_addr)) { - print_address_description(untagged_addr); + print_address_description(untagged_addr, get_tag(tagged_addr)); pr_err("\n"); print_shadow_for_address(info.first_bad_addr); } else { diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c index 8eaf5f722271..969ae08f59d7 100644 --- a/mm/kasan/tags_report.c +++ b/mm/kasan/tags_report.c @@ -36,6 +36,30 @@ const char *get_bug_type(struct kasan_access_info *info) { +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + struct kasan_alloc_meta *alloc_meta; + struct kmem_cache *cache; + struct page *page; + const void *addr; + void *object; + u8 tag; + int i; + + tag = get_tag(info->access_addr); + addr = reset_tag(info->access_addr); + page = kasan_addr_to_page(addr); + if (page && PageSlab(page)) { + cache = page->slab_cache; + object = nearest_obj(cache, page, (void *)addr); + alloc_meta = get_alloc_info(cache, object); + + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) + if (alloc_meta->free_pointer_tag[i] == tag) + return "use-after-free"; + return "out-of-bounds"; + } + +#endif return "invalid-access"; } From b92a953cb7f727c42a15ac2ea59bf3cf9c39370d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Sep 2019 15:34:16 -0700 Subject: [PATCH 0989/2880] lib/test_kasan.c: add roundtrip tests In several places we need to be able to operate on pointers which have gone via a roundtrip: virt -> {phys,page} -> virt With KASAN_SW_TAGS, we can't preserve the tag for SLUB objects, and the {phys,page} -> virt conversion will use KASAN_TAG_KERNEL. This patch adds tests to ensure that this works as expected, without false positives which have recently been spotted [1,2] in testing. [1] https://lore.kernel.org/linux-arm-kernel/20190819114420.2535-1-walter-zh.wu@mediatek.com/ [2] https://lore.kernel.org/linux-arm-kernel/20190819132347.GB9927@lakrids.cambridge.arm.com/ [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20190821153927.28630-1-mark.rutland@arm.com Signed-off-by: Mark Rutland Reviewed-by: Andrey Konovalov Tested-by: Andrey Konovalov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index b63b367a94e8..49cc4d570a40 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -18,6 +18,9 @@ #include #include #include +#include + +#include /* * Note: test functions are marked noinline so that their names appear in @@ -337,6 +340,42 @@ static noinline void __init kmalloc_uaf2(void) kfree(ptr2); } +static noinline void __init kfree_via_page(void) +{ + char *ptr; + size_t size = 8; + struct page *page; + unsigned long offset; + + pr_info("invalid-free false positive (via page)\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + page = virt_to_page(ptr); + offset = offset_in_page(ptr); + kfree(page_address(page) + offset); +} + +static noinline void __init kfree_via_phys(void) +{ + char *ptr; + size_t size = 8; + phys_addr_t phys; + + pr_info("invalid-free false positive (via phys)\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + phys = virt_to_phys(ptr); + kfree(phys_to_virt(phys)); +} + static noinline void __init kmem_cache_oob(void) { char *p; @@ -737,6 +776,8 @@ static int __init kmalloc_tests_init(void) kmalloc_uaf(); kmalloc_uaf_memset(); kmalloc_uaf2(); + kfree_via_page(); + kfree_via_phys(); kmem_cache_oob(); memcg_accounted_kmem_cache(); kasan_stack_oob(); From dbf7684e29d171180346b25d2a124c2a0adf563e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 23 Sep 2019 15:34:19 -0700 Subject: [PATCH 0990/2880] mm/page_poison.c: fix a typo in a comment s/posioned/poisoned/ Link: http://lkml.kernel.org/r/20190721180908.6534-1-christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_poison.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_poison.c b/mm/page_poison.c index 21d4f97cb49b..34b9181ee5d1 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -101,7 +101,7 @@ static void unpoison_page(struct page *page) /* * Page poisoning when enabled poisons each and every page * that is freed to buddy. Thus no extra check is done to - * see if a page was posioned. + * see if a page was poisoned. */ check_poison_mem(addr, PAGE_SIZE); kunmap_atomic(addr); From 1f18b296699c83d858ca8ebb8b77dbc641d87cae Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 15:34:22 -0700 Subject: [PATCH 0991/2880] mm/rmap.c: remove set but not used variable 'cstart' Fixes gcc '-Wunused-but-set-variable' warning: mm/rmap.c: In function page_mkclean_one: mm/rmap.c:906:17: warning: variable cstart set but not used [-Wunused-but-set-variable] It is not used any more since commit cdb07bdea28e ("mm/rmap.c: remove redundant variable cend") Link: http://lkml.kernel.org/r/20190724141453.38536-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Reported-by: Hulk Robot Reviewed-by: Mike Kravetz Reviewed-by: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 003377e24232..31352bba197d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -903,10 +903,9 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { - unsigned long cstart; int ret = 0; - cstart = address = pvmw.address; + address = pvmw.address; if (pvmw.pte) { pte_t entry; pte_t *pte = pvmw.pte; @@ -933,7 +932,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); - cstart &= PMD_MASK; ret = 1; #else /* unexpected pmd-mapped page? */ From a50b854e073cd3335bbbada8dcff83a857297dd7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:25 -0700 Subject: [PATCH 0992/2880] mm: introduce page_size() Patch series "Make working with compound pages easier", v2. These three patches add three helpers and convert the appropriate places to use them. This patch (of 3): It's unnecessarily hard to find out the size of a potentially huge page. Replace 'PAGE_SIZE << compound_order(page)' with page_size(page). Link: http://lkml.kernel.org/r/20190721104612.19120-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Michal Hocko Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/flush.c | 3 +-- arch/arm64/mm/flush.c | 3 +-- arch/ia64/mm/init.c | 2 +- drivers/crypto/chelsio/chtls/chtls_io.c | 5 ++--- drivers/staging/android/ion/ion_system_heap.c | 4 ++-- drivers/target/tcm_fc/tfc_io.c | 3 +-- fs/io_uring.c | 2 +- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 6 ++++++ lib/iov_iter.c | 2 +- mm/kasan/common.c | 8 +++----- mm/nommu.c | 2 +- mm/page_vma_mapped.c | 3 +-- mm/rmap.c | 6 ++---- mm/slob.c | 2 +- mm/slub.c | 18 +++++++++--------- net/xdp/xsk.c | 2 +- 17 files changed, 35 insertions(+), 38 deletions(-) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 6ecbda87ee46..4c7ebe094a83 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -204,8 +204,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page) * coherent with the kernels mapping. */ if (!PageHighMem(page)) { - size_t page_size = PAGE_SIZE << compound_order(page); - __cpuc_flush_dcache_area(page_address(page), page_size); + __cpuc_flush_dcache_area(page_address(page), page_size(page)); } else { unsigned long i; if (cache_is_vipt_nonaliasing()) { diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index dc19300309d2..ac485163a4a7 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -56,8 +56,7 @@ void __sync_icache_dcache(pte_t pte) struct page *page = pte_page(pte); if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - sync_icache_aliases(page_address(page), - PAGE_SIZE << compound_order(page)); + sync_icache_aliases(page_address(page), page_size(page)); } EXPORT_SYMBOL_GPL(__sync_icache_dcache); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 678b98a09c85..bf9df2625bc8 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -64,7 +64,7 @@ __ia64_sync_icache_dcache (pte_t pte) if (test_bit(PG_arch_1, &page->flags)) return; /* i-cache is already coherent with d-cache */ - flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page))); + flush_icache_range(addr, addr + page_size(page)); set_bit(PG_arch_1, &page->flags); /* mark page as clean */ } diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c index c70cb5f272cf..0891ab829b1b 100644 --- a/drivers/crypto/chelsio/chtls/chtls_io.c +++ b/drivers/crypto/chelsio/chtls/chtls_io.c @@ -1078,7 +1078,7 @@ new_buf: bool merge; if (page) - pg_size <<= compound_order(page); + pg_size = page_size(page); if (off < pg_size && skb_can_coalesce(skb, i, page, off)) { merge = 1; @@ -1105,8 +1105,7 @@ new_buf: __GFP_NORETRY, order); if (page) - pg_size <<= - compound_order(page); + pg_size <<= order; } if (!page) { page = alloc_page(gfp); diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index aa8d8425be25..b83a1d16bd89 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c @@ -120,7 +120,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, if (!page) goto free_pages; list_add_tail(&page->lru, &pages); - size_remaining -= PAGE_SIZE << compound_order(page); + size_remaining -= page_size(page); max_order = compound_order(page); i++; } @@ -133,7 +133,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, sg = table->sgl; list_for_each_entry_safe(page, tmp_page, &pages, lru) { - sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0); + sg_set_page(sg, page, page_size(page), 0); sg = sg_next(sg); list_del(&page->lru); } diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c index a254792d882c..1354a157e9af 100644 --- a/drivers/target/tcm_fc/tfc_io.c +++ b/drivers/target/tcm_fc/tfc_io.c @@ -136,8 +136,7 @@ int ft_queue_data_in(struct se_cmd *se_cmd) page, off_in_page, tlen); fr_len(fp) += tlen; fp_skb(fp)->data_len += tlen; - fp_skb(fp)->truesize += - PAGE_SIZE << compound_order(page); + fp_skb(fp)->truesize += page_size(page); } else { BUG_ON(!page); from = kmap_atomic(page + (mem_off >> PAGE_SHIFT)); diff --git a/fs/io_uring.c b/fs/io_uring.c index 0dadbdbead0f..f83de4c6a826 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3319,7 +3319,7 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) } page = virt_to_head_page(ptr); - if (sz > (PAGE_SIZE << compound_order(page))) + if (sz > page_size(page)) return -EINVAL; pfn = virt_to_phys(ptr) >> PAGE_SHIFT; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edfca4278319..53fc34f930d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -454,7 +454,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, static inline struct hstate *page_hstate(struct page *page) { VM_BUG_ON_PAGE(!PageHuge(page), page); - return size_to_hstate(PAGE_SIZE << compound_order(page)); + return size_to_hstate(page_size(page)); } static inline unsigned hstate_index_to_shift(unsigned index) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e79b3df1582..d46d5585e2a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -805,6 +805,12 @@ static inline void set_compound_order(struct page *page, unsigned int order) page[1].compound_order = order; } +/* Returns the number of bytes in this potentially compound page. */ +static inline unsigned long page_size(struct page *page) +{ + return PAGE_SIZE << compound_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f1e0569b4539..639d5e7014c1 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -878,7 +878,7 @@ static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) head = compound_head(page); v += (page - head) << PAGE_SHIFT; - if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head)))) + if (likely(n <= v && v <= (page_size(head)))) return true; WARN_ON(1); return false; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6b6f1198c72b..307631d9c62b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -338,8 +338,7 @@ void kasan_poison_slab(struct page *page) for (i = 0; i < (1 << compound_order(page)); i++) page_kasan_tag_reset(page + i); - kasan_poison_shadow(page_address(page), - PAGE_SIZE << compound_order(page), + kasan_poison_shadow(page_address(page), page_size(page), KASAN_KMALLOC_REDZONE); } @@ -542,7 +541,7 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, page = virt_to_page(ptr); redzone_start = round_up((unsigned long)(ptr + size), KASAN_SHADOW_SCALE_SIZE); - redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); + redzone_end = (unsigned long)ptr + page_size(page); kasan_unpoison_shadow(ptr, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, @@ -578,8 +577,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); return; } - kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), - KASAN_FREE_PAGE); + kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); } else { __kasan_slab_free(page->slab_cache, ptr, ip, false); } diff --git a/mm/nommu.c b/mm/nommu.c index fed1b6e9c89b..99b7ec318824 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -108,7 +108,7 @@ unsigned int kobjsize(const void *objp) * The ksize() function is only guaranteed to work for pointers * returned by kmalloc(). So handle arbitrary pointers here. */ - return PAGE_SIZE << compound_order(page); + return page_size(page); } /** diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 11df03e71288..eff4b4520c8d 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -153,8 +153,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address, - PAGE_SIZE << compound_order(page)); + pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); if (!pvmw->pte) return false; diff --git a/mm/rmap.c b/mm/rmap.c index 31352bba197d..f401732b20e8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -898,8 +898,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - min(vma->vm_end, address + - (PAGE_SIZE << compound_order(page)))); + min(vma->vm_end, address + page_size(page))); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { @@ -1372,8 +1371,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, - min(vma->vm_end, address + - (PAGE_SIZE << compound_order(page)))); + min(vma->vm_end, address + page_size(page))); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted diff --git a/mm/slob.c b/mm/slob.c index 7f421d0ca9ab..cf377beab962 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -539,7 +539,7 @@ size_t __ksize(const void *block) sp = virt_to_page(block); if (unlikely(!PageSlab(sp))) - return PAGE_SIZE << compound_order(sp); + return page_size(sp); align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); m = (unsigned int *)(block - align); diff --git a/mm/slub.c b/mm/slub.c index 17fe1cac11fb..42c1b3af3c98 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -829,7 +829,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = PAGE_SIZE << compound_order(page); + length = page_size(page); end = start + length; remainder = length % s->size; if (!remainder) @@ -1074,13 +1074,14 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static void setup_page_debug(struct kmem_cache *s, void *addr, int order) +static +void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) { if (!(s->flags & SLAB_POISON)) return; metadata_access_enable(); - memset(addr, POISON_INUSE, PAGE_SIZE << order); + memset(addr, POISON_INUSE, page_size(page)); metadata_access_disable(); } @@ -1340,8 +1341,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} -static inline void setup_page_debug(struct kmem_cache *s, - void *addr, int order) {} +static inline +void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {} static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } @@ -1639,7 +1640,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; void *start, *p, *next; - int idx, order; + int idx; bool shuffle; flags &= gfp_allowed_mask; @@ -1673,7 +1674,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->objects = oo_objects(oo); - order = compound_order(page); page->slab_cache = s; __SetPageSlab(page); if (page_is_pfmemalloc(page)) @@ -1683,7 +1683,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); - setup_page_debug(s, start, order); + setup_page_debug(s, page, start); shuffle = shuffle_freelist(s, page); @@ -3932,7 +3932,7 @@ size_t __ksize(const void *object) if (unlikely(!PageSlab(page))) { WARN_ON(!PageCompound(page)); - return PAGE_SIZE << compound_order(page); + return page_size(page); } return slab_ksize(page->slab_cache); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c2f1af3b6a7c..fa8fbb8fa3c8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -977,7 +977,7 @@ static int xsk_mmap(struct file *file, struct socket *sock, /* Matches the smp_wmb() in xsk_init_queue */ smp_rmb(); qpg = virt_to_head_page(q->ring); - if (size > (PAGE_SIZE << compound_order(qpg))) + if (size > page_size(qpg)) return -EINVAL; pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; From 94ad9338109fe9d0b8a4a16828719dd6dcaee4c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:28 -0700 Subject: [PATCH 0993/2880] mm: introduce page_shift() Replace PAGE_SHIFT + compound_order(page) with the new page_shift() function. Minor improvements in readability. [akpm@linux-foundation.org: fix build in tce_page_is_contained()] Link: http://lkml.kernel.org/r/201907241853.yNQTrJWd%25lkp@intel.com Link: http://lkml.kernel.org/r/20190721104612.19120-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Acked-by: Kirill A. Shutemov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/iommu_api.c | 7 ++----- drivers/vfio/vfio_iommu_spapr_tce.c | 8 ++++---- include/linux/mm.h | 6 ++++++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index b056cae3388b..56cc84520577 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -129,11 +129,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, * Allow to use larger than 64k IOMMU pages. Only do that * if we are backed by hugetlb. */ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { - struct page *head = compound_head(page); - - pageshift = compound_order(head) + PAGE_SHIFT; - } + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) + pageshift = page_shift(compound_head(page)); mem->pageshift = min(mem->pageshift, pageshift); /* * We don't need struct page reference any more, switch diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 3b18fa4d090a..26cef65b41e7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -176,13 +176,13 @@ put_exit: } static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, - unsigned int page_shift) + unsigned int it_page_shift) { struct page *page; unsigned long size = 0; - if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) - return size == (1UL << page_shift); + if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size)) + return size == (1UL << it_page_shift); page = pfn_to_page(hpa >> PAGE_SHIFT); /* @@ -190,7 +190,7 @@ static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, * a page we just found. Otherwise the hardware can get access to * a bigger memory chunk that it should. */ - return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; + return page_shift(compound_head(page)) >= it_page_shift; } static inline bool tce_groups_attached(struct tce_container *container) diff --git a/include/linux/mm.h b/include/linux/mm.h index d46d5585e2a2..9238548bdec5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -811,6 +811,12 @@ static inline unsigned long page_size(struct page *page) return PAGE_SIZE << compound_order(page); } +/* Returns the number of bits needed for the number of bytes in a page */ +static inline unsigned int page_shift(struct page *page) +{ + return PAGE_SHIFT + compound_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU From d8c6546b1aea843fbeb4d54a1202f1adda6504be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:30 -0700 Subject: [PATCH 0994/2880] mm: introduce compound_nr() Replace 1 << compound_order(page) with compound_nr(page). Minor improvements in readability. Link: http://lkml.kernel.org/r/20190721104612.19120-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Acked-by: Kirill A. Shutemov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/flush.c | 4 ++-- arch/powerpc/mm/hugetlbpage.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 6 ++++++ mm/compaction.c | 2 +- mm/filemap.c | 2 +- mm/gup.c | 2 +- mm/hugetlb_cgroup.c | 2 +- mm/kasan/common.c | 2 +- mm/memcontrol.c | 4 ++-- mm/memory_hotplug.c | 4 ++-- mm/migrate.c | 2 +- mm/page_alloc.c | 2 +- mm/rmap.c | 3 +-- mm/shmem.c | 8 ++++---- mm/swap_state.c | 2 +- mm/util.c | 2 +- mm/vmscan.c | 4 ++-- 18 files changed, 30 insertions(+), 25 deletions(-) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 4c7ebe094a83..6d89db7895d1 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -208,13 +208,13 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page) } else { unsigned long i; if (cache_is_vipt_nonaliasing()) { - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { void *addr = kmap_atomic(page + i); __cpuc_flush_dcache_area(addr, PAGE_SIZE); kunmap_atomic(addr); } } else { - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { void *addr = kmap_high_get(page + i); if (addr) { __cpuc_flush_dcache_area(addr, PAGE_SIZE); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a8953f108808..73d4873fc7f8 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -667,7 +667,7 @@ void flush_dcache_icache_hugepage(struct page *page) BUG_ON(!PageCompound(page)); - for (i = 0; i < (1UL << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { if (!PageHighMem(page)) { __flush_dcache_icache(page_address(page+i)); } else { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bf43d1d60059..ea1630465474 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -461,7 +461,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked) { - int i, nr = compound ? 1 << compound_order(page) : 1; + int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 9238548bdec5..69b7314c8d24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -805,6 +805,12 @@ static inline void set_compound_order(struct page *page, unsigned int order) page[1].compound_order = order; } +/* Returns the number of pages in this potentially compound page. */ +static inline unsigned long compound_nr(struct page *page) +{ + return 1UL << compound_order(page); +} + /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { diff --git a/mm/compaction.c b/mm/compaction.c index 952dc2fb24e5..777c088e9113 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -969,7 +969,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * is safe to read and it's 0 for tail pages. */ if (unlikely(PageCompound(page))) { - low_pfn += (1UL << compound_order(page)) - 1; + low_pfn += compound_nr(page) - 1; goto isolate_fail; } } diff --git a/mm/filemap.c b/mm/filemap.c index 40667c2f3383..5f30aedd7363 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -126,7 +126,7 @@ static void page_cache_delete(struct address_space *mapping, /* hugetlb pages are represented by a single entry in the xarray */ if (!PageHuge(page)) { xas_set_order(&xas, page->index, compound_order(page)); - nr = 1U << compound_order(page); + nr = compound_nr(page); } VM_BUG_ON_PAGE(!PageLocked(page), page); diff --git a/mm/gup.c b/mm/gup.c index 98f13ab37bac..84a36d80dd2e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1460,7 +1460,7 @@ check_again: * gup may start from a tail page. Advance step by the left * part. */ - step = (1 << compound_order(head)) - (pages[i] - head); + step = compound_nr(head) - (pages[i] - head); /* * If we get a page from the CMA zone, since we are going to * be pinning these entries, we might as well move them out diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 68c2f2f3c05b..f1930fa0b445 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -139,7 +139,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, if (!page_hcg || page_hcg != h_cg) goto out; - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); if (!parent) { parent = root_h_cgroup; /* root has no limit */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 307631d9c62b..6814d6d6a023 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -336,7 +336,7 @@ void kasan_poison_slab(struct page *page) { unsigned long i; - for (i = 0; i < (1 << compound_order(page)); i++) + for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); kasan_poison_shadow(page_address(page), page_size(page), KASAN_KMALLOC_REDZONE); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3c15bb07cce..6c6032c03d1d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6511,7 +6511,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) unsigned int nr_pages = 1; if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); + nr_pages = compound_nr(page); ug->nr_huge += nr_pages; } if (PageAnon(page)) @@ -6523,7 +6523,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } ug->pgpgout++; } else { - ug->nr_kmem += 1 << compound_order(page); + ug->nr_kmem += compound_nr(page); __ClearPageKmemcg(page); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c73f09913165..5f2c83ce9fde 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1309,7 +1309,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) head = compound_head(page); if (page_huge_active(head)) return pfn; - skip = (1 << compound_order(head)) - (page - head); + skip = compound_nr(head) - (page - head); pfn += skip - 1; } return 0; @@ -1347,7 +1347,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { struct page *head = compound_head(page); - pfn = page_to_pfn(head) + (1<i_pages, index, compound_order(page)); unsigned long i = 0; - unsigned long nr = 1UL << compound_order(page); + unsigned long nr = compound_nr(page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -1884,7 +1884,7 @@ alloc_nohuge: lru_cache_add_anon(page); spin_lock_irq(&info->lock); - info->alloced += 1 << compound_order(page); + info->alloced += compound_nr(page); inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); @@ -1925,7 +1925,7 @@ clear: struct page *head = compound_head(page); int i; - for (i = 0; i < (1 << compound_order(head)); i++) { + for (i = 0; i < compound_nr(head); i++) { clear_highpage(head + i); flush_dcache_page(head + i); } @@ -1952,7 +1952,7 @@ clear: * Error recovery. */ unacct: - shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); + shmem_inode_unacct_blocks(inode, compound_nr(page)); if (PageTransHuge(page)) { unlock_page(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 8368621a0fc7..f844af5f09ba 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -116,7 +116,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); - unsigned long i, nr = 1UL << compound_order(page); + unsigned long i, nr = compound_nr(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); diff --git a/mm/util.c b/mm/util.c index e6351a80f248..bab284d69c8c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -521,7 +521,7 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a6c5d0b28321..8e03427cb64f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1149,7 +1149,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, VM_BUG_ON_PAGE(PageActive(page), page); - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); /* Account the number of base pages even though THP */ sc->nr_scanned += nr_pages; @@ -1705,7 +1705,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON_PAGE(!PageLRU(page), page); - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); total_scan += nr_pages; if (page_zonenum(page) > sc->reclaim_idx) { From e7a1aaf28770c1f7a06c50cbd02ca0f27ce61ec5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 23 Sep 2019 15:34:33 -0700 Subject: [PATCH 0995/2880] mm: replace list_move_tail() with add_page_to_lru_list_tail() This is a cleanup patch that replaces two historical uses of list_move_tail() with relatively recent add_page_to_lru_list_tail(). Link: http://lkml.kernel.org/r/20190716212436.7137-1-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Vlastimil Babka Cc: Michal Hocko Cc: Jason Gunthorpe Cc: Dan Williams Cc: Matthew Wilcox Cc: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index ae300397dfda..0226c5346560 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -515,7 +515,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, lru + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); if (PageWriteback(page) || PageDirty(page)) { /* @@ -523,13 +522,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, * It can make readahead confusing. But race window * is _really_ small and it's non-critical problem. */ + add_page_to_lru_list(page, lruvec, lru); SetPageReclaim(page); } else { /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. */ - list_move_tail(&page->lru, &lruvec->lists[lru]); + add_page_to_lru_list_tail(page, lruvec, lru); __count_vm_event(PGROTATED); } @@ -844,17 +844,15 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, get_page(page_tail); list_add_tail(&page_tail->lru, list); } else { - struct list_head *list_head; /* * Head page has not yet been counted, as an hpage, * so we must account for each subpage individually. * - * Use the standard add function to put page_tail on the list, - * but then correct its position so they all end up in order. + * Put page_tail on the list at the correct position + * so they all end up in order. */ - add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); - list_head = page_tail->lru.prev; - list_move_tail(&page_tail->lru, list_head); + add_page_to_lru_list_tail(page_tail, lruvec, + page_lru(page_tail)); } if (!PageUnevictable(page)) From 7e2f2a0cd17cfc42acb4b6a293d5cb6c7eda9862 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:34:36 -0700 Subject: [PATCH 0996/2880] mm, page_owner: record page owner for each subpage Patch series "debug_pagealloc improvements through page_owner", v2. The debug_pagealloc functionality serves a similar purpose on the page allocator level that slub_debug does on the kmalloc level, which is to detect bad users. One notable feature that slub_debug has is storing stack traces of who last allocated and freed the object. On page level we track allocations via page_owner, but that info is discarded when freeing, and we don't track freeing at all. This series improves those aspects. With both debug_pagealloc and page_owner enabled, we can then get bug reports such as the example in Patch 4. SLUB debug tracking additionally stores cpu, pid and timestamp. This could be added later, if deemed useful enough to justify the additional page_ext structure size. This patch (of 3): Currently, page owner info is only recorded for the first page of a high-order allocation, and copied to tail pages in the event of a split page. With the plan to keep previous owner info after freeing the page, it would be benefical to record page owner for each subpage upon allocation. This increases the overhead for high orders, but that should be acceptable for a debugging option. The order stored for each subpage is the order of the whole allocation. This makes it possible to calculate the "head" pfn and to recognize "tail" pages (quoted because not all high-order allocations are compound pages with true head and tail pages). When reading the page_owner debugfs file, keep skipping the "tail" pages so that stats gathered by existing scripts don't get inflated. Link: http://lkml.kernel.org/r/20190820131828.22684-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index addcbb2ae4e4..813fcb70547b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -154,18 +154,23 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } -static inline void __set_page_owner_handle(struct page_ext *page_ext, - depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) +static inline void __set_page_owner_handle(struct page *page, + struct page_ext *page_ext, depot_stack_handle_t handle, + unsigned int order, gfp_t gfp_mask) { struct page_owner *page_owner; + int i; - page_owner = get_page_owner(page_ext); - page_owner->handle = handle; - page_owner->order = order; - page_owner->gfp_mask = gfp_mask; - page_owner->last_migrate_reason = -1; + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + page_owner->handle = handle; + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = -1; + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); - __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + page_ext = lookup_page_ext(page + i); + } } noinline void __set_page_owner(struct page *page, unsigned int order, @@ -178,7 +183,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order, return; handle = save_stack(gfp_mask); - __set_page_owner_handle(page_ext, handle, order, gfp_mask); + __set_page_owner_handle(page, page_ext, handle, order, gfp_mask); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -204,8 +209,11 @@ void __split_page_owner(struct page *page, unsigned int order) page_owner = get_page_owner(page_ext); page_owner->order = 0; - for (i = 1; i < (1 << order); i++) - __copy_page_owner(page, page + i); + for (i = 1; i < (1 << order); i++) { + page_ext = lookup_page_ext(page + i); + page_owner = get_page_owner(page_ext); + page_owner->order = 0; + } } void __copy_page_owner(struct page *oldpage, struct page *newpage) @@ -483,6 +491,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) page_owner = get_page_owner(page_ext); + /* + * Don't print "tail" pages of high-order allocations as that + * would inflate the stats. + */ + if (!IS_ALIGNED(pfn, 1 << page_owner->order)) + continue; + /* * Access to page_ext->handle isn't synchronous so we should * be careful to access it. @@ -562,7 +577,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; /* Found early allocated page */ - __set_page_owner_handle(page_ext, early_handle, 0, 0); + __set_page_owner_handle(page, page_ext, early_handle, + 0, 0); count++; } cond_resched(); From 37389167a281f3ccb6bc958c32b2e088c7269fe0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:34:39 -0700 Subject: [PATCH 0997/2880] mm, page_owner: keep owner info when freeing the page For debugging purposes it might be useful to keep the owner info even after page has been freed, and include it in e.g. dump_page() when detecting a bad page state. For that, change the PAGE_EXT_OWNER flag meaning to "page owner info has been set at least once" and add new PAGE_EXT_OWNER_ACTIVE for tracking whether page is supposed to be currently tracked allocated or free. Adjust dump_page() accordingly, distinguishing free and allocated pages. In the page_owner debugfs file, keep printing only allocated pages so that existing scripts are not confused, and also because free pages are irrelevant for the memory statistics or leak detection that's the typical use case of the file, anyway. Link: http://lkml.kernel.org/r/20190820131828.22684-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 1 + mm/page_owner.c | 34 ++++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 09592951725c..682fd465df06 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -18,6 +18,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, + PAGE_EXT_OWNER_ACTIVE, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, diff --git a/mm/page_owner.c b/mm/page_owner.c index 813fcb70547b..4a48e018dbdf 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -111,7 +111,7 @@ void __reset_page_owner(struct page *page, unsigned int order) page_ext = lookup_page_ext(page + i); if (unlikely(!page_ext)) continue; - __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); + __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); } } @@ -168,6 +168,7 @@ static inline void __set_page_owner_handle(struct page *page, page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + __set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); page_ext = lookup_page_ext(page + i); } @@ -243,6 +244,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) * the new page, which will be freed. */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); + __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -302,7 +304,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (unlikely(!page_ext)) continue; - if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) continue; page_owner = get_page_owner(page_ext); @@ -413,21 +415,26 @@ void __dump_page_owner(struct page *page) mt = gfpflags_to_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { - pr_alert("page_owner info is not active (free page?)\n"); + pr_alert("page_owner info is not present (never set?)\n"); return; } + if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) + pr_alert("page_owner tracks the page as allocated\n"); + else + pr_alert("page_owner tracks the page as freed\n"); + + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); + handle = READ_ONCE(page_owner->handle); if (!handle) { - pr_alert("page_owner info is not active (free page?)\n"); - return; + pr_alert("page_owner allocation stack trace missing\n"); + } else { + nr_entries = stack_depot_fetch(handle, &entries); + stack_trace_print(entries, nr_entries, 0); } - nr_entries = stack_depot_fetch(handle, &entries); - pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); - stack_trace_print(entries, nr_entries, 0); - if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); @@ -489,6 +496,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; + /* + * Although we do have the info about past allocation of free + * pages, it's not relevant for current memory usage. + */ + if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) + continue; + page_owner = get_page_owner(page_ext); /* From 8974558f49a6a41b4a74db672e13bca616eff6d8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:34:42 -0700 Subject: [PATCH 0998/2880] mm, page_owner, debug_pagealloc: save and dump freeing stack trace The debug_pagealloc functionality is useful to catch buggy page allocator users that cause e.g. use after free or double free. When page inconsistency is detected, debugging is often simpler by knowing the call stack of process that last allocated and freed the page. When page_owner is also enabled, we record the allocation stack trace, but not freeing. This patch therefore adds recording of freeing process stack trace to page owner info, if both page_owner and debug_pagealloc are configured and enabled. With only page_owner enabled, this info is not useful for the memory leak debugging use case. dump_page() is adjusted to print the info. An example result of calling __free_pages() twice may look like this (note the page last free stack trace): BUG: Bad page state in process bash pfn:13d8f8 page:ffffc31984f63e00 refcount:-1 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0x1affff800000000() raw: 01affff800000000 dead000000000100 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 ffffffffffffffff 0000000000000000 page dumped because: nonzero _refcount page_owner tracks the page as freed page last allocated via order 0, migratetype Unmovable, gfp_mask 0xcc0(GFP_KERNEL) prep_new_page+0x143/0x150 get_page_from_freelist+0x289/0x380 __alloc_pages_nodemask+0x13c/0x2d0 khugepaged+0x6e/0xc10 kthread+0xf9/0x130 ret_from_fork+0x3a/0x50 page last free stack trace: free_pcp_prepare+0x134/0x1e0 free_unref_page+0x18/0x90 khugepaged+0x7b/0xc10 kthread+0xf9/0x130 ret_from_fork+0x3a/0x50 Modules linked in: CPU: 3 PID: 271 Comm: bash Not tainted 5.3.0-rc4-2.g07a1a73-default+ #57 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x85/0xc0 bad_page.cold+0xba/0xbf rmqueue_pcplist.isra.0+0x6c5/0x6d0 rmqueue+0x2d/0x810 get_page_from_freelist+0x191/0x380 __alloc_pages_nodemask+0x13c/0x2d0 __get_free_pages+0xd/0x30 __pud_alloc+0x2c/0x110 copy_page_range+0x4f9/0x630 dup_mmap+0x362/0x480 dup_mm+0x68/0x110 copy_process+0x19e1/0x1b40 _do_fork+0x73/0x310 __x64_sys_clone+0x75/0x80 do_syscall_64+0x6e/0x1e0 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f10af854a10 ... Link: http://lkml.kernel.org/r/20190820131828.22684-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 2 + mm/Kconfig.debug | 4 +- mm/page_owner.c | 53 ++++++++++++++----- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d3814789304f..8a54ed862049 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -809,6 +809,8 @@ enables the feature at boot time. By default, it is disabled and the system will work mostly the same as a kernel built without CONFIG_DEBUG_PAGEALLOC. + Note: to get most of debug_pagealloc error reports, it's + useful to also enable the page_owner functionality. on: enable the feature debugpat [X86] Enable PAT debugging diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 82b6a20898bd..327b3ebf23bf 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -21,7 +21,9 @@ config DEBUG_PAGEALLOC Also, the state of page tracking structures is checked more often as pages are being allocated and freed, as unexpected state changes often happen for same reasons as memory corruption (e.g. double free, - use-after-free). + use-after-free). The error reports for these checks can be augmented + with stack traces of last allocation and freeing of the page, when + PAGE_OWNER is also selected and enabled on boot. For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify diff --git a/mm/page_owner.c b/mm/page_owner.c index 4a48e018dbdf..dee931184788 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -24,6 +24,9 @@ struct page_owner { short last_migrate_reason; gfp_t gfp_mask; depot_stack_handle_t handle; +#ifdef CONFIG_DEBUG_PAGEALLOC + depot_stack_handle_t free_handle; +#endif }; static bool page_owner_disabled = true; @@ -102,19 +105,6 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext) return (void *)page_ext + page_owner_ops.offset; } -void __reset_page_owner(struct page *page, unsigned int order) -{ - int i; - struct page_ext *page_ext; - - for (i = 0; i < (1 << order); i++) { - page_ext = lookup_page_ext(page + i); - if (unlikely(!page_ext)) - continue; - __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); - } -} - static inline bool check_recursive_alloc(unsigned long *entries, unsigned int nr_entries, unsigned long ip) @@ -154,6 +144,32 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } +void __reset_page_owner(struct page *page, unsigned int order) +{ + int i; + struct page_ext *page_ext; +#ifdef CONFIG_DEBUG_PAGEALLOC + depot_stack_handle_t handle = 0; + struct page_owner *page_owner; + + if (debug_pagealloc_enabled()) + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); +#endif + + for (i = 0; i < (1 << order); i++) { + page_ext = lookup_page_ext(page + i); + if (unlikely(!page_ext)) + continue; + __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); +#ifdef CONFIG_DEBUG_PAGEALLOC + if (debug_pagealloc_enabled()) { + page_owner = get_page_owner(page_ext); + page_owner->free_handle = handle; + } +#endif + } +} + static inline void __set_page_owner_handle(struct page *page, struct page_ext *page_ext, depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) @@ -435,6 +451,17 @@ void __dump_page_owner(struct page *page) stack_trace_print(entries, nr_entries, 0); } +#ifdef CONFIG_DEBUG_PAGEALLOC + handle = READ_ONCE(page_owner->free_handle); + if (!handle) { + pr_alert("page_owner free stack trace missing\n"); + } else { + nr_entries = stack_depot_fetch(handle, &entries); + pr_alert("page last free stack trace:\n"); + stack_trace_print(entries, nr_entries, 0); + } +#endif + if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); From c3aab9a0bd91b696a852169479b7db1ece6cbf8c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 23 Sep 2019 15:34:45 -0700 Subject: [PATCH 0999/2880] mm/filemap.c: don't initiate writeback if mapping has no dirty pages Functions like filemap_write_and_wait_range() should do nothing if inode has no dirty pages or pages currently under writeback. But they anyway construct struct writeback_control and this does some atomic operations if CONFIG_CGROUP_WRITEBACK=y - on fast path it locks inode->i_lock and updates state of writeback ownership, on slow path might be more work. Current this path is safely avoided only when inode mapping has no pages. For example generic_file_read_iter() calls filemap_write_and_wait_range() at each O_DIRECT read - pretty hot path. This patch skips starting new writeback if mapping has no dirty tags set. If writeback is already in progress filemap_write_and_wait_range() will wait for it. Link: http://lkml.kernel.org/r/156378816804.1087.8607636317907921438.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Jens Axboe Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 5f30aedd7363..82f3b9a3a940 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -408,7 +408,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_cap_writeback_dirty(mapping)) + if (!mapping_cap_writeback_dirty(mapping) || + !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(&wbc, mapping->host); From 875d91b11a201276ac3a9ab79f8b0fa3dc4ee8fd Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 23 Sep 2019 15:34:48 -0700 Subject: [PATCH 1000/2880] mm/filemap.c: rewrite mapping_needs_writeback in less fancy manner This actually checks that writeback is needed or in progress. Link: http://lkml.kernel.org/r/156378817069.1087.1302816672037672488.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Cc: Tejun Heo Cc: Jens Axboe Cc: Johannes Weiner Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 82f3b9a3a940..d5462d706f76 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -618,10 +618,13 @@ int filemap_fdatawait_keep_errors(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); +/* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { - return (!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional); + if (dax_mapping(mapping)) + return mapping->nrexceptional; + + return mapping->nrpages; } int filemap_write_and_wait(struct address_space *mapping) From 4101196b19d7f905dca5dcf46cd35eb758cf06c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:52 -0700 Subject: [PATCH 1001/2880] mm: page cache: store only head pages in i_pages Transparent Huge Pages are currently stored in i_pages as pointers to consecutive subpages. This patch changes that to storing consecutive pointers to the head page in preparation for storing huge pages more efficiently in i_pages. Large parts of this are "inspired" by Kirill's patch https://lore.kernel.org/lkml/20170126115819.58875-2-kirill.shutemov@linux.intel.com/ Kirill and Huang Ying contributed several fixes. [willy@infradead.org: use compound_nr, squish uninit-var warning] Link: http://lkml.kernel.org/r/20190731210400.7419-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jan Kara Reviewed-by: Kirill Shutemov Reviewed-by: Song Liu Tested-by: Song Liu Tested-by: William Kucharski Reviewed-by: William Kucharski Tested-by: Qian Cai Tested-by: Mikhail Gavrilov Cc: Hugh Dickins Cc: Chris Wilson Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 10 +++ mm/filemap.c | 147 ++++++++++++++++------------------------ mm/huge_memory.c | 22 +++++- mm/khugepaged.c | 4 +- mm/memfd.c | 2 + mm/migrate.c | 2 +- mm/shmem.c | 2 +- mm/swap_state.c | 4 +- 8 files changed, 96 insertions(+), 97 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c7552459a15f..37a4d9e32cd3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -333,6 +333,16 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } +static inline struct page *find_subpage(struct page *page, pgoff_t offset) +{ + if (PageHuge(page)) + return page; + + VM_BUG_ON_PAGE(PageTail(page), page); + + return page + (offset & (compound_nr(page) - 1)); +} + struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, diff --git a/mm/filemap.c b/mm/filemap.c index d5462d706f76..533f271d6839 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -281,11 +281,11 @@ EXPORT_SYMBOL(delete_from_page_cache); * @pvec: pagevec with pages to delete * * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index. + * from the mapping. The function expects @pvec to be sorted by page index + * and is optimised for it to be dense. * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the - * mapping as well. + * @pvec. * * The function expects the i_pages lock to be held. */ @@ -294,40 +294,43 @@ static void page_cache_delete_batch(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; - int i = 0, tail_pages = 0; + int i = 0; struct page *page; mapping_set_update(&xas, mapping); xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec) && !tail_pages) + if (i >= pagevec_count(pvec)) break; + + /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(page)) continue; - if (!tail_pages) { - /* - * Some page got inserted in our range? Skip it. We - * have our pages locked so they are protected from - * being removed. - */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > - pvec->pages[i]->index, page); - continue; - } - WARN_ON_ONCE(!PageLocked(page)); - if (PageTransHuge(page) && !PageHuge(page)) - tail_pages = HPAGE_PMD_NR - 1; - page->mapping = NULL; - /* - * Leave page->index set: truncation lookup relies - * upon it - */ - i++; - } else { - VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages - != pvec->pages[i]->index, page); - tail_pages--; + /* + * A page got inserted in our range? Skip it. We have our + * pages locked so they are protected from being removed. + * If we see a page whose index is higher than ours, it + * means our page has been removed, which shouldn't be + * possible because we're holding the PageLock. + */ + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, + page); + continue; } + + WARN_ON_ONCE(!PageLocked(page)); + + if (page->index == xas.xa_index) + page->mapping = NULL; + /* Leave page->index set: truncation lookup relies on it */ + + /* + * Move to the next page in the vector if this is a regular + * page or the index is of the last sub-page of this compound + * page. + */ + if (page->index + compound_nr(page) - 1 == xas.xa_index) + i++; xas_store(&xas, NULL); total_pages++; } @@ -1520,7 +1523,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { XA_STATE(xas, &mapping->i_pages, offset); - struct page *head, *page; + struct page *page; rcu_read_lock(); repeat: @@ -1535,25 +1538,19 @@ repeat: if (!page || xa_is_value(page)) goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } - /* - * Has the page moved? + * Has the page moved or been split? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. */ if (unlikely(page != xas_reload(&xas))) { - put_page(head); + put_page(page); goto repeat; } + page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1735,7 +1732,6 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, ULONG_MAX) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1746,17 +1742,13 @@ unsigned find_get_entries(struct address_space *mapping, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -1765,7 +1757,7 @@ export: break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1807,33 +1799,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); xas_for_each(&xas, page, end) { - struct page *head; if (xas_retry(&xas, page)) continue; /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1878,7 +1864,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1888,24 +1873,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (xa_is_value(page)) break; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1941,7 +1921,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); xas_for_each_marked(&xas, page, end, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1952,26 +1931,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *index = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -2652,7 +2626,7 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct page *page; rcu_read_lock(); xas_for_each(&xas, page, end_pgoff) { @@ -2661,24 +2635,19 @@ void filemap_map_pages(struct vm_fault *vmf, if (xa_is_value(page)) goto next; - head = compound_head(page); - /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(head)) + if (PageLocked(page)) goto next; - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto next; - /* The page was split under us? */ - if (compound_head(page) != head) - goto skip; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto skip; + page = find_subpage(page, xas.xa_index); if (!PageUptodate(page) || PageReadahead(page) || diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de1f15969e27..483b07b2d6ae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2497,6 +2497,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct page *head = compound_head(page); pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; + struct address_space *swap_cache = NULL; + unsigned long offset = 0; int i; lruvec = mem_cgroup_page_lruvec(head, pgdat); @@ -2504,6 +2506,14 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); + if (PageAnon(head) && PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + offset = swp_offset(entry); + swap_cache = swap_address_space(entry); + xa_lock(&swap_cache->i_pages); + } + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ @@ -2513,6 +2523,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); + } else if (!PageAnon(page)) { + __xa_store(&head->mapping->i_pages, head[i].index, + head + i, 0); + } else if (swap_cache) { + __xa_store(&swap_cache->i_pages, offset + i, + head + i, 0); } } @@ -2523,10 +2539,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ - if (PageSwapCache(head)) + if (PageSwapCache(head)) { page_ref_add(head, 2); - else + xa_unlock(&swap_cache->i_pages); + } else { page_ref_inc(head); + } } else { /* Additional pin to page cache */ page_ref_add(head, 2); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ccede2425c3f..04a54ff5a8ac 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1378,7 +1378,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); nr_none++; continue; } @@ -1454,7 +1454,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); continue; out_unlock: unlock_page(page); diff --git a/mm/memfd.c b/mm/memfd.c index 650e65a46b9c..2647c898990c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas) xas_for_each(xas, page, ULONG_MAX) { if (xa_is_value(page)) continue; + page = find_subpage(page, xas->xa_index); if (page_count(page) - page_mapcount(page) > 1) xas_set_mark(xas, MEMFD_TAG_PINNED); @@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) bool clear = true; if (xa_is_value(page)) continue; + page = find_subpage(page, xas.xa_index); if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags diff --git a/mm/migrate.c b/mm/migrate.c index aa72b49e0209..374ef2fcb722 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -460,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping, for (i = 1; i < HPAGE_PMD_NR; i++) { xas_next(&xas); - xas_store(&xas, newpage + i); + xas_store(&xas, newpage); } } diff --git a/mm/shmem.c b/mm/shmem.c index 15d26c86e5ef..57a6aedf6649 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -631,7 +631,7 @@ static int shmem_add_to_page_cache(struct page *page, if (xas_error(&xas)) goto unlock; next: - xas_store(&xas, page + i); + xas_store(&xas, page); if (++i < nr) { xas_next(&xas); goto next; diff --git a/mm/swap_state.c b/mm/swap_state.c index f844af5f09ba..8e7ce9a9bc5e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -133,7 +133,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); set_page_private(page + i, entry.val + i); - xas_store(&xas, page + i); + xas_store(&xas, page); xas_next(&xas); } address_space->nrpages += nr; @@ -168,7 +168,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, NULL); - VM_BUG_ON_PAGE(entry != page + i, entry); + VM_BUG_ON_PAGE(entry != page, entry); set_page_private(page + i, 0); xas_next(&xas); } From 0e4b01df865935007bd712cbc8e7299005b28894 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Mon, 23 Sep 2019 15:34:55 -0700 Subject: [PATCH 1002/2880] mm, memcg: throttle allocators when failing reclaim over memory.high We're trying to use memory.high to limit workloads, but have found that containment can frequently fail completely and cause OOM situations outside of the cgroup. This happens especially with swap space -- either when none is configured, or swap is full. These failures often also don't have enough warning to allow one to react, whether for a human or for a daemon monitoring PSI. Here is output from a simple program showing how long it takes in usec (column 2) to allocate a megabyte of anonymous memory (column 1) when a cgroup is already beyond its memory high setting, and no swap is available: [root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \ > --wait -t timeout 300 /root/mdf [...] 95 1035 96 1038 97 1000 98 1036 99 1048 100 1590 101 1968 102 1776 103 1863 104 1757 105 1921 106 1893 107 1760 108 1748 109 1843 110 1716 111 1924 112 1776 113 1831 114 1766 115 1836 116 1588 117 1912 118 1802 119 1857 120 1731 [...] [System OOM in 2-3 seconds] The delay does go up extremely marginally past the 100MB memory.high threshold, as now we spend time scanning before returning to usermode, but it's nowhere near enough to contain growth. It also doesn't get worse the more pages you have, since it only considers nr_pages. The current situation goes against both the expectations of users of memory.high, and our intentions as cgroup v2 developers. In cgroup-v2.txt, we claim that we will throttle and only under "extreme conditions" will memory.high protection be breached. Likewise, cgroup v2 users generally also expect that memory.high should throttle workloads as they exceed their high threshold. However, as seen above, this isn't always how it works in practice -- even on banal setups like those with no swap, or where swap has become exhausted, we can end up with memory.high being breached and us having no weapons left in our arsenal to combat runaway growth with, since reclaim is futile. It's also hard for system monitoring software or users to tell how bad the situation is, as "high" events for the memcg may in some cases be benign, and in others be catastrophic. The current status quo is that we fail containment in a way that doesn't provide any advance warning that things are about to go horribly wrong (for example, we are about to invoke the kernel OOM killer). This patch introduces explicit throttling when reclaim is failing to keep memcg size contained at the memory.high setting. It does so by applying an exponential delay curve derived from the memcg's overage compared to memory.high. In the normal case where the memcg is either below or only marginally over its memory.high setting, no throttling will be performed. This composes well with system health monitoring and remediation, as these allocator delays are factored into PSI's memory pressure calculations. This both creates a mechanism system administrators or applications consuming the PSI interface to trivially see that the memcg in question is struggling and use that to make more reasonable decisions, and permits them enough time to act. Either of these can act with significantly more nuance than that we can provide using the system OOM killer. This is a similar idea to memory.oom_control in cgroup v1 which would put the cgroup to sleep if the threshold was violated, but it's also significantly improved as it results in visible memory pressure, and also doesn't schedule indefinitely, which previously made tracing and other introspection difficult (ie. it's clamped at 2*HZ per allocation through MEMCG_MAX_HIGH_DELAY_JIFFIES). Contrast the previous results with a kernel with this patch: [root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \ > --wait -t timeout 300 /root/mdf [...] 95 1002 96 1000 97 1002 98 1003 99 1000 100 1043 101 84724 102 330628 103 610511 104 1016265 105 1503969 106 2391692 107 2872061 108 3248003 109 4791904 110 5759832 111 6912509 112 8127818 113 9472203 114 12287622 115 12480079 116 14144008 117 15808029 118 16384500 119 16383242 120 16384979 [...] As you can see, in the normal case, memory allocation takes around 1000 usec. However, as we exceed our memory.high, things start to increase exponentially, but fairly leniently at first. Our first megabyte over memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the next is almost an entire second. This gets worse until we reach our eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte. However, this is still making forward progress, so permits tracing or further analysis with programs like GDB. We use an exponential curve for our delay penalty for a few reasons: 1. We run mem_cgroup_handle_over_high to potentially do reclaim after we've already performed allocations, which means that temporarily going over memory.high by a small amount may be perfectly legitimate, even for compliant workloads. We don't want to unduly penalise such cases. 2. An exponential curve (as opposed to a static or linear delay) allows ramping up memory pressure stats more gradually, which can be useful to work out that you have set memory.high too low, without destroying application performance entirely. This patch expands on earlier work by Johannes Weiner. Thanks! [akpm@linux-foundation.org: fix max() warning] [akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit] [akpm@linux-foundation.org: fix it even more] [chris@chrisdown.name: fix 64-bit divide even more] Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Roman Gushchin Cc: Michal Hocko Cc: Nathan Chancellor Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c6032c03d1d..1b5d0bb008e7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include "internal.h" #include @@ -2358,12 +2359,68 @@ static void high_work_func(struct work_struct *work) reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); } +/* + * Clamp the maximum sleep time per allocation batch to 2 seconds. This is + * enough to still cause a significant slowdown in most cases, while still + * allowing diagnostics and tracing to proceed without becoming stuck. + */ +#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) + +/* + * When calculating the delay, we use these either side of the exponentiation to + * maintain precision and scale to a reasonable number of jiffies (see the table + * below. + * + * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the + * overage ratio to a delay. + * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the + * proposed penalty in order to reduce to a reasonable number of jiffies, and + * to produce a reasonable delay curve. + * + * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a + * reasonable delay curve compared to precision-adjusted overage, not + * penalising heavily at first, but still making sure that growth beyond the + * limit penalises misbehaviour cgroups by slowing them down exponentially. For + * example, with a high of 100 megabytes: + * + * +-------+------------------------+ + * | usage | time to allocate in ms | + * +-------+------------------------+ + * | 100M | 0 | + * | 101M | 6 | + * | 102M | 25 | + * | 103M | 57 | + * | 104M | 102 | + * | 105M | 159 | + * | 106M | 230 | + * | 107M | 313 | + * | 108M | 409 | + * | 109M | 518 | + * | 110M | 639 | + * | 111M | 774 | + * | 112M | 921 | + * | 113M | 1081 | + * | 114M | 1254 | + * | 115M | 1439 | + * | 116M | 1638 | + * | 117M | 1849 | + * | 118M | 2000 | + * | 119M | 2000 | + * | 120M | 2000 | + * +-------+------------------------+ + */ + #define MEMCG_DELAY_PRECISION_SHIFT 20 + #define MEMCG_DELAY_SCALING_SHIFT 14 + /* * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. */ void mem_cgroup_handle_over_high(void) { + unsigned long usage, high, clamped_high; + unsigned long pflags; + unsigned long penalty_jiffies, overage; unsigned int nr_pages = current->memcg_nr_pages_over_high; struct mem_cgroup *memcg; @@ -2372,8 +2429,75 @@ void mem_cgroup_handle_over_high(void) memcg = get_mem_cgroup_from_mm(current->mm); reclaim_high(memcg, nr_pages, GFP_KERNEL); - css_put(&memcg->css); current->memcg_nr_pages_over_high = 0; + + /* + * memory.high is breached and reclaim is unable to keep up. Throttle + * allocators proactively to slow down excessive growth. + * + * We use overage compared to memory.high to calculate the number of + * jiffies to sleep (penalty_jiffies). Ideally this value should be + * fairly lenient on small overages, and increasingly harsh when the + * memcg in question makes it clear that it has no intention of stopping + * its crazy behaviour, so we exponentially increase the delay based on + * overage amount. + */ + + usage = page_counter_read(&memcg->memory); + high = READ_ONCE(memcg->high); + + if (usage <= high) + goto out; + + /* + * Prevent division by 0 in overage calculation by acting as if it was a + * threshold of 1 page + */ + clamped_high = max(high, 1UL); + + overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT, + clamped_high); + + penalty_jiffies = ((u64)overage * overage * HZ) + >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); + + /* + * Factor in the task's own contribution to the overage, such that four + * N-sized allocations are throttled approximately the same as one + * 4N-sized allocation. + * + * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or + * larger the current charge patch is than that. + */ + penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; + + /* + * Clamp the max delay per usermode return so as to still keep the + * application moving forwards and also permit diagnostics, albeit + * extremely slowly. + */ + penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + + /* + * Don't sleep if the amount of jiffies this memcg owes us is so low + * that it's not even worth doing, in an attempt to be nice to those who + * go only a small amount over their memory.high value and maybe haven't + * been aggressively reclaimed enough yet. + */ + if (penalty_jiffies <= HZ / 100) + goto out; + + /* + * If we exit early, we're guaranteed to die (since + * schedule_timeout_killable sets TASK_KILLABLE). This means we don't + * need to account for any ill-begotten jiffies to pay them off later. + */ + psi_memstall_enter(&pflags); + schedule_timeout_killable(penalty_jiffies); + psi_memstall_leave(&pflags); + +out: + css_put(&memcg->css); } static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, From e1a366be5cb4f849ec4de170d50eebc08bb0af20 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 23 Sep 2019 15:34:58 -0700 Subject: [PATCH 1003/2880] mm: memcontrol: switch to rcu protection in drain_all_stock() Commit 72f0184c8a00 ("mm, memcg: remove hotplug locking from try_charge") introduced css_tryget()/css_put() calls in drain_all_stock(), which are supposed to protect the target memory cgroup from being released during the mem_cgroup_is_descendant() call. However, it's not completely safe. In theory, memcg can go away between reading stock->cached pointer and calling css_tryget(). This can happen if drain_all_stock() races with drain_local_stock() performed on the remote cpu as a result of a work, scheduled by the previous invocation of drain_all_stock(). The race is a bit theoretical and there are few chances to trigger it, but the current code looks a bit confusing, so it makes sense to fix it anyway. The code looks like as if css_tryget() and css_put() are used to protect stocks drainage. It's not necessary because stocked pages are holding references to the cached cgroup. And it obviously won't work for works, scheduled on other cpus. So, let's read the stock->cached pointer and evaluate the memory cgroup inside a rcu read section, and get rid of css_tryget()/css_put() calls. Link: http://lkml.kernel.org/r/20190802192241.3253165-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Cc: Hillf Danton Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b5d0bb008e7..80131025f2da 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2271,21 +2271,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; + bool flush = false; + rcu_read_lock(); memcg = stock->cached; - if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) - continue; - if (!mem_cgroup_is_descendant(memcg, root_memcg)) { - css_put(&memcg->css); - continue; - } - if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (memcg && stock->nr_pages && + mem_cgroup_is_descendant(memcg, root_memcg)) + flush = true; + rcu_read_unlock(); + + if (flush && + !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) drain_local_stock(&stock->work); else schedule_work_on(cpu, &stock->work); } - css_put(&memcg->css); } put_cpu(); mutex_unlock(&percpu_charge_mutex); From 1ba6fc9af35bf97c84567d9b3eeb26629d1e3af0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 23 Sep 2019 15:35:01 -0700 Subject: [PATCH 1004/2880] mm: vmscan: do not share cgroup iteration between reclaimers One of our services observed a high rate of cgroup OOM kills in the presence of large amounts of clean cache. Debugging showed that the culprit is the shared cgroup iteration in page reclaim. Under high allocation concurrency, multiple threads enter reclaim at the same time. Fearing overreclaim when we first switched from the single global LRU to cgrouped LRU lists, we introduced a shared iteration state for reclaim invocations - whether 1 or 20 reclaimers are active concurrently, we only walk the cgroup tree once: the 1st reclaimer reclaims the first cgroup, the second the second one etc. With more reclaimers than cgroups, we start another walk from the top. This sounded reasonable at the time, but the problem is that reclaim concurrency doesn't scale with allocation concurrency. As reclaim concurrency increases, the amount of memory individual reclaimers get to scan gets smaller and smaller. Individual reclaimers may only see one cgroup per cycle, and that may not have much reclaimable memory. We see individual reclaimers declare OOM when there is plenty of reclaimable memory available in cgroups they didn't visit. This patch does away with the shared iterator, and every reclaimer is allowed to scan the full cgroup tree and see all of reclaimable memory, just like it would on a non-cgrouped system. This way, when OOM is declared, we know that the reclaimer actually had a chance. To still maintain fairness in reclaim pressure, disallow cgroup reclaim from bailing out of the tree walk early. Kswapd and regular direct reclaim already don't bail, so it's not clear why limit reclaim would have to, especially since it only walks subtrees to begin with. This change completely eliminates the OOM kills on our service, while showing no signs of overreclaim - no increased scan rates, %sys time, or abrupt free memory spikes. I tested across 100 machines that have 64G of RAM and host about 300 cgroups each. [ It's possible overreclaim never was a *practical* issue to begin with - it was simply a concern we had on the mailing lists at the time, with no real data to back it up. But we have also added more bail-out conditions deeper inside reclaim (e.g. the proportional exit in shrink_node_memcg) since. Regardless, now we have data that suggests full walks are more reliable and scale just fine. ] Link: http://lkml.kernel.org/r/20190812192316.13615-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e03427cb64f..2ee4d9283738 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2664,10 +2664,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) do { struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup_reclaim_cookie reclaim = { - .pgdat = pgdat, - .priority = sc->priority, - }; unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; @@ -2676,7 +2672,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - memcg = mem_cgroup_iter(root, NULL, &reclaim); + memcg = mem_cgroup_iter(root, NULL, NULL); do { unsigned long lru_pages; unsigned long reclaimed; @@ -2719,21 +2715,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); - /* - * Kswapd have to scan all memory cgroups to fulfill - * the overall scan target for the node. - * - * Limit reclaim, on the other hand, only cares about - * nr_to_reclaim pages to be reclaimed and it will - * retry with decreasing priority if one round over the - * whole hierarchy is not sufficient. - */ - if (!current_is_kswapd() && - sc->nr_reclaimed >= sc->nr_to_reclaim) { - mem_cgroup_iter_break(root, memcg); - break; - } - } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); + } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; From 2d15eb31b50a1b93927bdb7cc5d9960b90f7c1e4 Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Mon, 23 Sep 2019 15:35:04 -0700 Subject: [PATCH 1005/2880] mm/gup: add make_dirty arg to put_user_pages_dirty_lock() [11~From: John Hubbard Subject: mm/gup: add make_dirty arg to put_user_pages_dirty_lock() Patch series "mm/gup: add make_dirty arg to put_user_pages_dirty_lock()", v3. There are about 50+ patches in my tree [2], and I'll be sending out the remaining ones in a few more groups: * The block/bio related changes (Jerome mostly wrote those, but I've had to move stuff around extensively, and add a little code) * mm/ changes * other subsystem patches * an RFC that shows the current state of the tracking patch set. That can only be applied after all call sites are converted, but it's good to get an early look at it. This is part a tree-wide conversion, as described in fc1d8e7cca2d ("mm: introduce put_user_page*(), placeholder versions"). This patch (of 3): Provide more capable variation of put_user_pages_dirty_lock(), and delete put_user_pages_dirty(). This is based on the following: 1. Lots of call sites become simpler if a bool is passed into put_user_page*(), instead of making the call site choose which put_user_page*() variant to call. 2. Christoph Hellwig's observation that set_page_dirty_lock() is usually correct, and set_page_dirty() is usually a bug, or at least questionable, within a put_user_page*() calling chain. This leads to the following API choices: * put_user_pages_dirty_lock(page, npages, make_dirty) * There is no put_user_pages_dirty(). You have to hand code that, in the rare case that it's required. [jhubbard@nvidia.com: remove unused variable in siw_free_plist()] Link: http://lkml.kernel.org/r/20190729074306.10368-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20190724044537.10458-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Matthew Wilcox Cc: Jan Kara Cc: Christoph Hellwig Cc: Ira Weiny Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/core/umem.c | 5 +- drivers/infiniband/hw/hfi1/user_pages.c | 5 +- drivers/infiniband/hw/qib/qib_user_pages.c | 5 +- drivers/infiniband/hw/usnic/usnic_uiom.c | 5 +- drivers/infiniband/sw/siw/siw_mem.c | 10 +- include/linux/mm.h | 5 +- mm/gup.c | 115 +++++++++------------ 7 files changed, 58 insertions(+), 92 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 41f9e268e3fb..24244a2f68cc 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -54,10 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { page = sg_page_iter_page(&sg_iter); - if (umem->writable && dirty) - put_user_pages_dirty_lock(&page, 1); - else - put_user_page(page); + put_user_pages_dirty_lock(&page, 1, umem->writable && dirty); } sg_free_table(&umem->sg_head); diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index b89a9b9aef7a..469acb961fbd 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -118,10 +118,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, size_t npages, bool dirty) { - if (dirty) - put_user_pages_dirty_lock(p, npages); - else - put_user_pages(p, npages); + put_user_pages_dirty_lock(p, npages, dirty); if (mm) { /* during close after signal, mm can be NULL */ atomic64_sub(npages, &mm->pinned_vm); diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index bfbfbb7e0ff4..6bf764e41891 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -40,10 +40,7 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages, int dirty) { - if (dirty) - put_user_pages_dirty_lock(p, num_pages); - else - put_user_pages(p, num_pages); + put_user_pages_dirty_lock(p, num_pages, dirty); } /** diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 0b0237d41613..62e6ffa9ad78 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -75,10 +75,7 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) for_each_sg(chunk->page_list, sg, chunk->nents, i) { page = sg_page(sg); pa = sg_phys(sg); - if (dirty) - put_user_pages_dirty_lock(&page, 1); - else - put_user_page(page); + put_user_pages_dirty_lock(&page, 1, dirty); usnic_dbg("pa: %pa\n", &pa); } kfree(chunk); diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index 87a56039f0ef..e99983f07663 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -63,15 +63,7 @@ struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, bool dirty) { - struct page **p = chunk->plist; - - while (num_pages--) { - if (!PageDirty(*p) && dirty) - put_user_pages_dirty_lock(p, 1); - else - put_user_page(*p); - p++; - } + put_user_pages_dirty_lock(chunk->plist, num_pages, dirty); } void siw_umem_release(struct siw_umem *umem, bool dirty) diff --git a/include/linux/mm.h b/include/linux/mm.h index 69b7314c8d24..57a9fa34f159 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1075,8 +1075,9 @@ static inline void put_user_page(struct page *page) put_page(page); } -void put_user_pages_dirty(struct page **pages, unsigned long npages); -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty); + void put_user_pages(struct page **pages, unsigned long npages); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) diff --git a/mm/gup.c b/mm/gup.c index 84a36d80dd2e..012060efddf1 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -29,85 +29,70 @@ struct follow_page_context { unsigned int page_mask; }; -typedef int (*set_dirty_func_t)(struct page *page); - -static void __put_user_pages_dirty(struct page **pages, - unsigned long npages, - set_dirty_func_t sdf) -{ - unsigned long index; - - for (index = 0; index < npages; index++) { - struct page *page = compound_head(pages[index]); - - /* - * Checking PageDirty at this point may race with - * clear_page_dirty_for_io(), but that's OK. Two key cases: - * - * 1) This code sees the page as already dirty, so it skips - * the call to sdf(). That could happen because - * clear_page_dirty_for_io() called page_mkclean(), - * followed by set_page_dirty(). However, now the page is - * going to get written back, which meets the original - * intention of setting it dirty, so all is well: - * clear_page_dirty_for_io() goes on to call - * TestClearPageDirty(), and write the page back. - * - * 2) This code sees the page as clean, so it calls sdf(). - * The page stays dirty, despite being written back, so it - * gets written back again in the next writeback cycle. - * This is harmless. - */ - if (!PageDirty(page)) - sdf(page); - - put_user_page(page); - } -} - /** - * put_user_pages_dirty() - release and dirty an array of gup-pinned pages - * @pages: array of pages to be marked dirty and released. + * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages + * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. + * @make_dirty: whether to mark the pages dirty * * "gup-pinned page" refers to a page that has had one of the get_user_pages() * variants called on that page. * * For each page in the @pages array, make that page (or its head page, if a - * compound page) dirty, if it was previously listed as clean. Then, release - * the page using put_user_page(). + * compound page) dirty, if @make_dirty is true, and if the page was previously + * listed as clean. In any case, releases all pages using put_user_page(), + * possibly via put_user_pages(), for the non-dirty case. * * Please see the put_user_page() documentation for details. * - * set_page_dirty(), which does not lock the page, is used here. - * Therefore, it is the caller's responsibility to ensure that this is - * safe. If not, then put_user_pages_dirty_lock() should be called instead. + * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is + * required, then the caller should a) verify that this is really correct, + * because _lock() is usually required, and b) hand code it: + * set_page_dirty_lock(), put_user_page(). * */ -void put_user_pages_dirty(struct page **pages, unsigned long npages) +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty) { - __put_user_pages_dirty(pages, npages, set_page_dirty); -} -EXPORT_SYMBOL(put_user_pages_dirty); + unsigned long index; -/** - * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages - * @pages: array of pages to be marked dirty and released. - * @npages: number of pages in the @pages array. - * - * For each page in the @pages array, make that page (or its head page, if a - * compound page) dirty, if it was previously listed as clean. Then, release - * the page using put_user_page(). - * - * Please see the put_user_page() documentation for details. - * - * This is just like put_user_pages_dirty(), except that it invokes - * set_page_dirty_lock(), instead of set_page_dirty(). - * - */ -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) -{ - __put_user_pages_dirty(pages, npages, set_page_dirty_lock); + /* + * TODO: this can be optimized for huge pages: if a series of pages is + * physically contiguous and part of the same compound page, then a + * single operation to the head page should suffice. + */ + + if (!make_dirty) { + put_user_pages(pages, npages); + return; + } + + for (index = 0; index < npages; index++) { + struct page *page = compound_head(pages[index]); + /* + * Checking PageDirty at this point may race with + * clear_page_dirty_for_io(), but that's OK. Two key + * cases: + * + * 1) This code sees the page as already dirty, so it + * skips the call to set_page_dirty(). That could happen + * because clear_page_dirty_for_io() called + * page_mkclean(), followed by set_page_dirty(). + * However, now the page is going to get written back, + * which meets the original intention of setting it + * dirty, so all is well: clear_page_dirty_for_io() goes + * on to call TestClearPageDirty(), and write the page + * back. + * + * 2) This code sees the page as clean, so it calls + * set_page_dirty(). The page stays dirty, despite being + * written back, so it gets written back again in the + * next writeback cycle. This is harmless. + */ + if (!PageDirty(page)) + set_page_dirty_lock(page); + put_user_page(page); + } } EXPORT_SYMBOL(put_user_pages_dirty_lock); From 6f553ce498a72d91c1c9361fd49bcf07e17c9703 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 23 Sep 2019 15:35:07 -0700 Subject: [PATCH 1006/2880] drivers/gpu/drm/via: convert put_page() to put_user_page*() For pages that were retained via get_user_pages*(), release those pages via the new put_user_page*() routines, instead of via put_page() or release_pages(). This is part a tree-wide conversion, as described in fc1d8e7cca2d ("mm: introduce put_user_page*(), placeholder versions"). Also reverse the order of a comparison, in order to placate checkpatch.pl. Link: http://lkml.kernel.org/r/20190724044537.10458-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: David Airlie Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/via/via_dmablit.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index feaa538026a0..3db000aacd26 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -174,7 +174,6 @@ via_map_blit_for_device(struct pci_dev *pdev, static void via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) { - struct page *page; int i; switch (vsg->state) { @@ -189,13 +188,8 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) kfree(vsg->desc_pages); /* fall through */ case dr_via_pages_locked: - for (i = 0; i < vsg->num_pages; ++i) { - if (NULL != (page = vsg->pages[i])) { - if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction)) - SetPageDirty(page); - put_page(page); - } - } + put_user_pages_dirty_lock(vsg->pages, vsg->num_pages, + (vsg->direction == DMA_FROM_DEVICE)); /* fall through */ case dr_via_pages_alloc: vfree(vsg->pages); From 1edc97694d0fa95d143e3457be892544e41f794a Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 23 Sep 2019 15:35:10 -0700 Subject: [PATCH 1007/2880] net/xdp: convert put_page() to put_user_page*() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For pages that were retained via get_user_pages*(), release those pages via the new put_user_page*() routines, instead of via put_page() or release_pages(). This is part a tree-wide conversion, as described in fc1d8e7cca2d ("mm: introduce put_user_page*(), placeholder versions"). Link: http://lkml.kernel.org/r/20190724044537.10458-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: Björn Töpel Cc: Björn Töpel Cc: Magnus Karlsson Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/xdp/xdp_umem.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 947b8ff0227e..bba3104f128f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -206,14 +206,7 @@ static int xdp_umem_map_pages(struct xdp_umem *umem) static void xdp_umem_unpin_pages(struct xdp_umem *umem) { - unsigned int i; - - for (i = 0; i < umem->npgs; i++) { - struct page *page = umem->pgs[i]; - - set_page_dirty_lock(page); - put_page(page); - } + put_user_pages_dirty_lock(umem->pgs, umem->npgs, true); kfree(umem->pgs); umem->pgs = NULL; From 9da99f20ecf8f81a5446a47fd8de62036c20ae61 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 23 Sep 2019 15:35:13 -0700 Subject: [PATCH 1008/2880] mm: remove redundant assignment of entry Since ptent will not be changed after previous assignment of entry, it is not necessary to do the assignment again. Link: http://lkml.kernel.org/r/20190708082740.21111-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Acked-by: Matthew Wilcox (Oracle) Cc: Will Deacon Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index b1dff75640b7..676020552b32 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1093,7 +1093,6 @@ again: if (unlikely(details)) continue; - entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) rss[MM_SWAPENTS]--; else if (is_migration_entry(entry)) { From 7b167b681013f5715b6e5c4f458e346501464259 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 24 Sep 2019 00:02:24 +0000 Subject: [PATCH 1009/2880] mm: release the spinlock on zap_pte_range In our testing (camera recording), Miguel and Wei found unmap_page_range() takes above 6ms with preemption disabled easily. When I see that, the reason is it holds page table spinlock during entire 512 page operation in a PMD. 6.2ms is never trivial for user experince if RT task couldn't run in the time because it could make frame drop or glitch audio problem. I had a time to benchmark it via adding some trace_printk hooks between pte_offset_map_lock and pte_unmap_unlock in zap_pte_range. The testing device is 2018 premium mobile device. I can get 2ms delay rather easily to release 2M(ie, 512 pages) when the task runs on little core even though it doesn't have any IPI and LRU lock contention. It's already too heavy. If I remove activate_page, 35-40% overhead of zap_pte_range is gone so most of overhead(about 0.7ms) comes from activate_page via mark_page_accessed. Thus, if there are LRU contention, that 0.7ms could accumulate up to several ms. So this patch adds a check for need_resched() in the loop, and a preemption point if necessary. Link: http://lkml.kernel.org/r/20190731061440.GC155569@google.com Signed-off-by: Minchan Kim Reported-by: Miguel de Dios Reported-by: Wei Wang Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 676020552b32..9e6ac954c70b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1026,6 +1026,9 @@ again: if (pte_none(ptent)) continue; + if (need_resched()) + break; + if (pte_present(ptent)) { struct page *page; @@ -1123,8 +1126,11 @@ again: if (force_flush) { force_flush = 0; tlb_flush_mmu(tlb); - if (addr != end) - goto again; + } + + if (addr != end) { + cond_resched(); + goto again; } return addr; From 13224794cb0832caa403ad583d8605202cabc6bc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Sep 2019 15:35:19 -0700 Subject: [PATCH 1010/2880] mm: remove quicklist page table caches Patch series "mm: remove quicklist page table caches". A while ago Nicholas proposed to remove quicklist page table caches [1]. I've rebased his patch on the curren upstream and switched ia64 and sh to use generic versions of PTE allocation. [1] https://lore.kernel.org/linux-mm/20190711030339.20892-1-npiggin@gmail.com This patch (of 3): Remove page table allocator "quicklists". These have been around for a long time, but have not got much traction in the last decade and are only used on ia64 and sh architectures. The numbers in the initial commit look interesting but probably don't apply anymore. If anybody wants to resurrect this it's in the git history, but it's unhelpful to have this code and divergent allocator behaviour for minor archs. Also it might be better to instead make more general improvements to page allocator if this is still so slow. Link: http://lkml.kernel.org/r/1565250728-21721-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Nicholas Piggin Signed-off-by: Mike Rapoport Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgalloc.h | 2 - arch/arc/include/asm/pgalloc.h | 1 - arch/arm/include/asm/pgalloc.h | 2 - arch/arm64/include/asm/pgalloc.h | 2 - arch/csky/include/asm/pgalloc.h | 2 - arch/hexagon/include/asm/pgalloc.h | 2 - arch/ia64/Kconfig | 4 - arch/ia64/include/asm/pgalloc.h | 32 +++----- arch/m68k/include/asm/pgtable_mm.h | 2 - arch/m68k/include/asm/pgtable_no.h | 2 - arch/microblaze/include/asm/pgalloc.h | 89 ++-------------------- arch/microblaze/mm/pgtable.c | 4 - arch/mips/include/asm/pgalloc.h | 2 - arch/nds32/include/asm/pgalloc.h | 2 - arch/nios2/include/asm/pgalloc.h | 2 - arch/openrisc/include/asm/pgalloc.h | 2 - arch/parisc/include/asm/pgalloc.h | 2 - arch/powerpc/include/asm/pgalloc.h | 2 - arch/riscv/include/asm/pgalloc.h | 4 - arch/s390/include/asm/pgtable.h | 1 - arch/sh/include/asm/pgalloc.h | 22 ++---- arch/sh/mm/Kconfig | 3 - arch/sparc/include/asm/pgalloc_32.h | 2 - arch/sparc/include/asm/pgalloc_64.h | 2 - arch/sparc/mm/init_32.c | 1 - arch/um/include/asm/pgalloc.h | 2 - arch/unicore32/include/asm/pgalloc.h | 2 - arch/x86/include/asm/pgtable_32.h | 1 - arch/x86/include/asm/pgtable_64.h | 1 - arch/xtensa/include/asm/tlbflush.h | 3 - fs/proc/meminfo.c | 4 - include/asm-generic/pgalloc.h | 5 -- include/linux/quicklist.h | 94 ----------------------- kernel/sched/idle.c | 1 - lib/show_mem.c | 5 -- mm/Kconfig | 5 -- mm/Makefile | 1 - mm/mmu_gather.c | 2 - mm/quicklist.c | 103 -------------------------- 39 files changed, 25 insertions(+), 395 deletions(-) delete mode 100644 include/linux/quicklist.h delete mode 100644 mm/quicklist.c diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index 71ded3b7d82d..eb91f1e85629 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -53,6 +53,4 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -#define check_pgt_cache() do { } while (0) - #endif /* _ALPHA_PGALLOC_H */ diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 9bdb8ed5b0db..4751f2251cd9 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -129,7 +129,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptep) #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) -#define check_pgt_cache() do { } while (0) #define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd)) #endif /* _ASM_ARC_PGALLOC_H */ diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index a2a68b751971..069da393110c 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -15,8 +15,6 @@ #include #include -#define check_pgt_cache() do { } while (0) - #ifdef CONFIG_MMU #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER)) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 14d0bc44d451..172d76fa0245 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -15,8 +15,6 @@ #include /* for pte_{alloc,free}_one */ -#define check_pgt_cache() do { } while (0) - #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index 98c5716708d6..d089113fe41f 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -75,8 +75,6 @@ do { \ tlb_remove_page(tlb, pte); \ } while (0) -#define check_pgt_cache() do {} while (0) - extern void pagetable_init(void); extern void pre_mmu_init(void); extern void pre_trap_init(void); diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index d6544dc71258..5a6e79e7926d 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -13,8 +13,6 @@ #include /* for pte_{alloc,free}_one */ -#define check_pgt_cache() do {} while (0) - extern unsigned long long kmap_generation; /* diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 685a3df126ca..16714477eef4 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -72,10 +72,6 @@ config 64BIT config ZONE_DMA32 def_bool y -config QUICKLIST - bool - default y - config MMU bool default y diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index c9e481023c25..b03d993f5d7e 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -19,18 +19,17 @@ #include #include #include -#include #include static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - quicklist_free(0, NULL, pgd); + free_page((unsigned long)pgd); } #if CONFIG_PGTABLE_LEVELS == 4 @@ -42,12 +41,12 @@ pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) { - quicklist_free(0, NULL, pud); + free_page((unsigned long)pud); } #define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud) #endif /* CONFIG_PGTABLE_LEVELS == 4 */ @@ -60,12 +59,12 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { - quicklist_free(0, NULL, pmd); + free_page((unsigned long)pmd); } #define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd) @@ -86,14 +85,12 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte) static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; - void *pg; - pg = quicklist_alloc(0, GFP_KERNEL, NULL); - if (!pg) + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) return NULL; - page = virt_to_page(pg); if (!pgtable_page_ctor(page)) { - quicklist_free(0, NULL, pg); + __free_page(page); return NULL; } return page; @@ -101,23 +98,18 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pte_free(struct mm_struct *mm, pgtable_t pte) { pgtable_page_dtor(pte); - quicklist_free_page(0, NULL, pte); + __free_page(pte); } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - quicklist_free(0, NULL, pte); -} - -static inline void check_pgt_cache(void) -{ - quicklist_trim(0, NULL, 25, 16); + free_page((unsigned long)pte); } #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index fde4534b974f..cc476c1d72e5 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -181,6 +181,4 @@ pgprot_t pgprot_dmacoherent(pgprot_t prot); */ #define pgtable_cache_init() do { } while (0) -#define check_pgt_cache() do { } while (0) - #endif /* _M68K_PGTABLE_H */ diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index fc3a96c77bd8..69e271101223 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -60,6 +60,4 @@ extern void paging_init(void); #include -#define check_pgt_cache() do { } while (0) - #endif /* _M68KNOMMU_PGTABLE_H */ diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index f4cc9ffc449e..ac0731881751 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -21,83 +21,20 @@ #include #include -#define PGDIR_ORDER 0 - -/* - * This is handled very differently on MicroBlaze since out page tables - * are all 0's and I want to be able to use these zero'd pages elsewhere - * as well - it gives us quite a speedup. - * -- Cort - */ -extern struct pgtable_cache_struct { - unsigned long *pgd_cache; - unsigned long *pte_cache; - unsigned long pgtable_cache_sz; -} quicklists; - -#define pgd_quicklist (quicklists.pgd_cache) -#define pmd_quicklist ((unsigned long *)0) -#define pte_quicklist (quicklists.pte_cache) -#define pgtable_cache_size (quicklists.pgtable_cache_sz) - -extern unsigned long *zero_cache; /* head linked list of pre-zero'd pages */ -extern atomic_t zero_sz; /* # currently pre-zero'd pages */ -extern atomic_t zeropage_hits; /* # zero'd pages request that we've done */ -extern atomic_t zeropage_calls; /* # zero'd pages request that've been made */ -extern atomic_t zerototal; /* # pages zero'd over time */ - -#define zero_quicklist (zero_cache) -#define zero_cache_sz (zero_sz) -#define zero_cache_calls (zeropage_calls) -#define zero_cache_hits (zeropage_hits) -#define zero_cache_total (zerototal) - -/* - * return a pre-zero'd page from the list, - * return NULL if none available -- Cort - */ -extern unsigned long get_zero_page_fast(void); - extern void __bad_pte(pmd_t *pmd); -static inline pgd_t *get_pgd_slow(void) +static inline pgd_t *get_pgd(void) { - pgd_t *ret; - - ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER); - if (ret != NULL) - clear_page(ret); - return ret; + return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0); } -static inline pgd_t *get_pgd_fast(void) -{ - unsigned long *ret; - - ret = pgd_quicklist; - if (ret != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - pgtable_cache_size--; - } else - ret = (unsigned long *)get_pgd_slow(); - return (pgd_t *)ret; -} - -static inline void free_pgd_fast(pgd_t *pgd) -{ - *(unsigned long **)pgd = pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size++; -} - -static inline void free_pgd_slow(pgd_t *pgd) +static inline void free_pgd(pgd_t *pgd) { free_page((unsigned long)pgd); } -#define pgd_free(mm, pgd) free_pgd_fast(pgd) -#define pgd_alloc(mm) get_pgd_fast() +#define pgd_free(mm, pgd) free_pgd(pgd) +#define pgd_alloc(mm) get_pgd() #define pmd_pgtable(pmd) pmd_page(pmd) @@ -115,15 +52,14 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) struct page *ptepage; #ifdef CONFIG_HIGHPTE - int flags = GFP_KERNEL | __GFP_HIGHMEM; + int flags = GFP_KERNEL | __GFP_ZERO | __GFP_HIGHMEM; #else - int flags = GFP_KERNEL; + int flags = GFP_KERNEL | __GFP_ZERO; #endif ptepage = alloc_pages(flags, 0); if (!ptepage) return NULL; - clear_highpage(ptepage); if (!pgtable_page_ctor(ptepage)) { __free_page(ptepage); return NULL; @@ -131,13 +67,6 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) return ptepage; } -static inline void pte_free_fast(pte_t *pte) -{ - *(unsigned long **)pte = pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; -} - static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); @@ -171,10 +100,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *ptepage) #define __pmd_free_tlb(tlb, x, addr) pmd_free((tlb)->mm, x) #define pgd_populate(mm, pmd, pte) BUG() -extern int do_check_pgt_cache(int, int); - #endif /* CONFIG_MMU */ -#define check_pgt_cache() do { } while (0) - #endif /* _ASM_MICROBLAZE_PGALLOC_H */ diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 8fe54fda31dc..010bb9cee2e4 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -44,10 +44,6 @@ unsigned long ioremap_base; unsigned long ioremap_bot; EXPORT_SYMBOL(ioremap_bot); -#ifndef CONFIG_SMP -struct pgtable_cache_struct quicklists; -#endif - static void __iomem *__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index aa16b85ddffc..aa73cb187a07 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -105,8 +105,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) #endif /* __PAGETABLE_PUD_FOLDED */ -#define check_pgt_cache() do { } while (0) - extern void pagetable_init(void); #endif /* _ASM_PGALLOC_H */ diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index e78b43d8389f..37125e6884d7 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h @@ -23,8 +23,6 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); -#define check_pgt_cache() do { } while (0) - static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { pgtable_t pte; diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 4bc8cf72067e..750d18d5980b 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -45,6 +45,4 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) tlb_remove_page((tlb), (pte)); \ } while (0) -#define check_pgt_cache() do { } while (0) - #endif /* _ASM_NIOS2_PGALLOC_H */ diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 3d4b397c2d06..787c1b9d2f6d 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -101,6 +101,4 @@ do { \ #define pmd_pgtable(pmd) pmd_page(pmd) -#define check_pgt_cache() do { } while (0) - #endif diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index 4f2059a50fae..d98647c29b74 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -124,6 +124,4 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) pmd_populate_kernel(mm, pmd, page_address(pte_page)) #define pmd_pgtable(pmd) pmd_page(pmd) -#define check_pgt_cache() do { } while (0) - #endif diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 2b2c60a1a66d..6dd78a2dc03a 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -64,8 +64,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) extern struct kmem_cache *pgtable_cache[]; #define PGT_CACHE(shift) pgtable_cache[shift] -static inline void check_pgt_cache(void) { } - #ifdef CONFIG_PPC_BOOK3S #include #else diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 56a67d66f72f..f66a00d8cb19 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -82,8 +82,4 @@ do { \ tlb_remove_page((tlb), pte); \ } while (0) -static inline void check_pgt_cache(void) -{ -} - #endif /* _ASM_RISCV_PGALLOC_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0c4600725fc2..8f59454ac407 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1686,7 +1686,6 @@ extern void s390_reset_cmma(struct mm_struct *mm); * No page table caches to initialise */ static inline void pgtable_cache_init(void) { } -static inline void check_pgt_cache(void) { } #include diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index b56f908b1395..9e15054858b4 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -2,11 +2,8 @@ #ifndef __ASM_SH_PGALLOC_H #define __ASM_SH_PGALLOC_H -#include #include -#define QUICK_PT 0 /* Other page table pages that are zero on free */ - extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); @@ -34,20 +31,18 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, */ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; - void *pg; - pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); - if (!pg) + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) return NULL; - page = virt_to_page(pg); if (!pgtable_page_ctor(page)) { - quicklist_free(QUICK_PT, NULL, pg); + __free_page(page); return NULL; } return page; @@ -55,13 +50,13 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - quicklist_free(QUICK_PT, NULL, pte); + free_page((unsigned long)pte); } static inline void pte_free(struct mm_struct *mm, pgtable_t pte) { pgtable_page_dtor(pte); - quicklist_free_page(QUICK_PT, NULL, pte); + __free_page(pte); } #define __pte_free_tlb(tlb,pte,addr) \ @@ -79,9 +74,4 @@ do { \ } while (0); #endif -static inline void check_pgt_cache(void) -{ - quicklist_trim(QUICK_PT, NULL, 25, 16); -} - #endif /* __ASM_SH_PGALLOC_H */ diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 02ed2df25a54..5c8a2ebfc720 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -1,9 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 menu "Memory management options" -config QUICKLIST - def_bool y - config MMU bool "Support for memory management hardware" depends on !CPU_SH2 diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 282be50a4adf..10538a4d1a1e 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -17,8 +17,6 @@ void srmmu_free_nocache(void *addr, int size); extern struct resource sparc_iomap; -#define check_pgt_cache() do { } while (0) - pgd_t *get_pgd_fast(void); static inline void free_pgd_fast(pgd_t *pgd) { diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 48abccba4991..9d3e5cc95bbb 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -69,8 +69,6 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage); #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) #define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) -#define check_pgt_cache() do { } while (0) - void pgtable_free(void *table, bool is_page); #ifdef CONFIG_SMP diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 046ab116cc8c..906eda1158b4 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -31,7 +31,6 @@ #include #include #include -#include /* bug in asm-generic/tlb.h: check_pgt_cache */ #include #include #include diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 023599c3fa51..446e0c0f4018 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -43,7 +43,5 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) #endif -#define check_pgt_cache() do { } while (0) - #endif diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index 3f0903bd98e9..ba1c9a79993b 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -18,8 +18,6 @@ #define __HAVE_ARCH_PTE_ALLOC_ONE #include -#define check_pgt_cache() do { } while (0) - #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) #define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index c78da8eda8f2..b9b9f8aa963e 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -30,7 +30,6 @@ extern pgd_t initial_page_table[1024]; extern pmd_t initial_pg_pmd[]; static inline void pgtable_cache_init(void) { } -static inline void check_pgt_cache(void) { } void paging_init(void); void sync_initial_page_table(void); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4990d26dfc73..a26d2d58b9c9 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -242,7 +242,6 @@ extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #define pgtable_cache_init() do { } while (0) -#define check_pgt_cache() do { } while (0) #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 diff --git a/arch/xtensa/include/asm/tlbflush.h b/arch/xtensa/include/asm/tlbflush.h index 06875feb27c2..856e2da2e397 100644 --- a/arch/xtensa/include/asm/tlbflush.h +++ b/arch/xtensa/include/asm/tlbflush.h @@ -160,9 +160,6 @@ static inline void invalidate_dtlb_mapping (unsigned address) invalidate_dtlb_entry(tlb_entry); } -#define check_pgt_cache() do { } while (0) - - /* * DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa * ISA and exist only for test purposes.. diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 465ea0153b2a..4bd80e68eb03 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -106,9 +105,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); -#ifdef CONFIG_QUICKLIST - show_val_kb(m, "Quicklists: ", quicklist_total_size()); -#endif show_val_kb(m, "NFS_Unstable: ", global_node_page_state(NR_UNSTABLE_NFS)); diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 8476175c07e7..6f8cc06ee44e 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -102,11 +102,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) __free_page(pte_page); } -#else /* CONFIG_MMU */ - -/* This is enough for a nommu architecture */ -#define check_pgt_cache() do { } while (0) - #endif /* CONFIG_MMU */ #endif /* __ASM_GENERIC_PGALLOC_H */ diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h deleted file mode 100644 index 034982c98c8b..000000000000 --- a/include/linux/quicklist.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_QUICKLIST_H -#define LINUX_QUICKLIST_H -/* - * Fast allocations and disposal of pages. Pages must be in the condition - * as needed after allocation when they are freed. Per cpu lists of pages - * are kept that only contain node local pages. - * - * (C) 2007, SGI. Christoph Lameter - */ -#include -#include -#include - -#ifdef CONFIG_QUICKLIST - -struct quicklist { - void *page; - int nr_pages; -}; - -DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; - -/* - * The two key functions quicklist_alloc and quicklist_free are inline so - * that they may be custom compiled for the platform. - * Specifying a NULL ctor can remove constructor support. Specifying - * a constant quicklist allows the determination of the exact address - * in the per cpu area. - * - * The fast patch in quicklist_alloc touched only a per cpu cacheline and - * the first cacheline of the page itself. There is minmal overhead involved. - */ -static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) -{ - struct quicklist *q; - void **p = NULL; - - q =&get_cpu_var(quicklist)[nr]; - p = q->page; - if (likely(p)) { - q->page = p[0]; - p[0] = NULL; - q->nr_pages--; - } - put_cpu_var(quicklist); - if (likely(p)) - return p; - - p = (void *)__get_free_page(flags | __GFP_ZERO); - if (ctor && p) - ctor(p); - return p; -} - -static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p, - struct page *page) -{ - struct quicklist *q; - - q = &get_cpu_var(quicklist)[nr]; - *(void **)p = q->page; - q->page = p; - q->nr_pages++; - put_cpu_var(quicklist); -} - -static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp) -{ - __quicklist_free(nr, dtor, pp, virt_to_page(pp)); -} - -static inline void quicklist_free_page(int nr, void (*dtor)(void *), - struct page *page) -{ - __quicklist_free(nr, dtor, page_address(page), page); -} - -void quicklist_trim(int nr, void (*dtor)(void *), - unsigned long min_pages, unsigned long max_free); - -unsigned long quicklist_total_size(void); - -#else - -static inline unsigned long quicklist_total_size(void) -{ - return 0; -} - -#endif - -#endif /* LINUX_QUICKLIST_H */ - diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c892c6280c9f..8dad5aa600ea 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -238,7 +238,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - check_pgt_cache(); rmb(); local_irq_disable(); diff --git a/lib/show_mem.c b/lib/show_mem.c index 5c86ef4c899f..1c26c14ffbb9 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -6,7 +6,6 @@ */ #include -#include #include void show_mem(unsigned int filter, nodemask_t *nodemask) @@ -39,10 +38,6 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_CMA printk("%lu pages cma reserved\n", totalcma_pages); #endif -#ifdef CONFIG_QUICKLIST - printk("%lu pages in pagetable cache\n", - quicklist_total_size()); -#endif #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif diff --git a/mm/Kconfig b/mm/Kconfig index 2fe4902ad755..f88be1cbcfb2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -273,11 +273,6 @@ config BOUNCE by default when ZONE_DMA or HIGHMEM is selected, but you may say n to override this. -config NR_QUICK - int - depends on QUICKLIST - default "1" - config VIRT_TO_BUS bool help diff --git a/mm/Makefile b/mm/Makefile index d0b295c3b764..d11de59e9c3c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -72,7 +72,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o -obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 8c943a6e1696..7d70e5c78f97 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -271,8 +271,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, tlb_flush_mmu(tlb); - /* keep the page table cache within bounds */ - check_pgt_cache(); #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif diff --git a/mm/quicklist.c b/mm/quicklist.c deleted file mode 100644 index 5e98ac78e410..000000000000 --- a/mm/quicklist.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Quicklist support. - * - * Quicklists are light weight lists of pages that have a defined state - * on alloc and free. Pages must be in the quicklist specific defined state - * (zero by default) when the page is freed. It seems that the initial idea - * for such lists first came from Dave Miller and then various other people - * improved on it. - * - * Copyright (C) 2007 SGI, - * Christoph Lameter - * Generalized, added support for multiple lists and - * constructors / destructors. - */ -#include - -#include -#include -#include -#include - -DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); - -#define FRACTION_OF_NODE_MEM 16 - -static unsigned long max_pages(unsigned long min_pages) -{ - unsigned long node_free_pages, max; - int node = numa_node_id(); - struct zone *zones = NODE_DATA(node)->node_zones; - int num_cpus_on_node; - - node_free_pages = -#ifdef CONFIG_ZONE_DMA - zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) + -#endif -#ifdef CONFIG_ZONE_DMA32 - zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) + -#endif - zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); - - max = node_free_pages / FRACTION_OF_NODE_MEM; - - num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); - max /= num_cpus_on_node; - - return max(max, min_pages); -} - -static long min_pages_to_free(struct quicklist *q, - unsigned long min_pages, long max_free) -{ - long pages_to_free; - - pages_to_free = q->nr_pages - max_pages(min_pages); - - return min(pages_to_free, max_free); -} - -/* - * Trim down the number of pages in the quicklist - */ -void quicklist_trim(int nr, void (*dtor)(void *), - unsigned long min_pages, unsigned long max_free) -{ - long pages_to_free; - struct quicklist *q; - - q = &get_cpu_var(quicklist)[nr]; - if (q->nr_pages > min_pages) { - pages_to_free = min_pages_to_free(q, min_pages, max_free); - - while (pages_to_free > 0) { - /* - * We pass a gfp_t of 0 to quicklist_alloc here - * because we will never call into the page allocator. - */ - void *p = quicklist_alloc(nr, 0, NULL); - - if (dtor) - dtor(p); - free_page((unsigned long)p); - pages_to_free--; - } - } - put_cpu_var(quicklist); -} - -unsigned long quicklist_total_size(void) -{ - unsigned long count = 0; - int cpu; - struct quicklist *ql, *q; - - for_each_online_cpu(cpu) { - ql = per_cpu(quicklist, cpu); - for (q = ql; q < ql + CONFIG_NR_QUICK; q++) - count += q->nr_pages; - } - return count; -} - From 013199211c8bfe9cad9312f5083f0d5a576cf74a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:35:22 -0700 Subject: [PATCH 1011/2880] ia64: switch to generic version of pte allocation The ia64 implementation pte_alloc_one(), pte_alloc_one_kernel(), pte_free_kernel() and pte_free() is identical to the generic except of lack of __GFP_ACCOUNT for the user PTEs allocation. Switch ia64 to use generic version of these functions. Link: http://lkml.kernel.org/r/1565250728-21721-3-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/pgalloc.h | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index b03d993f5d7e..f4c491044882 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -20,6 +20,8 @@ #include #include +#include + #include static inline pgd_t *pgd_alloc(struct mm_struct *mm) @@ -82,36 +84,6 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte) pmd_val(*pmd_entry) = __pa(pte); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *page; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; -} - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) #endif /* _ASM_IA64_PGALLOC_H */ From 6fb12766f7fcd7934b09fbfbd32a725cc2febf96 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:35:25 -0700 Subject: [PATCH 1012/2880] sh: switch to generic version of pte allocation The sh implementation pte_alloc_one(), pte_alloc_one_kernel(), pte_free_kernel() and pte_free() is identical to the generic except of lack of __GFP_ACCOUNT for the user PTEs allocation. Switch sh to use generic version of these functions. Link: http://lkml.kernel.org/r/1565250728-21721-4-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/pgalloc.h | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 9e15054858b4..8c6341a4d807 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -3,6 +3,7 @@ #define __ASM_SH_PGALLOC_H #include +#include extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); @@ -26,39 +27,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, } #define pmd_pgtable(pmd) pmd_page(pmd) -/* - * Allocate and free page tables. - */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *page; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - #define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ From 1b9a9d8564cbc49e1e5529460ca70bf7353aa4d1 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:35:28 -0700 Subject: [PATCH 1013/2880] microblaze: switch to generic version of pte allocation The microblaze implementation of pte_alloc_one() has a provision to allocated PTEs from high memory, but neither CONFIG_HIGHPTE nor pte_map*() versions for suitable for HIGHPTE are defined. Except that, microblaze version of pte_alloc_one() is identical to the generic one as well as the implementations of pte_free() and pte_free_kernel(). Switch microblaze to use the generic versions of these functions. Also remove pte_free_slow() that is not referenced anywhere in the code. Link: http://lkml.kernel.org/r/1565690952-32158-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Mark Rutland Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/pgalloc.h | 39 +++------------------------ 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index ac0731881751..7ecb05baa601 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -21,6 +21,9 @@ #include #include +#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +#include + extern void __bad_pte(pmd_t *pmd); static inline pgd_t *get_pgd(void) @@ -47,42 +50,6 @@ static inline void free_pgd(pgd_t *pgd) extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -static inline struct page *pte_alloc_one(struct mm_struct *mm) -{ - struct page *ptepage; - -#ifdef CONFIG_HIGHPTE - int flags = GFP_KERNEL | __GFP_ZERO | __GFP_HIGHMEM; -#else - int flags = GFP_KERNEL | __GFP_ZERO; -#endif - - ptepage = alloc_pages(flags, 0); - if (!ptepage) - return NULL; - if (!pgtable_page_ctor(ptepage)) { - __free_page(ptepage); - return NULL; - } - return ptepage; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free_slow(struct page *ptepage) -{ - __free_page(ptepage); -} - -static inline void pte_free(struct mm_struct *mm, struct page *ptepage) -{ - pgtable_page_dtor(ptepage); - __free_page(ptepage); -} - #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte)) #define pmd_populate(mm, pmd, pte) \ From 782de70c42930baae55234f3df0dc90774924447 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:35:31 -0700 Subject: [PATCH 1014/2880] mm: consolidate pgtable_cache_init() and pgd_cache_init() Both pgtable_cache_init() and pgd_cache_init() are used to initialize kmem cache for page table allocations on several architectures that do not use PAGE_SIZE tables for one or more levels of the page table hierarchy. Most architectures do not implement these functions and use __weak default NOP implementation of pgd_cache_init(). Since there is no such default for pgtable_cache_init(), its empty stub is duplicated among most architectures. Rename the definitions of pgd_cache_init() to pgtable_cache_init() and drop empty stubs of pgtable_cache_init(). Link: http://lkml.kernel.org/r/1566457046-22637-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Will Deacon [arm64] Acked-by: Thomas Gleixner [x86] Cc: Catalin Marinas Cc: Ingo Molnar Cc: Borislav Petkov Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 5 ----- arch/arc/include/asm/pgtable.h | 5 ----- arch/arm/include/asm/pgtable-nommu.h | 5 ----- arch/arm/include/asm/pgtable.h | 2 -- arch/arm64/include/asm/pgtable.h | 2 -- arch/arm64/mm/pgd.c | 2 +- arch/c6x/include/asm/pgtable.h | 5 ----- arch/csky/include/asm/pgtable.h | 5 ----- arch/h8300/include/asm/pgtable.h | 6 ------ arch/hexagon/include/asm/pgtable.h | 3 --- arch/hexagon/mm/Makefile | 2 +- arch/hexagon/mm/pgalloc.c | 10 ---------- arch/ia64/include/asm/pgtable.h | 5 ----- arch/m68k/include/asm/pgtable_mm.h | 5 ----- arch/m68k/include/asm/pgtable_no.h | 5 ----- arch/microblaze/include/asm/pgtable.h | 7 ------- arch/mips/include/asm/pgtable.h | 5 ----- arch/nds32/include/asm/pgtable.h | 2 -- arch/nios2/include/asm/pgtable.h | 2 -- arch/openrisc/include/asm/pgtable.h | 5 ----- arch/parisc/include/asm/pgtable.h | 2 -- arch/powerpc/include/asm/pgtable.h | 1 - arch/riscv/include/asm/pgtable.h | 5 ----- arch/s390/include/asm/pgtable.h | 5 ----- arch/sh/include/asm/pgtable.h | 5 ----- arch/sh/mm/nommu.c | 4 ---- arch/sparc/include/asm/pgtable_32.h | 5 ----- arch/sparc/include/asm/pgtable_64.h | 1 - arch/um/include/asm/pgtable.h | 2 -- arch/unicore32/include/asm/pgtable.h | 2 -- arch/x86/include/asm/pgtable_32.h | 1 - arch/x86/include/asm/pgtable_64.h | 2 -- arch/x86/mm/pgtable.c | 6 +----- arch/xtensa/include/asm/pgtable.h | 1 - include/asm-generic/pgtable.h | 2 +- init/main.c | 3 +-- 36 files changed, 5 insertions(+), 130 deletions(-) delete mode 100644 arch/hexagon/mm/pgalloc.c diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 89c2032f9960..065b57f408c3 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -359,11 +359,6 @@ extern void paging_init(void); #include -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 1d87c18a2976..7addd0301c51 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -395,11 +395,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, /* to cope with aliasing VIPT cache */ #define HAVE_ARCH_UNMAPPED_AREA -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index d0de24f06724..010fa1a35a68 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -70,11 +70,6 @@ typedef pte_t *pte_addr_t; */ extern unsigned int kobjsize(const void *objp); -/* - * No page table caches to initialise. - */ -#define pgtable_cache_init() do { } while (0) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index f2e990dc27e7..3ae120cd1715 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -368,8 +368,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define pgtable_cache_init() do { } while (0) - #endif /* !__ASSEMBLY__ */ #endif /* CONFIG_MMU */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 57427d17580e..7576df00eb50 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -861,8 +861,6 @@ extern int kern_addr_valid(unsigned long addr); #include -static inline void pgtable_cache_init(void) { } - /* * On AArch64, the cache coherency is handled via the set_pte_at() function. */ diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 7548f9ca1f11..4a64089e5771 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -35,7 +35,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } -void __init pgd_cache_init(void) +void __init pgtable_cache_init(void) { if (PGD_SIZE == PAGE_SIZE) return; diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h index 0bd805964ea6..0b6919c00413 100644 --- a/arch/c6x/include/asm/pgtable.h +++ b/arch/c6x/include/asm/pgtable.h @@ -59,11 +59,6 @@ extern unsigned long empty_zero_page; #define swapper_pg_dir ((pgd_t *) 0) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* * c6x is !MMU, so define the simpliest implementation */ diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index c429a6f347de..0040b3a05b61 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -296,11 +296,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ #define kern_addr_valid(addr) (1) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do {} while (0) - #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ remap_pfn_range(vma, vaddr, pfn, size, prot) diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h index a99caa49d265..4d00152fab58 100644 --- a/arch/h8300/include/asm/pgtable.h +++ b/arch/h8300/include/asm/pgtable.h @@ -4,7 +4,6 @@ #define __ARCH_USE_5LEVEL_HACK #include #include -#define pgtable_cache_init() do { } while (0) extern void paging_init(void); #define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */ #define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */ @@ -34,11 +33,6 @@ static inline int pte_file(pte_t pte) { return 0; } extern unsigned int kobjsize(const void *objp); extern int is_in_rom(unsigned long); -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index a3ff6d24c09e..2fec20ad939e 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -431,9 +431,6 @@ static inline int pte_exec(pte_t pte) #define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -/* I think this is in case we have page table caches; needed by init/main.c */ -#define pgtable_cache_init() do { } while (0) - /* * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is * interpreted as swap information. The remaining free bits are interpreted as diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile index 1894263ae5bc..893838499591 100644 --- a/arch/hexagon/mm/Makefile +++ b/arch/hexagon/mm/Makefile @@ -3,5 +3,5 @@ # Makefile for Hexagon memory management subsystem # -obj-y := init.o pgalloc.o ioremap.o uaccess.o vm_fault.o cache.o +obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o obj-y += copy_to_user.o copy_from_user.o strnlen_user.o vm_tlb.o diff --git a/arch/hexagon/mm/pgalloc.c b/arch/hexagon/mm/pgalloc.c deleted file mode 100644 index 4d4316140237..000000000000 --- a/arch/hexagon/mm/pgalloc.c +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. - */ - -#include - -void __init pgtable_cache_init(void) -{ -} diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index b1e7468eb65a..d602e7c622db 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -566,11 +566,6 @@ extern struct page *zero_page_memmap_ptr; #define KERNEL_TR_PAGE_SHIFT _PAGE_SIZE_64M #define KERNEL_TR_PAGE_SIZE (1 << KERNEL_TR_PAGE_SHIFT) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* These tell get_user_pages() that the first gate page is accessible from user-level. */ #define FIXADDR_USER_START GATE_ADDR #ifdef HAVE_BUGGY_SEGREL diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index cc476c1d72e5..646c174fff99 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -176,9 +176,4 @@ pgprot_t pgprot_dmacoherent(pgprot_t prot); #include #endif /* !__ASSEMBLY__ */ -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* _M68K_PGTABLE_H */ diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index 69e271101223..c18165b0d904 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -44,11 +44,6 @@ extern void paging_init(void); */ #define ZERO_PAGE(vaddr) (virt_to_page(0)) -/* - * No page table caches to initialise. - */ -#define pgtable_cache_init() do { } while (0) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 142d3f004848..954b69af451f 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -46,8 +46,6 @@ extern int mem_init_done; #define swapper_pg_dir ((pgd_t *) NULL) -#define pgtable_cache_init() do {} while (0) - #define arch_enter_lazy_cpu_mode() do {} while (0) #define pgprot_noncached_wc(prot) prot @@ -526,11 +524,6 @@ extern unsigned long iopa(unsigned long addr); /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ #define kern_addr_valid(addr) (1) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - void do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code); diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 4dca733d5076..f85bd5b15f51 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -661,9 +661,4 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* _ASM_PGTABLE_H */ diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h index c70cc56bec09..0588ec99725c 100644 --- a/arch/nds32/include/asm/pgtable.h +++ b/arch/nds32/include/asm/pgtable.h @@ -403,8 +403,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; * into virtual address `from' */ -#define pgtable_cache_init() do { } while (0) - #endif /* !__ASSEMBLY__ */ #endif /* _ASMNDS32_PGTABLE_H */ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 95237b7f6fc1..99985d8b7166 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -291,8 +291,6 @@ static inline void pte_clear(struct mm_struct *mm, #include -#define pgtable_cache_init() do { } while (0) - extern void __init paging_init(void); extern void __init mmu_init(void); diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 2fe9ff5b5d6f..248d22d8faa7 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -443,11 +443,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #include -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - typedef pte_t *pte_addr_t; #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 6d58c1739b42..4ac374b3a99f 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -132,8 +132,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) #define PTRS_PER_PTE (1UL << BITS_PER_PTE) /* Definitions for 2nd level */ -#define pgtable_cache_init() do { } while (0) - #define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 8b7865a2d576..4053b2ab427c 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -87,7 +87,6 @@ extern unsigned long ioremap_bot; unsigned long vmalloc_to_phys(void *vmalloc_addr); void pgtable_cache_add(unsigned int shift); -void pgtable_cache_init(void); #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32) void mark_initmem_nx(void); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 80905b27ee98..c60123f018f5 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -424,11 +424,6 @@ extern void *dtb_early_va; extern void setup_bootmem(void); extern void paging_init(void); -static inline void pgtable_cache_init(void) -{ - /* No page table caches to initialize */ -} - #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 8f59454ac407..36c578c0ff96 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1682,11 +1682,6 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -/* - * No page table caches to initialise - */ -static inline void pgtable_cache_init(void) { } - #include #endif /* _S390_PAGE_H */ diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 9085d1142fa3..cbd0f3c55a0c 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -123,11 +123,6 @@ typedef pte_t *pte_addr_t; #define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) -/* - * Initialise the page table caches - */ -extern void pgtable_cache_init(void); - struct vm_area_struct; struct mm_struct; diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c index cc779a90d917..dca946f426c6 100644 --- a/arch/sh/mm/nommu.c +++ b/arch/sh/mm/nommu.c @@ -97,7 +97,3 @@ void __init page_table_range_init(unsigned long start, unsigned long end, void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { } - -void pgtable_cache_init(void) -{ -} diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 4eebed6c6781..31da44826645 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -445,9 +445,4 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, /* We provide our own get_unmapped_area to cope with VA holes for userland */ #define HAVE_ARCH_UNMAPPED_AREA -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* !(_SPARC_PGTABLE_H) */ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1599de730532..b57f9c631eca 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1135,7 +1135,6 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long, unsigned long); #define HAVE_ARCH_FB_UNMAPPED_AREA -void pgtable_cache_init(void); void sun4v_register_fault_status(void); void sun4v_ktsb_register(void); void __init cheetah_ecache_flush_init(void); diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index e4d3ed980d82..36a44d58f373 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -32,8 +32,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* zero page used for uninitialized stuff */ extern unsigned long *empty_zero_page; -#define pgtable_cache_init() do ; while (0) - /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. That means that diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h index 126e961a8cb0..c8f7ba12f309 100644 --- a/arch/unicore32/include/asm/pgtable.h +++ b/arch/unicore32/include/asm/pgtable.h @@ -285,8 +285,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; #include -#define pgtable_cache_init() do { } while (0) - #endif /* !__ASSEMBLY__ */ #endif /* __UNICORE_PGTABLE_H__ */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index b9b9f8aa963e..0dca7f7aeff2 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -29,7 +29,6 @@ extern pgd_t swapper_pg_dir[1024]; extern pgd_t initial_page_table[1024]; extern pmd_t initial_pg_pmd[]; -static inline void pgtable_cache_init(void) { } void paging_init(void); void sync_initial_page_table(void); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index a26d2d58b9c9..0b6c4042942a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -241,8 +241,6 @@ extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define pgtable_cache_init() do { } while (0) - #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 44816ff6411f..463940faf52f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -357,7 +357,7 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, static struct kmem_cache *pgd_cache; -void __init pgd_cache_init(void) +void __init pgtable_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use @@ -402,10 +402,6 @@ static inline void _pgd_free(pgd_t *pgd) } #else -void __init pgd_cache_init(void) -{ -} - static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index ce3ff5e591b9..3f7fe5a8c286 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -238,7 +238,6 @@ extern void paging_init(void); # define swapper_pg_dir NULL static inline void paging_init(void) { } #endif -static inline void pgtable_cache_init(void) { } /* * The pmd contains the kernel virtual address of the pte page. diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 75d9d68a6de7..fae6abb3d586 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1126,7 +1126,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, static inline void init_espfix_bsp(void) { } #endif -extern void __init pgd_cache_init(void); +extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) diff --git a/init/main.c b/init/main.c index 3ca67e8b92fd..99a5f55e0d02 100644 --- a/init/main.c +++ b/init/main.c @@ -507,7 +507,7 @@ void __init __weak mem_encrypt_init(void) { } void __init __weak poking_init(void) { } -void __init __weak pgd_cache_init(void) { } +void __init __weak pgtable_cache_init(void) { } bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); @@ -565,7 +565,6 @@ static void __init mm_init(void) init_espfix_bsp(); /* Should be run after espfix64 is set up. */ pti_init(); - pgd_cache_init(); } void __init __weak arch_call_rest_init(void) From 6aa9b8b2c6355fc3339d6819ec250eadfe79fdb5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2019 15:35:34 -0700 Subject: [PATCH 1015/2880] mm: do not hash address in print_bad_pte() Using %px to show the actual address in print_bad_pte() to help us to debug issue. Link: http://lkml.kernel.org/r/20190831011816.141002-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 9e6ac954c70b..b1ca51a079f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -518,7 +518,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); - pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", vma->vm_file, From 3fccb74cf3a688002c3340329e9e310c7aa8c816 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:37 -0700 Subject: [PATCH 1016/2880] mm/memory_hotplug: remove move_pfn_range() Let's remove this indirection. We need the zone in the caller either way, so let's just detect it there. Add some documentation for move_pfn_range_to_zone() instead. [akpm@linux-foundation.org: restore newline, per David] Link: http://lkml.kernel.org/r/20190724142324.3686-1-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Pavel Tatashin Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5f2c83ce9fde..5b8811945bbb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -714,8 +714,13 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; -} +} +/* + * Associate the pfn range with the given zone, initializing the memmaps + * and resizing the pgdat/zone data to span the added pages. After this + * call, all affected pages are PG_reserved. + */ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { @@ -804,20 +809,6 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -/* - * Associates the given pfn range with the given node and the zone appropriate - * for the given online type. - */ -static struct zone * __meminit move_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages) -{ - struct zone *zone; - - zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); - move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL); - return zone; -} - int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -840,7 +831,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ put_device(&mem->dev); /* associate pfn range with the zone */ - zone = move_pfn_range(online_type, nid, pfn, nr_pages); + zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); arg.start_pfn = pfn; arg.nr_pages = nr_pages; From d84f2f5a755208da3f93e17714631485cb3da11c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:40 -0700 Subject: [PATCH 1017/2880] drivers/base/node.c: simplify unregister_memory_block_under_nodes() We don't allow to offline memory block devices that belong to multiple numa nodes. Therefore, such devices can never get removed. It is sufficient to process a single node when removing the memory block. No need to iterate over each and every PFN. We already have the nid stored for each memory block. Make sure that the nid always has a sane value. Please note that checking for node_online(nid) is not required. If we would have a memory block belonging to a node that is no longer offline, then we would have a BUG in the node offlining code. Link: http://lkml.kernel.org/r/20190719135244.15242-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: Stephen Rothwell Cc: Pavel Tatashin Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 1 + drivers/base/node.c | 39 +++++++++++++++------------------------ 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 20c39d1bcef8..154d5d4a0779 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -674,6 +674,7 @@ static int init_memory_block(struct memory_block **memory, mem->state = state; start_pfn = section_nr_to_pfn(mem->start_section_nr); mem->phys_device = arch_get_memory_phys_device(start_pfn); + mem->nid = NUMA_NO_NODE; ret = register_memory(mem); diff --git a/drivers/base/node.c b/drivers/base/node.c index 75b7e6f6535b..840c95baa1d8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -759,8 +759,6 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, int ret, nid = *(int *)arg; unsigned long pfn, sect_start_pfn, sect_end_pfn; - mem_blk->nid = nid; - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); sect_end_pfn += PAGES_PER_SECTION - 1; @@ -789,6 +787,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, if (page_nid != nid) continue; } + + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. + */ + mem_blk->nid = nid; + ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); @@ -804,32 +809,18 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, } /* - * Unregister memory block device under all nodes that it spans. - * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes). + * Unregister a memory block device under the node it spans. Memory blocks + * with multiple nodes cannot be offlined and therefore also never be removed. */ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { - unsigned long pfn, sect_start_pfn, sect_end_pfn; - static nodemask_t unlinked_nodes; + if (mem_blk->nid == NUMA_NO_NODE) + return; - nodes_clear(unlinked_nodes); - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); - sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); - for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { - int nid; - - nid = get_nid_for_pfn(pfn); - if (nid < 0) - continue; - if (!node_online(nid)) - continue; - if (node_test_and_set(nid, unlinked_nodes)) - continue; - sysfs_remove_link(&node_devices[nid]->dev.kobj, - kobject_name(&mem_blk->dev.kobj)); - sysfs_remove_link(&mem_blk->dev.kobj, - kobject_name(&node_devices[nid]->dev.kobj)); - } + sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj, + kobject_name(&mem_blk->dev.kobj)); + sysfs_remove_link(&mem_blk->dev.kobj, + kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); } int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) From f915fb7fb2c1c57be05880012b46d2edd5124797 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:43 -0700 Subject: [PATCH 1018/2880] drivers/base/memory.c: fixup documentation of removable/phys_index/block_size_bytes Let's rephrase to memory block terminology and add some further clarifications. Link: http://lkml.kernel.org/r/20190806080826.5963-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 154d5d4a0779..63fc5aa51c21 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -116,10 +116,8 @@ static unsigned long get_memory_block_size(void) } /* - * use this as the physical section index that this memsection - * uses. + * Show the first physical section index (number) of this memory block. */ - static ssize_t phys_index_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -131,7 +129,10 @@ static ssize_t phys_index_show(struct device *dev, } /* - * Show whether the section of memory is likely to be hot-removable + * Show whether the memory block is likely to be offlineable (or is already + * offline). Once offline, the memory block could be removed. The return + * value does, however, not indicate that there is a way to remove the + * memory block. */ static ssize_t removable_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -455,7 +456,7 @@ static DEVICE_ATTR_RO(phys_device); static DEVICE_ATTR_RO(removable); /* - * Block size attribute stuff + * Show the memory block size (shared by all memory blocks). */ static ssize_t block_size_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) From 902ce63b337381092ff865f542e854ff3d0ebe2b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:46 -0700 Subject: [PATCH 1019/2880] driver/base/memory.c: validate memory block size early Let's validate the memory block size early, when initializing the memory device infrastructure. Fail hard in case the value is not suitable. As nobody checks the return value of memory_dev_init(), turn it into a void function and fail with a panic in all scenarios instead. Otherwise, we'll crash later during boot when core/drivers expect that the memory device infrastructure (including memory_block_size_bytes()) works as expected. I think long term, we should move the whole memory block size configuration (set_memory_block_size_order() and memory_block_size_bytes()) into drivers/base/memory.c. Link: http://lkml.kernel.org/r/20190806090142.22709-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Pavel Tatashin Cc: Michal Hocko Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 31 +++++++++---------------------- include/linux/memory.h | 6 +++--- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 63fc5aa51c21..763154c82eb3 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -100,21 +100,6 @@ unsigned long __weak memory_block_size_bytes(void) } EXPORT_SYMBOL_GPL(memory_block_size_bytes); -static unsigned long get_memory_block_size(void) -{ - unsigned long block_sz; - - block_sz = memory_block_size_bytes(); - - /* Validate blk_sz is a power of 2 and not less than section size */ - if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { - WARN_ON(1); - block_sz = MIN_MEMORY_BLOCK_SIZE; - } - - return block_sz; -} - /* * Show the first physical section index (number) of this memory block. */ @@ -461,7 +446,7 @@ static DEVICE_ATTR_RO(removable); static ssize_t block_size_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lx\n", get_memory_block_size()); + return sprintf(buf, "%lx\n", memory_block_size_bytes()); } static DEVICE_ATTR_RO(block_size_bytes); @@ -812,19 +797,22 @@ static const struct attribute_group *memory_root_attr_groups[] = { /* * Initialize the sysfs support for memory devices... */ -int __init memory_dev_init(void) +void __init memory_dev_init(void) { int ret; int err; unsigned long block_sz, nr; + /* Validate the configured memory block size */ + block_sz = memory_block_size_bytes(); + if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE) + panic("Memory block size not suitable: 0x%lx\n", block_sz); + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; + ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); if (ret) goto out; - block_sz = get_memory_block_size(); - sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - /* * Create entries for memory sections that were found * during boot and have been initialized @@ -840,8 +828,7 @@ int __init memory_dev_init(void) out: if (ret) - printk(KERN_ERR "%s() failed: %d\n", __func__, ret); - return ret; + panic("%s() failed: %d\n", __func__, ret); } /** diff --git a/include/linux/memory.h b/include/linux/memory.h index 02e633f3ede0..b3d9df060626 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -80,9 +80,9 @@ struct mem_section; #define IPC_CALLBACK_PRI 10 #ifndef CONFIG_MEMORY_HOTPLUG_SPARSE -static inline int memory_dev_init(void) +static inline void memory_dev_init(void) { - return 0; + return; } static inline int register_memory_notifier(struct notifier_block *nb) { @@ -113,7 +113,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size); void remove_memory_block_devices(unsigned long start, unsigned long size); -extern int memory_dev_init(void); +extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern int memory_isolate_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block(struct mem_section *); From b6c88d3b9d38f9448e0fcf44847a075ea81d5ca2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:49 -0700 Subject: [PATCH 1020/2880] drivers/base/memory.c: don't store end_section_nr in memory blocks Each memory block spans the same amount of sections/pages/bytes. The size is determined before the first memory block is created. No need to store what we can easily calculate - and the calculations even look simpler now. Michal brought up the idea of variable-sized memory blocks. However, if we ever implement something like this, we will need an API compatibility switch and reworks at various places (most code assumes a fixed memory block size). So let's cleanup what we have right now. While at it, fix the variable naming in register_mem_sect_under_node() - we no longer talk about a single section. Link: http://lkml.kernel.org/r/20190809110200.2746-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Pavel Tatashin Cc: Michal Hocko Cc: Dan Williams Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 1 - drivers/base/node.c | 10 +++++----- include/linux/memory.h | 1 - mm/memory_hotplug.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 763154c82eb3..6bea4f3f8040 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -656,7 +656,6 @@ static int init_memory_block(struct memory_block **memory, return -ENOMEM; mem->start_section_nr = block_id * sections_per_block; - mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; mem->state = state; start_pfn = section_nr_to_pfn(mem->start_section_nr); mem->phys_device = arch_get_memory_phys_device(start_pfn); diff --git a/drivers/base/node.c b/drivers/base/node.c index 840c95baa1d8..257449cf061f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -756,13 +756,13 @@ static int __ref get_nid_for_pfn(unsigned long pfn) static int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) { + unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE; + unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); + unsigned long end_pfn = start_pfn + memory_block_pfns - 1; int ret, nid = *(int *)arg; - unsigned long pfn, sect_start_pfn, sect_end_pfn; + unsigned long pfn; - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); - sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); - sect_end_pfn += PAGES_PER_SECTION - 1; - for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { + for (pfn = start_pfn; pfn <= end_pfn; pfn++) { int page_nid; /* diff --git a/include/linux/memory.h b/include/linux/memory.h index b3d9df060626..0ebb105eb261 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -25,7 +25,6 @@ struct memory_block { unsigned long start_section_nr; - unsigned long end_section_nr; unsigned long state; /* serialized by the dev->lock */ int section_count; /* serialized by mem_sysfs_mutex */ int online_type; /* for passing data to online routine */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5b8811945bbb..3706a137d880 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1654,7 +1654,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); - endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; + endpa = beginpa + memory_block_size_bytes() - 1; pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); From 33fce0113da2f8a5c1bbce0c46a8b131500f1677 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 23 Sep 2019 15:35:52 -0700 Subject: [PATCH 1021/2880] mm/memory_hotplug.c: prevent memory leak when reusing pgdat When offlining a node in try_offline_node(), pgdat is not released. So that pgdat could be reused in hotadd_new_pgdat(). While we reallocate pgdat->per_cpu_nodestats if this pgdat is reused. This patch prevents the memory leak by just allocating per_cpu_nodestats when it is a new pgdat. Link: http://lkml.kernel.org/r/20190813020608.10194-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3706a137d880..c28e5dd017ba 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -925,8 +925,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) if (!pgdat) return NULL; + pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); arch_refresh_nodedata(nid, pgdat); } else { + int cpu; /* * Reset the nr_zones, order and classzone_idx before reuse. * Note that kswapd will init kswapd_classzone_idx properly @@ -935,6 +938,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) pgdat->nr_zones = 0; pgdat->kswapd_order = 0; pgdat->kswapd_classzone_idx = 0; + for_each_online_cpu(cpu) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + memset(p, 0, sizeof(*p)); + } } /* we can use NODE_DATA(nid) from here */ @@ -944,7 +953,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); - pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* * The node we allocated has no zone fallback lists. For avoiding From 00ff9a91bdb74933648a5b346d9f0edb99bd76d3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:55 -0700 Subject: [PATCH 1022/2880] mm/memory_hotplug.c: use PFN_UP / PFN_DOWN in walk_system_ram_range() Patch series "mm/memory_hotplug: online_pages() cleanups", v2. Some cleanups (+ one fix for a special case) in the context of online_pages(). This patch (of 5): This makes it clearer that we will never call func() with duplicate PFNs in case we have multiple sub-page memory resources. All unaligned parts of PFNs are completely discarded. Link: http://lkml.kernel.org/r/20190814154109.3448-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Wei Yang Cc: Dan Williams Cc: Borislav Petkov Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: Dave Hansen Cc: Nadav Amit Cc: Oscar Salvador Cc: Arun KS Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 74877e9d90ca..76036a41143b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, false, &res)) { - pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; - end_pfn = (res.end + 1) >> PAGE_SHIFT; + pfn = PFN_UP(res.start); + end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) From 5ecae6359e3a37624cd34d02e4b3401cf98bb62f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:59 -0700 Subject: [PATCH 1023/2880] mm/memory_hotplug: drop PageReserved() check in online_pages_range() move_pfn_range_to_zone() will set all pages to PG_reserved via memmap_init_zone(). The only way a page could no longer be reserved would be if a MEM_GOING_ONLINE notifier would clear PG_reserved - which is not done (the online_page callback is used for that purpose by e.g., Hyper-V instead). walk_system_ram_range() will never call online_pages_range() with duplicate PFNs, so drop the PageReserved() check. This seems to be a leftover from ancient times where the memmap was initialized when adding memory and we wanted to check for already onlined memory. Link: http://lkml.kernel.org/r/20190814154109.3448-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Cc: Arun KS Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Nadav Amit Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c28e5dd017ba..c38050785d12 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -653,9 +653,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, { unsigned long onlined_pages = *(unsigned long *)arg; - if (PageReserved(pfn_to_page(start_pfn))) - onlined_pages += online_pages_blocks(start_pfn, nr_pages); - + onlined_pages += online_pages_blocks(start_pfn, nr_pages); online_mem_sections(start_pfn, start_pfn + nr_pages); *(unsigned long *)arg = onlined_pages; From b2c2ab208e4fa12b0ae5692d14565006899a11fd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:36:02 -0700 Subject: [PATCH 1024/2880] mm/memory_hotplug: simplify online_pages_range() online_pages always corresponds to nr_pages. Simplify the code, getting rid of online_pages_blocks(). Add some comments. Link: http://lkml.kernel.org/r/20190814154109.3448-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Cc: Arun KS Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Nadav Amit Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c38050785d12..b5ad646df86b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -632,31 +632,27 @@ static void generic_online_page(struct page *page, unsigned int order) #endif } -static int online_pages_blocks(unsigned long start, unsigned long nr_pages) -{ - unsigned long end = start + nr_pages; - int order, onlined_pages = 0; - - while (start < end) { - order = min(MAX_ORDER - 1, - get_order(PFN_PHYS(end) - PFN_PHYS(start))); - (*online_page_callback)(pfn_to_page(start), order); - - onlined_pages += (1UL << order); - start += (1UL << order); - } - return onlined_pages; -} - static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { - unsigned long onlined_pages = *(unsigned long *)arg; + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + int order; - onlined_pages += online_pages_blocks(start_pfn, nr_pages); - online_mem_sections(start_pfn, start_pfn + nr_pages); + /* + * Online the pages. The callback might decide to keep some pages + * PG_reserved (to add them to the buddy later), but we still account + * them as being online/belonging to this zone ("present"). + */ + for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) { + order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn))); + (*online_page_callback)(pfn_to_page(pfn), order); + } - *(unsigned long *)arg = onlined_pages; + /* mark all involved sections as online */ + online_mem_sections(start_pfn, end_pfn); + + *(unsigned long *)arg += nr_pages; return 0; } From bd02cc01d342b43618c86e25552150f7a7e09080 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:36:05 -0700 Subject: [PATCH 1025/2880] mm/memory_hotplug: make sure the pfn is aligned to the order when onlining Commit a9cd410a3d29 ("mm/page_alloc.c: memory hotplug: free pages as higher order") assumed that any PFN we get via memory resources is aligned to to MAX_ORDER - 1, I am not convinced that is always true. Let's play safe, check the alignment and fallback to single pages. akpm: warn in this situation so we get to find out if and why this ever occurs. [akpm@linux-foundation.org: add WARN_ON_ONCE()] Link: http://lkml.kernel.org/r/20190814154109.3448-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Arun KS Cc: Oscar Salvador Cc: Michal Hocko Cc: Pavel Tatashin Cc: Dan Williams Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Nadav Amit Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b5ad646df86b..aa54e15ea830 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -646,6 +646,9 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, */ for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) { order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn))); + /* __free_pages_core() wants pfns to be aligned to the order */ + if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order))) + order = 0; (*online_page_callback)(pfn_to_page(pfn), order); } From ca9a46f8a4f00c83dcc6c787ae659d117433cbd2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:36:08 -0700 Subject: [PATCH 1026/2880] mm/memory_hotplug: online_pages cannot be 0 in online_pages() walk_system_ram_range() will fail with -EINVAL in case online_pages_range() was never called (== no resource applicable in the range). Otherwise, we will always call online_pages_range() with nr_pages > 0 and, therefore, have online_pages > 0. Remove that special handling. Link: http://lkml.kernel.org/r/20190814154109.3448-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Michal Hocko Cc: Pavel Tatashin Cc: Dan Williams Cc: Arun KS Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Nadav Amit Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa54e15ea830..49f7bf91c25a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -853,6 +853,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + /* not a single memory resource was applicable */ if (need_zonelists_rebuild) zone_pcp_reset(zone); goto failed_addition; @@ -866,27 +867,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ shuffle_zone(zone); - if (onlined_pages) { - node_states_set_node(nid, &arg); - if (need_zonelists_rebuild) - build_all_zonelists(NULL); - else - zone_pcp_update(zone); - } + node_states_set_node(nid, &arg); + if (need_zonelists_rebuild) + build_all_zonelists(NULL); + else + zone_pcp_update(zone); init_per_zone_wmark_min(); - if (onlined_pages) { - kswapd_run(nid); - kcompactd_run(nid); - } + kswapd_run(nid); + kcompactd_run(nid); vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); - if (onlined_pages) - memory_notify(MEM_ONLINE, &arg); + memory_notify(MEM_ONLINE, &arg); mem_hotplug_done(); return 0; From 29a90db9299575a4bba82158f9d4e81405c54646 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 23 Sep 2019 15:36:18 -0700 Subject: [PATCH 1027/2880] mm/memory_hotplug.c: s/is/if Correct typo in comment. Link: http://lkml.kernel.org/r/1568233954-3913-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 49f7bf91c25a..b1be791f772d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1793,7 +1793,7 @@ void __remove_memory(int nid, u64 start, u64 size) { /* - * trigger BUG() is some memory is not offlined prior to calling this + * trigger BUG() if some memory is not offlined prior to calling this * function */ if (try_remove_memory(nid, start, size)) From ae83189405ea5c693683327fa69ac95a23ec59be Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Mon, 23 Sep 2019 15:36:21 -0700 Subject: [PATCH 1028/2880] mm/sparse.c: fix memory leak of sparsemap_buf in aligned memory sparse_buffer_alloc(xsize) gets the size of memory from sparsemap_buf after being aligned with the size. However, the size is at least PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION) and usually larger than PAGE_SIZE. Also, sparse_buffer_fini() only frees memory between sparsemap_buf and sparsemap_buf_end, since sparsemap_buf may be changed by PTR_ALIGN() first, the aligned space before sparsemap_buf is wasted and no one will touch it. In our ARM32 platform (without SPARSEMEM_VMEMMAP) Sparse_buffer_init Reserve d359c000 - d3e9c000 (9M) Sparse_buffer_alloc Alloc d3a00000 - d3E80000 (4.5M) Sparse_buffer_fini Free d3e80000 - d3e9c000 (~=100k) The reserved memory between d359c000 - d3a00000 (~=4.4M) is unfreed. In ARM64 platform (with SPARSEMEM_VMEMMAP) sparse_buffer_init Reserve ffffffc07d623000 - ffffffc07f623000 (32M) Sparse_buffer_alloc Alloc ffffffc07d800000 - ffffffc07f600000 (30M) Sparse_buffer_fini Free ffffffc07f600000 - ffffffc07f623000 (140K) The reserved memory between ffffffc07d623000 - ffffffc07d800000 (~=1.9M) is unfreed. Let's explicit free redundant aligned memory. [arnd@arndb.de: mark sparse_buffer_free as __meminit] Link: http://lkml.kernel.org/r/20190709185528.3251709-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20190705114730.28534-1-lecopzer.chen@mediatek.com Signed-off-by: Lecopzer Chen Signed-off-by: Mark-PK Tsai Signed-off-by: Arnd Bergmann Cc: YJ Chiang Cc: Lecopzer Chen Cc: Pavel Tatashin Cc: Oscar Salvador Cc: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 72f010d9bff5..2bfd078301f8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -470,6 +470,12 @@ struct page __init *__populate_section_memmap(unsigned long pfn, static void *sparsemap_buf __meminitdata; static void *sparsemap_buf_end __meminitdata; +static inline void __meminit sparse_buffer_free(unsigned long size) +{ + WARN_ON(!sparsemap_buf || size == 0); + memblock_free_early(__pa(sparsemap_buf), size); +} + static void __init sparse_buffer_init(unsigned long size, int nid) { phys_addr_t addr = __pa(MAX_DMA_ADDRESS); @@ -486,7 +492,7 @@ static void __init sparse_buffer_fini(void) unsigned long size = sparsemap_buf_end - sparsemap_buf; if (sparsemap_buf && size > 0) - memblock_free_early(__pa(sparsemap_buf), size); + sparse_buffer_free(size); sparsemap_buf = NULL; } @@ -498,8 +504,12 @@ void * __meminit sparse_buffer_alloc(unsigned long size) ptr = PTR_ALIGN(sparsemap_buf, size); if (ptr + size > sparsemap_buf_end) ptr = NULL; - else + else { + /* Free redundant aligned space */ + if ((unsigned long)(ptr - sparsemap_buf) > 0) + sparse_buffer_free((unsigned long)(ptr - sparsemap_buf)); sparsemap_buf = ptr + size; + } } return ptr; } From db57e98d87908b8837352abe08515e42752270c1 Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Mon, 23 Sep 2019 15:36:24 -0700 Subject: [PATCH 1029/2880] mm/sparse.c: fix ALIGN() without power of 2 in sparse_buffer_alloc() The size argument passed into sparse_buffer_alloc() has already been aligned with PAGE_SIZE or PMD_SIZE. If the size after aligned is not power of 2 (e.g. 0x480000), the PTR_ALIGN() will return wrong value. Use roundup to round sparsemap_buf up to next multiple of size. Link: http://lkml.kernel.org/r/20190705114826.28586-1-lecopzer.chen@mediatek.com Signed-off-by: Lecopzer Chen Signed-off-by: Mark-PK Tsai Cc: YJ Chiang Cc: Lecopzer Chen Cc: Pavel Tatashin Cc: Oscar Salvador Cc: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 2bfd078301f8..79355a86064f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -501,7 +501,7 @@ void * __meminit sparse_buffer_alloc(unsigned long size) void *ptr = NULL; if (sparsemap_buf) { - ptr = PTR_ALIGN(sparsemap_buf, size); + ptr = (void *) roundup((unsigned long)sparsemap_buf, size); if (ptr + size > sparsemap_buf_end) ptr = NULL; else { From c1cbc3eebf7a9ae46473a66710c0859aaafc1607 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 23 Sep 2019 15:36:27 -0700 Subject: [PATCH 1030/2880] mm/sparse.c: use __nr_to_section(section_nr) to get mem_section __pfn_to_section is defined as __nr_to_section(pfn_to_section_nr(pfn)). Since we already get section_nr, it is not necessary to get mem_section from start_pfn. By doing so, we reduce one redundant operation. Link: http://lkml.kernel.org/r/20190809010242.29797-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Anshuman Khandual Tested-by: Anshuman Khandual Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Michal Hocko Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 79355a86064f..a1679024ab5c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -877,7 +877,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, */ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); - ms = __pfn_to_section(start_pfn); + ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); section_mark_present(ms); From 9f82883c6d9af516c2a7f9fe85eb09e9c25bbe0a Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 23 Sep 2019 15:36:30 -0700 Subject: [PATCH 1031/2880] mm/sparse.c: don't manually decrement num_poisoned_pages Use the function written to do it instead. Link: http://lkml.kernel.org/r/20190827053656.32191-2-alastair@au1.ibm.com Signed-off-by: Alastair D'Silva Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index a1679024ab5c..d7af5cfdc810 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "internal.h" #include @@ -908,7 +910,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { - atomic_long_sub(1, &num_poisoned_pages); + num_poisoned_pages_dec(); ClearPageHWPoison(&memmap[i]); } } From 5ed867037eb1f15b7e8cc92497671fd4b3864e4a Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 23 Sep 2019 15:36:33 -0700 Subject: [PATCH 1032/2880] mm/sparse.c: remove NULL check in clear_hwpoisoned_pages() There is no possibility for memmap to be NULL in the current codebase. This check was added in commit 95a4774d055c ("memory-hotplug: update mce_bad_pages when removing the memory") where memmap was originally inited to NULL, and only conditionally given a value. The code that could have passed a NULL has been removed by commit ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug"), so there is no longer a possibility that memmap can be NULL. Link: http://lkml.kernel.org/r/20190829035151.20975-1-alastair@d-silva.org Signed-off-by: Alastair D'Silva Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Mike Rapoport Cc: Wei Yang Cc: Qian Cai Cc: Alexander Duyck Cc: Logan Gunthorpe Cc: Baoquan He Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index d7af5cfdc810..bf32de9e666b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -896,9 +896,6 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { int i; - if (!memmap) - return; - /* * A further optimization is to have per section refcounted * num_poisoned_pages. But that would need more space per memmap, so From dd3b8353bae79395b12a178de057b183ff0122eb Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 23 Sep 2019 15:36:36 -0700 Subject: [PATCH 1033/2880] mm/vmalloc: do not keep unpurged areas in the busy tree The busy tree can be quite big, even though the area is freed or unmapped it still stays there until "purge" logic removes it. 1) Optimize and reduce the size of "busy" tree by removing a node from it right away as soon as user triggers free paths. It is possible to do so, because the allocation is done using another augmented tree. The vmalloc test driver shows the difference, for example the "fix_size_alloc_test" is ~11% better comparing with default configuration: sudo ./test_vmalloc.sh performance Summary: fix_size_alloc_test loops: 1000000 avg: 993985 usec Summary: full_fit_alloc_test loops: 1000000 avg: 973554 usec Summary: long_busy_list_alloc_test loops: 1000000 avg: 12617652 usec Summary: fix_size_alloc_test loops: 1000000 avg: 882263 usec Summary: full_fit_alloc_test loops: 1000000 avg: 973407 usec Summary: long_busy_list_alloc_test loops: 1000000 avg: 12593929 usec 2) Since the busy tree now contains allocated areas only and does not interfere with lazily free nodes, introduce the new function show_purge_info() that dumps "unpurged" areas that is propagated through "/proc/vmallocinfo". 3) Eliminate VM_LAZY_FREE flag. Link: http://lkml.kernel.org/r/20190716152656.12255-2-lpf.vector@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Pengfei Li Cc: Roman Gushchin Cc: Uladzislau Rezki Cc: Hillf Danton Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c1246d77cf75..d535ef125bda 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -329,7 +329,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 -#define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); @@ -1282,7 +1281,14 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) llist_for_each_entry_safe(va, n_va, valist, purge_list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; - __free_vmap_area(va); + /* + * Finally insert or merge lazily-freed area. It is + * detached and there is no need to "unlink" it from + * anything. + */ + merge_or_add_vmap_area(va, + &free_vmap_area_root, &free_vmap_area_list); + atomic_long_sub(nr, &vmap_lazy_nr); if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) @@ -1324,6 +1330,10 @@ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy; + spin_lock(&vmap_area_lock); + unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -2143,14 +2153,13 @@ struct vm_struct *remove_vm_area(const void *addr) might_sleep(); - va = find_vmap_area((unsigned long)addr); + spin_lock(&vmap_area_lock); + va = __find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; - spin_lock(&vmap_area_lock); va->vm = NULL; va->flags &= ~VM_VM_AREA; - va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); kasan_free_shadow(vm); @@ -2158,6 +2167,8 @@ struct vm_struct *remove_vm_area(const void *addr) return vm; } + + spin_unlock(&vmap_area_lock); return NULL; } @@ -3450,6 +3461,22 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) } } +static void show_purge_info(struct seq_file *m) +{ + struct llist_node *head; + struct vmap_area *va; + + head = READ_ONCE(vmap_purge_list.first); + if (head == NULL) + return; + + llist_for_each_entry(va, head, purge_list) { + seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); + } +} + static int s_show(struct seq_file *m, void *p) { struct vmap_area *va; @@ -3462,10 +3489,9 @@ static int s_show(struct seq_file *m, void *p) * behalf of vmap area is being tear down or vm_map_ram allocation. */ if (!(va->flags & VM_VM_AREA)) { - seq_printf(m, "0x%pK-0x%pK %7ld %s\n", + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start, - va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); + va->va_end - va->va_start); return 0; } @@ -3504,6 +3530,16 @@ static int s_show(struct seq_file *m, void *p) show_numa_info(m, v); seq_putc(m, '\n'); + + /* + * As a final step, dump "unpurged" areas. Note, + * that entire "/proc/vmallocinfo" output will not + * be address sorted, because the purge list is not + * sorted. + */ + if (list_is_last(&va->list, &vmap_area_list)) + show_purge_info(m); + return 0; } From 688fcbfc06e4fdfbb7e1d5a942a1460fe6379d2d Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Mon, 23 Sep 2019 15:36:39 -0700 Subject: [PATCH 1034/2880] mm/vmalloc: modify struct vmap_area to reduce its size Objective --------- The current implementation of struct vmap_area wasted space. After applying this commit, sizeof(struct vmap_area) has been reduced from 11 words to 8 words. Description ----------- 1) Pack "subtree_max_size", "vm" and "purge_list". This is no problem because A) "subtree_max_size" is only used when vmap_area is in "free" tree B) "vm" is only used when vmap_area is in "busy" tree C) "purge_list" is only used when vmap_area is in vmap_purge_list 2) Eliminate "flags". ;Since only one flag VM_VM_AREA is being used, and the same thing can be done by judging whether "vm" is NULL, then the "flags" can be eliminated. Link: http://lkml.kernel.org/r/20190716152656.12255-3-lpf.vector@gmail.com Signed-off-by: Pengfei Li Suggested-by: Uladzislau Rezki (Sony) Reviewed-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Roman Gushchin Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 20 +++++++++++++------- mm/vmalloc.c | 24 ++++++++++-------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index dfa718ffdd4f..4e7809408073 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -53,15 +53,21 @@ struct vmap_area { unsigned long va_start; unsigned long va_end; - /* - * Largest available free size in subtree. - */ - unsigned long subtree_max_size; - unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ - struct llist_node purge_list; /* "lazy purge" list */ - struct vm_struct *vm; + + /* + * The following three variables can be packed, because + * a vmap_area object is always one of the three states: + * 1) in "free" tree (root is vmap_area_root) + * 2) in "busy" tree (root is free_vmap_area_root) + * 3) in purge list (head is vmap_purge_list) + */ + union { + unsigned long subtree_max_size; /* in "free" tree */ + struct vm_struct *vm; /* in "busy" tree */ + struct llist_node purge_list; /* in purge list */ + }; }; /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d535ef125bda..f095843fc243 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -329,7 +329,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 -#define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ @@ -1115,7 +1114,7 @@ retry: va->va_start = addr; va->va_end = addr + size; - va->flags = 0; + va->vm = NULL; insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); @@ -1928,7 +1927,6 @@ void __init vmalloc_init(void) if (WARN_ON_ONCE(!va)) continue; - va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; @@ -2026,7 +2024,6 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; - va->flags |= VM_VM_AREA; spin_unlock(&vmap_area_lock); } @@ -2131,10 +2128,10 @@ struct vm_struct *find_vm_area(const void *addr) struct vmap_area *va; va = find_vmap_area((unsigned long)addr); - if (va && va->flags & VM_VM_AREA) - return va->vm; + if (!va) + return NULL; - return NULL; + return va->vm; } /** @@ -2155,11 +2152,10 @@ struct vm_struct *remove_vm_area(const void *addr) spin_lock(&vmap_area_lock); va = __find_vmap_area((unsigned long)addr); - if (va && va->flags & VM_VM_AREA) { + if (va && va->vm) { struct vm_struct *vm = va->vm; va->vm = NULL; - va->flags &= ~VM_VM_AREA; spin_unlock(&vmap_area_lock); kasan_free_shadow(vm); @@ -2862,7 +2858,7 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!(va->flags & VM_VM_AREA)) + if (!va->vm) continue; vm = va->vm; @@ -2942,7 +2938,7 @@ long vwrite(char *buf, char *addr, unsigned long count) if (!count) break; - if (!(va->flags & VM_VM_AREA)) + if (!va->vm) continue; vm = va->vm; @@ -3485,10 +3481,10 @@ static int s_show(struct seq_file *m, void *p) va = list_entry(p, struct vmap_area, list); /* - * s_show can encounter race with remove_vm_area, !VM_VM_AREA on - * behalf of vmap area is being tear down or vm_map_ram allocation. + * s_show can encounter race with remove_vm_area, !vm on behalf + * of vmap area is being tear down or vm_map_ram allocation. */ - if (!(va->flags & VM_VM_AREA)) { + if (!va->vm) { seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); From 7ea362427c170061b8822dd41bafaa72b3bcb9ad Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Mon, 23 Sep 2019 15:36:42 -0700 Subject: [PATCH 1035/2880] mm/vmalloc.c: move 'area->pages' after if statement If !area->pages statement is true where memory allocation fails, area is freed. In this case 'area->pages = pages' should not executed. So move 'area->pages = pages' after if statement. [akpm@linux-foundation.org: give area->pages the same treatment] Link: http://lkml.kernel.org/r/20190830035716.GA190684@LGEARND20B15 Signed-off-by: Austin Kim Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Uladzislau Rezki (Sony) Cc: Roman Gushchin Cc: Roman Penyaev Cc: Rick Edgecombe Cc: Mike Rapoport Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f095843fc243..fcadd3e25c0c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2409,7 +2409,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); - area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, @@ -2417,13 +2416,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } else { pages = kmalloc_node(array_size, nested_gfp, node); } - area->pages = pages; - if (!area->pages) { + + if (!pages) { remove_vm_area(area->addr); kfree(area); return NULL; } + area->pages = pages; + area->nr_pages = nr_pages; + for (i = 0; i < area->nr_pages; i++) { struct page *page; From 2286bf4e4d8f2fda407222b54f685dff3b80e0a0 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:36:45 -0700 Subject: [PATCH 1036/2880] mm: use CPU_BITS_NONE to initialize init_mm.cpu_bitmask Replace open-coded bitmap array initialization of init_mm.cpu_bitmask with neat CPU_BITS_NONE macro. And, since init_mm.cpu_bitmask is statically set to zero, there is no way to clear it again in start_kernel(). Link: http://lkml.kernel.org/r/1565703815-8584-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 1 - mm/init-mm.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 99a5f55e0d02..208b8fa1808e 100644 --- a/init/main.c +++ b/init/main.c @@ -594,7 +594,6 @@ asmlinkage __visible void __init start_kernel(void) page_address_init(); pr_notice("%s", linux_banner); setup_arch(&command_line); - mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); diff --git a/mm/init-mm.c b/mm/init-mm.c index a787a319211e..fb1e15028ef0 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -35,6 +35,6 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, - .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, + .cpu_bitmap = CPU_BITS_NONE, INIT_MM_CONTEXT(init_mm) }; From b57a775f5130dd83ba0e7f46cd86741bf19b71c5 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:36:48 -0700 Subject: [PATCH 1037/2880] mm: silence -Woverride-init/initializer-overrides When compiling a kernel with W=1, there are several of those warnings due to arm64 overriding a field on purpose. Just disable those warnings for both GCC and Clang of this file, so it will help dig "gems" hidden in the W=1 warnings by reducing some noises. mm/init-mm.c:39:2: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides] INIT_MM_CONTEXT(init_mm) ^~~~~~~~~~~~~~~~~~~~~~~~ ./arch/arm64/include/asm/mmu.h:133:9: note: expanded from macro 'INIT_MM_CONTEXT' .pgd = init_pg_dir, ^~~~~~~~~~~ mm/init-mm.c:30:10: note: previous initialization is here .pgd = swapper_pg_dir, ^~~~~~~~~~~~~~ Note: there is a side project trying to support explicitly allowing specific initializer overrides in Clang, but there is no guarantee it will happen or not. https://github.com/ClangBuiltLinux/linux/issues/639 Link: http://lkml.kernel.org/r/1566920867-27453-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Cc: Nick Desaulniers Cc: Masahiro Yamada Cc: Mark Rutland Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/Makefile b/mm/Makefile index d11de59e9c3c..d996846697ef 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n +CFLAGS_init-mm.o += $(call cc-disable-warning, override-init) +CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides) + mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ From 63398413c00c7836ea87a1fa205c91d2199b25cf Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 23 Sep 2019 15:36:51 -0700 Subject: [PATCH 1038/2880] z3fold: fix memory leak in kmem cache Currently there is a leak in init_z3fold_page() -- it allocates handles from kmem cache even for headless pages, but then they are never used and never freed, so eventually kmem cache may get exhausted. This patch provides a fix for that. Link: http://lkml.kernel.org/r/20190917185352.44cf285d3ebd9e64548de5de@gmail.com Signed-off-by: Vitaly Wool Reported-by: Markus Linnala Tested-by: Markus Linnala Cc: Dan Streetman Cc: Henry Burns Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index d637d9ee005c..05bdf90646e7 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -295,14 +295,11 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool) } /* Initializes the z3fold header of a newly allocated z3fold page */ -static struct z3fold_header *init_z3fold_page(struct page *page, +static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, struct z3fold_pool *pool, gfp_t gfp) { struct z3fold_header *zhdr = page_address(page); - struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp); - - if (!slots) - return NULL; + struct z3fold_buddy_slots *slots; INIT_LIST_HEAD(&page->lru); clear_bit(PAGE_HEADLESS, &page->private); @@ -310,6 +307,12 @@ static struct z3fold_header *init_z3fold_page(struct page *page, clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); clear_bit(PAGE_CLAIMED, &page->private); + if (headless) + return zhdr; + + slots = alloc_slots(pool, gfp); + if (!slots) + return NULL; spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -930,7 +933,7 @@ retry: if (!page) return -ENOMEM; - zhdr = init_z3fold_page(page, pool, gfp); + zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp); if (!zhdr) { __free_page(page); return -ENOMEM; From a94b525241c0fff3598809131d7cfcfe1d572d8c Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 23 Sep 2019 15:36:54 -0700 Subject: [PATCH 1039/2880] mm/compaction.c: clear total_{migrate,free}_scanned before scanning a new zone total_{migrate,free}_scanned will be added to COMPACTMIGRATE_SCANNED and COMPACTFREE_SCANNED in compact_zone(). We should clear them before scanning a new zone. In the proc triggered compaction, we forgot clearing them. [laoar.shao@gmail.com: introduce a helper compact_zone_counters_init()] Link: http://lkml.kernel.org/r/1563869295-25748-1-git-send-email-laoar.shao@gmail.com [akpm@linux-foundation.org: expand compact_zone_counters_init() into its single callsite, per mhocko] [vbabka@suse.cz: squash compact_zone() list_head init as well] Link: http://lkml.kernel.org/r/1fb6f7da-f776-9e42-22f8-bbb79b030b98@suse.cz [akpm@linux-foundation.org: kcompactd_do_work(): avoid unnecessary initialization of cc.zone] Link: http://lkml.kernel.org/r/1563789275-9639-1-git-send-email-laoar.shao@gmail.com Fixes: 7f354a548d1c ("mm, compaction: add vmstats for kcompactd work") Signed-off-by: Yafang Shao Signed-off-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Yafang Shao Cc: Mel Gorman Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 777c088e9113..d99d59412c75 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2078,6 +2078,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; + /* + * These counters track activities during zone compaction. Initialize + * them before compacting a new zone. + */ + cc->total_migrate_scanned = 0; + cc->total_free_scanned = 0; + cc->nr_migratepages = 0; + cc->nr_freepages = 0; + INIT_LIST_HEAD(&cc->freepages); + INIT_LIST_HEAD(&cc->migratepages); + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, cc->classzone_idx); @@ -2281,10 +2292,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, { enum compact_result ret; struct compact_control cc = { - .nr_freepages = 0, - .nr_migratepages = 0, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .order = order, .search_order = order, .gfp_mask = gfp_mask, @@ -2305,8 +2312,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, if (capture) current->capture_control = &capc; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); ret = compact_zone(&cc, &capc); @@ -2408,8 +2413,6 @@ static void compact_node(int nid) struct zone *zone; struct compact_control cc = { .order = -1, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, @@ -2423,11 +2426,7 @@ static void compact_node(int nid) if (!populated_zone(zone)) continue; - cc.nr_freepages = 0; - cc.nr_migratepages = 0; cc.zone = zone; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); compact_zone(&cc, NULL); @@ -2529,8 +2528,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct compact_control cc = { .order = pgdat->kcompactd_max_order, .search_order = pgdat->kcompactd_max_order, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, @@ -2554,16 +2551,10 @@ static void kcompactd_do_work(pg_data_t *pgdat) COMPACT_CONTINUE) continue; - cc.nr_freepages = 0; - cc.nr_migratepages = 0; - cc.total_migrate_scanned = 0; - cc.total_free_scanned = 0; - cc.zone = zone; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); - if (kthread_should_stop()) return; + + cc.zone = zone; status = compact_zone(&cc, NULL); if (status == COMPACT_SUCCESS) { From 32aaf0553df99cc4314f6e9f43216cd83afc6c20 Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Mon, 23 Sep 2019 15:36:58 -0700 Subject: [PATCH 1040/2880] mm/compaction.c: remove unnecessary zone parameter in isolate_migratepages() Like commit 40cacbcb3240 ("mm, compaction: remove unnecessary zone parameter in some instances"), remove unnecessary zone parameter. No functional change. Link: http://lkml.kernel.org/r/20190806151616.21107-1-lpf.vector@gmail.com Signed-off-by: Pengfei Li Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Qian Cai Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index d99d59412c75..ce08b39d85d4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1737,8 +1737,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) * starting at the block pointed to by the migrate scanner pfn within * compact_control. */ -static isolate_migrate_t isolate_migratepages(struct zone *zone, - struct compact_control *cc) +static isolate_migrate_t isolate_migratepages(struct compact_control *cc) { unsigned long block_start_pfn; unsigned long block_end_pfn; @@ -1756,8 +1755,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, */ low_pfn = fast_find_migrateblock(cc); block_start_pfn = pageblock_start_pfn(low_pfn); - if (block_start_pfn < zone->zone_start_pfn) - block_start_pfn = zone->zone_start_pfn; + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; /* * fast_find_migrateblock marks a pageblock skipped so to avoid @@ -1787,8 +1786,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) cond_resched(); - page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, - zone); + page = pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone); if (!page) continue; @@ -2169,7 +2168,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->rescan = true; } - switch (isolate_migratepages(cc->zone, cc)) { + switch (isolate_migratepages(cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); From 4406548ee39c2268876b61a927acad45da6f9aef Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2019 15:37:01 -0700 Subject: [PATCH 1041/2880] mm/mempolicy.c: remove unnecessary nodemask check in kernel_migrate_pages() 1) task_nodes = cpuset_mems_allowed(current); -> cpuset_mems_allowed() guaranteed to return some non-empty subset of node_states[N_MEMORY]. 2) nodes_and(*new, *new, task_nodes); -> after nodes_and(), the 'new' should be empty or appropriate nodemask(online node and with memory). After 1) and 2), we could remove unnecessary check whether the 'new' AND node_states[N_MEMORY] is empty. Link: http://lkml.kernel.org/r/20190806023634.55356-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Williams Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f000771558d8..464406e8da91 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1512,10 +1512,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, if (nodes_empty(*new)) goto out_put; - nodes_and(*new, *new, node_states[N_MEMORY]); - if (nodes_empty(*new)) - goto out_put; - err = security_task_movememory(task); if (err) goto out_put; From 8ac3f8fe91a2119522a73fbc41d354057054e6ed Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Mon, 23 Sep 2019 15:37:04 -0700 Subject: [PATCH 1042/2880] mm/oom_kill.c: add task UID to info message on an oom kill In the event of an oom kill, useful information about the killed process is printed to dmesg. Users, especially system administrators, will find it useful to immediately see the UID of the process. We already print uid when dumping eligible tasks so it is not overly hard to find that information in the oom report. However this information is unavailable when dumping of eligible tasks is disabled. In the following example, abuse_the_ram is the name of a program that attempts to iteratively allocate all available memory until it is stopped by force. Current message: Out of memory: Killed process 35389 (abuse_the_ram) total-vm:133718232kB, anon-rss:129624980kB, file-rss:0kB, shmem-rss:0kB Patched message: Out of memory: Killed process 2739 (abuse_the_ram), total-vm:133880028kB, anon-rss:129754836kB, file-rss:0kB, shmem-rss:0kB, UID:0 [akpm@linux-foundation.org: s/UID %d/UID:%u/ in printk] Link: http://lkml.kernel.org/r/1560362273-534-1-git-send-email-jsavitz@redhat.com Signed-off-by: Joel Savitz Suggested-by: David Rientjes Acked-by: Rafael Aquini Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eda2e2a0bdc6..95872bdfec4e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -884,12 +884,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u\n", message, task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), K(get_mm_counter(victim->mm, MM_FILEPAGES)), - K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); + K(get_mm_counter(victim->mm, MM_SHMEMPAGES)), + from_kuid(&init_user_ns, task_uid(victim))); task_unlock(victim); /* From f9c645621a28e37813a1de96d9cbd89cde94a1e4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 23 Sep 2019 15:37:08 -0700 Subject: [PATCH 1043/2880] memcg, oom: don't require __GFP_FS when invoking memcg OOM killer Masoud Sharbiani noticed that commit 29ef680ae7c21110 ("memcg, oom: move out_of_memory back to the charge path") broke memcg OOM called from __xfs_filemap_fault() path. It turned out that try_charge() is retrying forever without making forward progress because mem_cgroup_oom(GFP_NOFS) cannot invoke the OOM killer due to commit 3da88fb3bacfaa33 ("mm, oom: move GFP_NOFS check to out_of_memory"). Allowing forced charge due to being unable to invoke memcg OOM killer will lead to global OOM situation. Also, just returning -ENOMEM will be risky because OOM path is lost and some paths (e.g. get_user_pages()) will leak -ENOMEM. Therefore, invoking memcg OOM killer (despite GFP_NOFS) will be the only choice we can choose for now. Until 29ef680ae7c21110, we were able to invoke memcg OOM killer when GFP_KERNEL reclaim failed [1]. But since 29ef680ae7c21110, we need to invoke memcg OOM killer when GFP_NOFS reclaim failed [2]. Although in the past we did invoke memcg OOM killer for GFP_NOFS [3], we might get pre-mature memcg OOM reports due to this patch. [1] leaker invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0 CPU: 0 PID: 2746 Comm: leaker Not tainted 4.18.0+ #19 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 Call Trace: dump_stack+0x63/0x88 dump_header+0x67/0x27a ? mem_cgroup_scan_tasks+0x91/0xf0 oom_kill_process+0x210/0x410 out_of_memory+0x10a/0x2c0 mem_cgroup_out_of_memory+0x46/0x80 mem_cgroup_oom_synchronize+0x2e4/0x310 ? high_work_func+0x20/0x20 pagefault_out_of_memory+0x31/0x76 mm_fault_error+0x55/0x115 ? handle_mm_fault+0xfd/0x220 __do_page_fault+0x433/0x4e0 do_page_fault+0x22/0x30 ? page_fault+0x8/0x30 page_fault+0x1e/0x30 RIP: 0033:0x4009f0 Code: 03 00 00 00 e8 71 fd ff ff 48 83 f8 ff 49 89 c6 74 74 48 89 c6 bf c0 0c 40 00 31 c0 e8 69 fd ff ff 45 85 ff 7e 21 31 c9 66 90 <41> 0f be 14 0e 01 d3 f7 c1 ff 0f 00 00 75 05 41 c6 04 0e 2a 48 83 RSP: 002b:00007ffe29ae96f0 EFLAGS: 00010206 RAX: 000000000000001b RBX: 0000000000000000 RCX: 0000000001ce1000 RDX: 0000000000000000 RSI: 000000007fffffe5 RDI: 0000000000000000 RBP: 000000000000000c R08: 0000000000000000 R09: 00007f94be09220d R10: 0000000000000002 R11: 0000000000000246 R12: 00000000000186a0 R13: 0000000000000003 R14: 00007f949d845000 R15: 0000000002800000 Task in /leaker killed as a result of limit of /leaker memory: usage 524288kB, limit 524288kB, failcnt 158965 memory+swap: usage 0kB, limit 9007199254740988kB, failcnt 0 kmem: usage 2016kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /leaker: cache:844KB rss:521136KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:132KB writeback:0KB inactive_anon:0KB active_anon:521224KB inactive_file:1012KB active_file:8KB unevictable:0KB Memory cgroup out of memory: Kill process 2746 (leaker) score 998 or sacrifice child Killed process 2746 (leaker) total-vm:536704kB, anon-rss:521176kB, file-rss:1208kB, shmem-rss:0kB oom_reaper: reaped process 2746 (leaker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [2] leaker invoked oom-killer: gfp_mask=0x600040(GFP_NOFS), nodemask=(null), order=0, oom_score_adj=0 CPU: 1 PID: 2746 Comm: leaker Not tainted 4.18.0+ #20 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 Call Trace: dump_stack+0x63/0x88 dump_header+0x67/0x27a ? mem_cgroup_scan_tasks+0x91/0xf0 oom_kill_process+0x210/0x410 out_of_memory+0x109/0x2d0 mem_cgroup_out_of_memory+0x46/0x80 try_charge+0x58d/0x650 ? __radix_tree_replace+0x81/0x100 mem_cgroup_try_charge+0x7a/0x100 __add_to_page_cache_locked+0x92/0x180 add_to_page_cache_lru+0x4d/0xf0 iomap_readpages_actor+0xde/0x1b0 ? iomap_zero_range_actor+0x1d0/0x1d0 iomap_apply+0xaf/0x130 iomap_readpages+0x9f/0x150 ? iomap_zero_range_actor+0x1d0/0x1d0 xfs_vm_readpages+0x18/0x20 [xfs] read_pages+0x60/0x140 __do_page_cache_readahead+0x193/0x1b0 ondemand_readahead+0x16d/0x2c0 page_cache_async_readahead+0x9a/0xd0 filemap_fault+0x403/0x620 ? alloc_set_pte+0x12c/0x540 ? _cond_resched+0x14/0x30 __xfs_filemap_fault+0x66/0x180 [xfs] xfs_filemap_fault+0x27/0x30 [xfs] __do_fault+0x19/0x40 __handle_mm_fault+0x8e8/0xb60 handle_mm_fault+0xfd/0x220 __do_page_fault+0x238/0x4e0 do_page_fault+0x22/0x30 ? page_fault+0x8/0x30 page_fault+0x1e/0x30 RIP: 0033:0x4009f0 Code: 03 00 00 00 e8 71 fd ff ff 48 83 f8 ff 49 89 c6 74 74 48 89 c6 bf c0 0c 40 00 31 c0 e8 69 fd ff ff 45 85 ff 7e 21 31 c9 66 90 <41> 0f be 14 0e 01 d3 f7 c1 ff 0f 00 00 75 05 41 c6 04 0e 2a 48 83 RSP: 002b:00007ffda45c9290 EFLAGS: 00010206 RAX: 000000000000001b RBX: 0000000000000000 RCX: 0000000001a1e000 RDX: 0000000000000000 RSI: 000000007fffffe5 RDI: 0000000000000000 RBP: 000000000000000c R08: 0000000000000000 R09: 00007f6d061ff20d R10: 0000000000000002 R11: 0000000000000246 R12: 00000000000186a0 R13: 0000000000000003 R14: 00007f6ce59b2000 R15: 0000000002800000 Task in /leaker killed as a result of limit of /leaker memory: usage 524288kB, limit 524288kB, failcnt 7221 memory+swap: usage 0kB, limit 9007199254740988kB, failcnt 0 kmem: usage 1944kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /leaker: cache:3632KB rss:518232KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:0KB writeback:0KB inactive_anon:0KB active_anon:518408KB inactive_file:3908KB active_file:12KB unevictable:0KB Memory cgroup out of memory: Kill process 2746 (leaker) score 992 or sacrifice child Killed process 2746 (leaker) total-vm:536704kB, anon-rss:518264kB, file-rss:1188kB, shmem-rss:0kB oom_reaper: reaped process 2746 (leaker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [3] leaker invoked oom-killer: gfp_mask=0x50, order=0, oom_score_adj=0 leaker cpuset=/ mems_allowed=0 CPU: 1 PID: 3206 Comm: leaker Not tainted 3.10.0-957.27.2.el7.x86_64 #1 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 Call Trace: [] dump_stack+0x19/0x1b [] dump_header+0x90/0x229 [] ? find_lock_task_mm+0x56/0xc0 [] ? try_get_mem_cgroup_from_mm+0x28/0x60 [] oom_kill_process+0x254/0x3d0 [] mem_cgroup_oom_synchronize+0x546/0x570 [] ? mem_cgroup_charge_common+0xc0/0xc0 [] pagefault_out_of_memory+0x14/0x90 [] mm_fault_error+0x6a/0x157 [] __do_page_fault+0x3c8/0x4f0 [] do_page_fault+0x35/0x90 [] page_fault+0x28/0x30 Task in /leaker killed as a result of limit of /leaker memory: usage 524288kB, limit 524288kB, failcnt 20628 memory+swap: usage 524288kB, limit 9007199254740988kB, failcnt 0 kmem: usage 0kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /leaker: cache:840KB rss:523448KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:523448KB inactive_file:464KB active_file:376KB unevictable:0KB Memory cgroup out of memory: Kill process 3206 (leaker) score 970 or sacrifice child Killed process 3206 (leaker) total-vm:536692kB, anon-rss:523304kB, file-rss:412kB, shmem-rss:0kB Bisected by Masoud Sharbiani. Link: http://lkml.kernel.org/r/cbe54ed1-b6ba-a056-8899-2dc42526371d@i-love.sakura.ne.jp Fixes: 3da88fb3bacfaa33 ("mm, oom: move GFP_NOFS check to out_of_memory") [necessary after 29ef680ae7c21110] Signed-off-by: Tetsuo Handa Reported-by: Masoud Sharbiani Tested-by: Masoud Sharbiani Acked-by: Michal Hocko Cc: David Rientjes Cc: [4.19+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 95872bdfec4e..a6b76624f5b8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1069,9 +1069,10 @@ bool out_of_memory(struct oom_control *oc) * The OOM killer does not compensate for IO-less reclaim. * pagefault_out_of_memory lost its gfp context so we have to * make sure exclude 0 mask - all other users should have at least - * ___GFP_DIRECT_RECLAIM to get here. + * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to + * invoke the OOM killer even if it is a GFP_NOFS allocation. */ - if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) return true; /* From 70cb6d2677905121bfc7fdf5babfd8444218edd9 Mon Sep 17 00:00:00 2001 From: Edward Chron Date: Mon, 23 Sep 2019 15:37:11 -0700 Subject: [PATCH 1044/2880] mm/oom: add oom_score_adj and pgtables to Killed process message For an OOM event: print oom_score_adj value for the OOM Killed process to document what the oom score adjust value was at the time the process was OOM Killed. The adjustment value can be set by user code and it affects the resulting oom_score so it is used to influence kill process selection. When eligible tasks are not printed (sysctl oom_dump_tasks = 0) printing this value is the only documentation of the value for the process being killed. Having this value on the Killed process message is useful to document if a miscconfiguration occurred or to confirm that the oom_score_adj configuration applies as expected. An example which illustates both misconfiguration and validation that the oom_score_adj was applied as expected is: Aug 14 23:00:02 testserver kernel: Out of memory: Killed process 2692 (systemd-udevd) total-vm:1056800kB, anon-rss:1052760kB, file-rss:4kB, shmem-rss:0kB pgtables:22kB oom_score_adj:1000 The systemd-udevd is a critical system application that should have an oom_score_adj of -1000. It was miconfigured to have a adjustment of 1000 making it a highly favored OOM kill target process. The output documents both the misconfiguration and the fact that the process was correctly targeted by OOM due to the miconfiguration. This can be quite helpful for triage and problem determination. The addition of the pgtables_bytes shows page table usage by the process and is a useful measure of the memory size of the process. Link: http://lkml.kernel.org/r/20190822173157.1569-1-echron@arista.com Signed-off-by: Edward Chron Acked-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a6b76624f5b8..76000feb0794 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -884,13 +884,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u\n", - message, task_pid_nr(victim), victim->comm, - K(victim->mm->total_vm), - K(get_mm_counter(victim->mm, MM_ANONPAGES)), - K(get_mm_counter(victim->mm, MM_FILEPAGES)), - K(get_mm_counter(victim->mm, MM_SHMEMPAGES)), - from_kuid(&init_user_ns, task_uid(victim))); + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", + message, task_pid_nr(victim), victim->comm, K(mm->total_vm), + K(get_mm_counter(mm, MM_ANONPAGES)), + K(get_mm_counter(mm, MM_FILEPAGES)), + K(get_mm_counter(mm, MM_SHMEMPAGES)), + from_kuid(&init_user_ns, task_uid(victim)), + mm_pgtables_bytes(mm), victim->signal->oom_score_adj); task_unlock(victim); /* From f364f06b34b55285df7b132b4e3752d820412ad4 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 23 Sep 2019 15:37:14 -0700 Subject: [PATCH 1045/2880] mm/oom_kill.c: fix oom_cpuset_eligible() comment Commit ac311a14c682 ("oom: decouple mems_allowed from oom_unkillable_task") changed has_intersects_mems_allowed() to oom_cpuset_eligible(), but didn't change the comment. Link: http://lkml.kernel.org/r/1566959929-10638-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Yi Wang Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 76000feb0794..1530fccdbe4f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -73,7 +73,7 @@ static inline bool is_memcg_oom(struct oom_control *oc) /** * oom_cpuset_eligible() - check task eligiblity for kill * @start: task struct of which task to consider - * @mask: nodemask passed to page allocator for mempolicy ooms + * @oc: pointer to struct oom_control * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy From 1eb41bb07e568629805acb8455c44b6709819551 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 Sep 2019 15:37:16 -0700 Subject: [PATCH 1046/2880] mm, oom: consider present pages for the node size constrained_alloc() calculates the size of the oom domain by using node_spanned_pages which is incorrect because this is the full range of the physical memory range that the numa node occupies rather than the memory that backs that range which is represented by node_present_pages. Sparsely populated nodes (e.g. after memory hot remove or simply sparse due to memory layout) can have really a large difference between the two. This shouldn't really cause any real user observable problems because the oom calculates a ratio against totalpages and used memory cannot exceed present pages but it is confusing and wrong from code point of view. Link: http://lkml.kernel.org/r/20190829163443.899-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1530fccdbe4f..c1d9496b4c43 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -287,7 +287,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { oc->totalpages = total_swap_pages; for_each_node_mask(nid, *oc->nodemask) - oc->totalpages += node_spanned_pages(nid); + oc->totalpages += node_present_pages(nid); return CONSTRAINT_MEMORY_POLICY; } @@ -300,7 +300,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) if (cpuset_limited) { oc->totalpages = total_swap_pages; for_each_node_mask(nid, cpuset_current_mems_allowed) - oc->totalpages += node_spanned_pages(nid); + oc->totalpages += node_present_pages(nid); return CONSTRAINT_CPUSET; } return CONSTRAINT_NONE; From 4d0e3230a56a75bf26d74b3f4b6cb05a0e9fae60 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:37:19 -0700 Subject: [PATCH 1047/2880] mm/memcontrol.c: fix a -Wunused-function warning mem_cgroup_id_get() was introduced in commit 73f576c04b94 ("mm:memcontrol: fix cgroup creation failure after many small jobs"). Later, it no longer has any user since the commits, 1f47b61fb407 ("mm: memcontrol: fix swap counter leak on swapout from offline cgroup") 58fa2a5512d9 ("mm: memcontrol: add sanity checks for memcg->id.ref on get/put") so safe to remove it. Link: http://lkml.kernel.org/r/1568648453-5482-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 80131025f2da..701c1ab79d24 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4930,11 +4930,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) } } -static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) -{ - mem_cgroup_id_get_many(memcg, 1); -} - static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) { mem_cgroup_id_put_many(memcg, 1); From 0158115f702b0ba208ab0b5adf44cae99b3ebcc7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 Sep 2019 15:37:22 -0700 Subject: [PATCH 1048/2880] memcg, kmem: deprecate kmem.limit_in_bytes Cgroup v1 memcg controller has exposed a dedicated kmem limit to users which turned out to be really a bad idea because there are paths which cannot shrink the kernel memory usage enough to get below the limit (e.g. because the accounted memory is not reclaimable). There are cases when the failure is even not allowed (e.g. __GFP_NOFAIL). This means that the kmem limit is in excess to the hard limit without any way to shrink and thus completely useless. OOM killer cannot be invoked to handle the situation because that would lead to a premature oom killing. As a result many places might see ENOMEM returning from kmalloc and result in unexpected errors. E.g. a global OOM killer when there is a lot of free memory because ENOMEM is translated into VM_FAULT_OOM in #PF path and therefore pagefault_out_of_memory would result in OOM killer. Please note that the kernel memory is still accounted to the overall limit along with the user memory so removing the kmem specific limit should still allow to contain kernel memory consumption. Unlike the kmem one, though, it invokes memory reclaim and targeted memcg oom killing if necessary. Start the deprecation process by crying to the kernel log. Let's see whether there are relevant usecases and simply return to EINVAL in the second stage if nobody complains in few releases. [akpm@linux-foundation.org: tweak documentation text] Link: http://lkml.kernel.org/r/20190911151612.GI4023@dhcp22.suse.cz Signed-off-by: Michal Hocko Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Andrey Ryabinin Cc: Thomas Lindroth Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v1/memory.rst | 4 +++- mm/memcontrol.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 41bdc038dad9..0ae4f564c2d6 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -85,8 +85,10 @@ Brief summary of control files. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node - memory.kmem.limit_in_bytes set/show hard limit for kernel memory + This knob is deprecated and shouldn't be + used. It is planned that this be removed in + the foreseeable future. memory.kmem.usage_in_bytes show current kernel memory allocation memory.kmem.failcnt show the number of kernel memory usage hits limits diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 701c1ab79d24..7bb2971dc75e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3637,6 +3637,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: + pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: From 1c6c15971e4709953f75082a5d44212536b1c2b7 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Mon, 23 Sep 2019 15:37:26 -0700 Subject: [PATCH 1049/2880] mm, reclaim: make should_continue_reclaim perform dryrun detection Patch series "address hugetlb page allocation stalls", v2. Allocation of hugetlb pages via sysctl or procfs can stall for minutes or hours. A simple example on a two node system with 8GB of memory is as follows: echo 4096 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages echo 4096 > /proc/sys/vm/nr_hugepages Obviously, both allocation attempts will fall short of their 8GB goal. However, one or both of these commands may stall and not be interruptible. The issues were initially discussed in mail thread [1] and RFC code at [2]. This series addresses the issues causing the stalls. There are two distinct fixes, a cleanup, and an optimization. The reclaim patch by Hillf and compaction patch by Vlasitmil address corner cases in their respective areas. hugetlb page allocation could stall due to either of these issues. Vlasitmil added a cleanup patch after Hillf's modifications. The hugetlb patch by Mike is an optimization suggested during the debug and development process. [1] http://lkml.kernel.org/r/d38a095e-dc39-7e82-bb76-2c9247929f07@oracle.com [2] http://lkml.kernel.org/r/20190724175014.9935-1-mike.kravetz@oracle.com This patch (of 4): Address the issue of should_continue_reclaim returning true too often for __GFP_RETRY_MAYFAIL attempts when !nr_reclaimed and nr_scanned. This was observed during hugetlb page allocation causing stalls for minutes or hours. We can stop reclaiming pages if compaction reports it can make a progress. There might be side-effects for other high-order allocations that would potentially benefit from reclaiming more before compaction so that they would be faster and less likely to stall. However, the consequences of premature/over-reclaim are considered worse. We can also bail out of reclaiming pages if we know that there are not enough inactive lru pages left to satisfy the costly allocation. We can give up reclaiming pages too if we see dryrun occur, with the certainty of plenty of inactive pages. IOW with dryrun detected, we are sure we have reclaimed as many pages as we could. Link: http://lkml.kernel.org/r/20190806014744.15446-2-mike.kravetz@oracle.com Signed-off-by: Hillf Danton Signed-off-by: Mike Kravetz Tested-by: Mike Kravetz Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2ee4d9283738..83b5d5280e99 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2620,18 +2620,6 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return false; } - /* - * If we have not reclaimed enough pages for compaction and the - * inactive lists are large enough, continue reclaiming - */ - pages_for_compaction = compact_gap(sc->order); - inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_nr_swap_pages() > 0) - inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); - if (sc->nr_reclaimed < pages_for_compaction && - inactive_lru_pages > pages_for_compaction) - return true; - /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { struct zone *zone = &pgdat->node_zones[z]; @@ -2647,7 +2635,21 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, ; } } - return true; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = compact_gap(sc->order); + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); + if (get_nr_swap_pages() > 0) + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); + + return inactive_lru_pages > pages_for_compaction && + /* + * avoid dryrun with plenty of inactive pages + */ + nr_scanned && nr_reclaimed; } static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) From 5ee04716c46ce58989b1256a98af1af89f385db8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:37:29 -0700 Subject: [PATCH 1050/2880] mm, reclaim: cleanup should_continue_reclaim() After commit "mm, reclaim: make should_continue_reclaim perform dryrun detection", closer look at the function shows, that nr_reclaimed == 0 means the function will always return false. And since non-zero nr_reclaimed implies non_zero nr_scanned, testing nr_scanned serves no purpose, and so does the testing for __GFP_RETRY_MAYFAIL. This patch thus cleans up the function to test only !nr_reclaimed upfront, and remove the __GFP_RETRY_MAYFAIL test and nr_scanned parameter completely. Comment is also updated, explaining that approximating "full LRU list has been scanned" with nr_scanned == 0 didn't really work. Link: http://lkml.kernel.org/r/20190806014744.15446-3-mike.kravetz@oracle.com Signed-off-by: Vlastimil Babka Signed-off-by: Mike Kravetz Acked-by: Mike Kravetz Cc: Hillf Danton Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 83b5d5280e99..c27dd62ed594 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2586,7 +2586,6 @@ static bool in_reclaim_compaction(struct scan_control *sc) */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, - unsigned long nr_scanned, struct scan_control *sc) { unsigned long pages_for_compaction; @@ -2597,28 +2596,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!in_reclaim_compaction(sc)) return false; - /* Consider stopping depending on scan and reclaim activity */ - if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { - /* - * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the - * full LRU list has been scanned and we are still failing - * to reclaim pages. This full LRU scan is potentially - * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed - */ - if (!nr_reclaimed && !nr_scanned) - return false; - } else { - /* - * For non-__GFP_RETRY_MAYFAIL allocations which can presumably - * fail without consequence, stop if we failed to reclaim - * any pages from the last SWAP_CLUSTER_MAX number of - * pages that were scanned. This will return to the - * caller faster at the risk reclaim/compaction and - * the resulting allocation attempt fails - */ - if (!nr_reclaimed) - return false; - } + /* + * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX + * number of pages that were scanned. This will return to the caller + * with the risk reclaim/compaction and the resulting allocation attempt + * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL + * allocations through requiring that the full LRU list has been scanned + * first, by assuming that zero delta of sc->nr_scanned means full LRU + * scan, but that approximation was wrong, and there were corner cases + * where always a non-zero amount of pages were scanned. + */ + if (!nr_reclaimed) + return false; /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { @@ -2645,11 +2634,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (get_nr_swap_pages() > 0) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); - return inactive_lru_pages > pages_for_compaction && - /* - * avoid dryrun with plenty of inactive pages - */ - nr_scanned && nr_reclaimed; + return inactive_lru_pages > pages_for_compaction; } static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) @@ -2794,7 +2779,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) wait_iff_congested(BLK_RW_ASYNC, HZ/10); } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)); + sc)); /* * Kswapd gives up on balancing particular nodes after too From 494330855641269c8a49f1580f0d4e2ead693245 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:37:32 -0700 Subject: [PATCH 1051/2880] mm, compaction: raise compaction priority after it withdrawns Mike Kravetz reports that "hugetlb allocations could stall for minutes or hours when should_compact_retry() would return true more often then it should. Specifically, this was in the case where compact_result was COMPACT_DEFERRED and COMPACT_PARTIAL_SKIPPED and no progress was being made." The problem is that the compaction_withdrawn() test in should_compact_retry() includes compaction outcomes that are only possible on low compaction priority, and results in a retry without increasing the priority. This may result in furter reclaim, and more incomplete compaction attempts. With this patch, compaction priority is raised when possible, or should_compact_retry() returns false. The COMPACT_SKIPPED result doesn't really fit together with the other outcomes in compaction_withdrawn(), as that's a result caused by insufficient order-0 pages, not due to low compaction priority. With this patch, it is moved to a new compaction_needs_reclaim() function, and for that outcome we keep the current logic of retrying if it looks like reclaim will be able to help. Link: http://lkml.kernel.org/r/20190806014744.15446-4-mike.kravetz@oracle.com Reported-by: Mike Kravetz Signed-off-by: Vlastimil Babka Signed-off-by: Mike Kravetz Tested-by: Mike Kravetz Cc: Hillf Danton Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 22 +++++++++++++++++----- mm/page_alloc.c | 18 +++++++++++++----- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9569e7c786d3..4b898cdbdf05 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -129,11 +129,8 @@ static inline bool compaction_failed(enum compact_result result) return false; } -/* - * Compaction has backed off for some reason. It might be throttling or - * lock contention. Retrying is still worthwhile. - */ -static inline bool compaction_withdrawn(enum compact_result result) +/* Compaction needs reclaim to be performed first, so it can continue. */ +static inline bool compaction_needs_reclaim(enum compact_result result) { /* * Compaction backed off due to watermark checks for order-0 @@ -142,6 +139,16 @@ static inline bool compaction_withdrawn(enum compact_result result) if (result == COMPACT_SKIPPED) return true; + return false; +} + +/* + * Compaction has backed off for some reason after doing some work or none + * at all. It might be throttling or lock contention. Retrying might be still + * worthwhile, but with a higher priority if allowed. + */ +static inline bool compaction_withdrawn(enum compact_result result) +{ /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case @@ -207,6 +214,11 @@ static inline bool compaction_failed(enum compact_result result) return false; } +static inline bool compaction_needs_reclaim(enum compact_result result) +{ + return false; +} + static inline bool compaction_withdrawn(enum compact_result result) { return true; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df566c0f6729..591e026534c2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3955,16 +3955,24 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, goto check_priority; /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But do not retry if the given zonelist is not suitable for - * compaction. + * compaction was skipped because there are not enough order-0 pages + * to work with, so we retry only if it looks like reclaim can help. */ - if (compaction_withdrawn(compact_result)) { + if (compaction_needs_reclaim(compact_result)) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; } + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But the next retry should use a higher priority if allowed, so + * we don't just keep bailing out endlessly. + */ + if (compaction_withdrawn(compact_result)) { + goto check_priority; + } + /* * !costly requests are much more important than __GFP_RETRY_MAYFAIL * costly ones because they are de facto nofail and invoke OOM From f60858f9d327c4dd0c432abe9ec943a83929c229 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 23 Sep 2019 15:37:35 -0700 Subject: [PATCH 1052/2880] hugetlbfs: don't retry when pool page allocations start to fail When allocating hugetlbfs pool pages via /proc/sys/vm/nr_hugepages, the pages will be interleaved between all nodes of the system. If nodes are not equal, it is quite possible for one node to fill up before the others. When this happens, the code still attempts to allocate pages from the full node. This results in calls to direct reclaim and compaction which slow things down considerably. When allocating pool pages, note the state of the previous allocation for each node. If previous allocation failed, do not use the aggressive retry algorithm on successive attempts. The allocation will still succeed if there is memory available, but it will not try as hard to free up memory. Link: http://lkml.kernel.org/r/20190806014744.15446-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6d7296dd11b8..ef37c85423a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1405,12 +1405,25 @@ pgoff_t __basepage_index(struct page *page) } static struct page *alloc_buddy_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); struct page *page; + bool alloc_try_hard = true; - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + /* + * By default we always try hard to allocate the page with + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in + * a loop (to adjust global huge page counts) and previous allocation + * failed, do not continue to try hard on the same node. Use the + * node_alloc_noretry bitmap to manage this state information. + */ + if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) + alloc_try_hard = false; + gfp_mask |= __GFP_COMP|__GFP_NOWARN; + if (alloc_try_hard) + gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); @@ -1419,6 +1432,22 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, else __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + /* + * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this + * indicates an overall state change. Clear bit so that we resume + * normal 'try hard' allocations. + */ + if (node_alloc_noretry && page && !alloc_try_hard) + node_clear(nid, *node_alloc_noretry); + + /* + * If we tried hard to get a page but failed, set bit so that + * subsequent attempts will not try as hard until there is an + * overall state change. + */ + if (node_alloc_noretry && !page && alloc_try_hard) + node_set(nid, *node_alloc_noretry); + return page; } @@ -1427,7 +1456,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, * should use this function to get new hugetlb pages */ static struct page *alloc_fresh_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct page *page; @@ -1435,7 +1465,7 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, page = alloc_gigantic_page(h, gfp_mask, nid, nmask); else page = alloc_buddy_huge_page(h, gfp_mask, - nid, nmask); + nid, nmask, node_alloc_noretry); if (!page) return NULL; @@ -1450,14 +1480,16 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, * Allocates a fresh page to the hugetlb allocator pool in the node interleaved * manner. */ -static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + nodemask_t *node_alloc_noretry) { struct page *page; int nr_nodes, node; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, + node_alloc_noretry); if (page) break; } @@ -1601,7 +1633,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock(&hugetlb_lock); - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -1637,7 +1669,7 @@ struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -2207,13 +2239,33 @@ static void __init gather_bootmem_prealloc(void) static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; + nodemask_t *node_alloc_noretry; + + if (!hstate_is_gigantic(h)) { + /* + * Bit mask controlling how hard we retry per-node allocations. + * Ignore errors as lower level routines can deal with + * node_alloc_noretry == NULL. If this kmalloc fails at boot + * time, we are likely in bigger trouble. + */ + node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), + GFP_KERNEL); + } else { + /* allocations done at boot time */ + node_alloc_noretry = NULL; + } + + /* bit mask controlling how hard we retry per-node allocations */ + if (node_alloc_noretry) + nodes_clear(*node_alloc_noretry); for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_pool_huge_page(h, - &node_states[N_MEMORY])) + &node_states[N_MEMORY], + node_alloc_noretry)) break; cond_resched(); } @@ -2225,6 +2277,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) h->max_huge_pages, buf, i); h->max_huge_pages = i; } + + kfree(node_alloc_noretry); } static void __init hugetlb_init_hstates(void) @@ -2323,6 +2377,17 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; + NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); + + /* + * Bit mask controlling how hard we retry per-node allocations. + * If we can not allocate the bit mask, do not attempt to allocate + * the requested huge pages. + */ + if (node_alloc_noretry) + nodes_clear(*node_alloc_noretry); + else + return -ENOMEM; spin_lock(&hugetlb_lock); @@ -2356,6 +2421,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock(&hugetlb_lock); + NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } /* Fall through to decrease pool */ @@ -2388,7 +2454,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* yield cpu to avoid soft lockup */ cond_resched(); - ret = alloc_pool_huge_page(h, nodes_allowed); + ret = alloc_pool_huge_page(h, nodes_allowed, + node_alloc_noretry); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -2429,6 +2496,8 @@ out: h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); + NODEMASK_FREE(node_alloc_noretry); + return 0; } From 276f756d7002ceb66f8ae3f7bd8028e7ae3deed8 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Mon, 23 Sep 2019 15:37:38 -0700 Subject: [PATCH 1053/2880] mm/migrate.c: clean up useless code in migrate_vma_collect_pmd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove unused 'pfn' variable. Link: http://lkml.kernel.org/r/1565167272-21453-1-git-send-email-kernelfans@gmail.com Signed-off-by: Pingfan Liu Reviewed-by: Andrew Morton Reviewed-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Mel Gorman Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mike Kravetz Cc: Andrea Arcangeli Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 374ef2fcb722..73d476d690b1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2218,17 +2218,15 @@ again: pte_t pte; pte = *ptep; - pfn = pte_pfn(pte); if (pte_none(pte)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; - pfn = 0; goto next; } if (!pte_present(pte)) { - mpfn = pfn = 0; + mpfn = 0; /* * Only care about unaddressable device page special @@ -2245,10 +2243,10 @@ again: if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { + pfn = pte_pfn(pte); if (is_zero_pfn(pfn)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; - pfn = 0; goto next; } page = vm_normal_page(migrate->vma, addr, pte); @@ -2258,10 +2256,9 @@ again: /* FIXME support THP */ if (!page || !page->mapping || PageTransCompound(page)) { - mpfn = pfn = 0; + mpfn = 0; goto next; } - pfn = page_to_pfn(page); /* * By getting a reference on the page we pin it and that blocks From 9ef258bad7af2f6abee2bdfaaa6c41890d4f4db6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2019 15:37:41 -0700 Subject: [PATCH 1054/2880] thp: update split_huge_page_pmd() comment According to 78ddc5347341 ("thp: rename split_huge_page_pmd() to split_huge_pmd()"), update related comment. Link: http://lkml.kernel.org/r/20190731033406.185285-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: "Kirill A . Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/hash_utils.c | 2 +- include/asm-generic/pgtable.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 3410ea9f4de1..6c123760164e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1748,7 +1748,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr, /* * IF we try to do a HUGE PTE update after a withdraw is done. * we will find the below NULL. This happens when we do - * split_huge_page_pmd + * split_huge_pmd */ if (!hpte_slot_array) return; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index fae6abb3d586..818691846c90 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1002,9 +1002,8 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) * need this). If THP is not enabled, the pmd can't go away under the * code even if MADV_DONTNEED runs, but if THP is enabled we need to * run a pmd_trans_unstable before walking the ptes after - * split_huge_page_pmd returns (because it may have run when the pmd - * become null, but then a page fault can map in a THP and not a - * regular page). + * split_huge_pmd returns (because it may have run when the pmd become + * null, but then a page fault can map in a THP and not a regular page). */ static inline int pmd_trans_unstable(pmd_t *pmd) { From 585e5a7babd91fd85a5cc97b7324c6c2fc29e1ec Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:44 -0700 Subject: [PATCH 1055/2880] filemap: check compound_head(page)->mapping in filemap_fault() Patch series "Enable THP for text section of non-shmem files", v10; This patchset follows up discussion at LSF/MM 2019. The motivation is to put text section of an application in THP, and thus reduces iTLB miss rate and improves performance. Both Facebook and Oracle showed strong interests to this feature. To make reviews easier, this set aims a mininal valid product. Current version of the work does not have any changes to file system specific code. This comes with some limitations (discussed later). This set enables an application to "hugify" its text section by simply running something like: madvise(0x600000, 0x80000, MADV_HUGEPAGE); Before this call, the /proc//maps looks like: 00400000-074d0000 r-xp 00000000 00:27 2006927 app After this call, part of the text section is split out and mapped to THP: 00400000-00425000 r-xp 00000000 00:27 2006927 app 00600000-00e00000 r-xp 00200000 00:27 2006927 app <<< on THP 00e00000-074d0000 r-xp 00a00000 00:27 2006927 app Limitations: 1. This only works for text section (vma with VM_DENYWRITE). 2. Original limitation #2 is removed in v3. We gated this feature with an experimental config, READ_ONLY_THP_FOR_FS. Once we get better support on the write path, we can remove the config and enable it by default. Tested cases: 1. Tested with btrfs and ext4. 2. Tested with real work application (memcache like caching service). 3. Tested with "THP aware uprobe": https://patchwork.kernel.org/project/linux-mm/list/?series=131339 This patch (of 7): Currently, filemap_fault() avoids race condition with truncate by checking page->mapping == mapping. This does not work for compound pages. This patch let it check compound_head(page)->mapping instead. Link: http://lkml.kernel.org/r/20190801184244.3169074-2-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: William Kucharski Cc: Hillf Danton Cc: Hugh Dickins Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 533f271d6839..fc7818b4537e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2540,7 +2540,7 @@ retry_find: goto out_retry; /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { + if (unlikely(compound_head(page)->mapping != mapping)) { unlock_page(page); put_page(page); goto retry_find; From 31895438e702f48e25b7aa6d88f9c97c795c79c7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:47 -0700 Subject: [PATCH 1056/2880] filemap: check compound_head(page)->mapping in pagecache_get_page() Similar to previous patch, pagecache_get_page() avoids race condition with truncate by checking page->mapping == mapping. This does not work for compound pages. This patch let it check compound_head(page)->mapping instead. Link: http://lkml.kernel.org/r/20190801184244.3169074-3-songliubraving@fb.com Signed-off-by: Song Liu Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Rik van Riel Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index fc7818b4537e..0b301103ea80 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1647,7 +1647,7 @@ repeat: } /* Has the page been truncated? */ - if (unlikely(page->mapping != mapping)) { + if (unlikely(compound_head(page)->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; From 520e5ba415906373186bcd3c7cffa3535bfdbdde Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:50 -0700 Subject: [PATCH 1057/2880] filemap: update offset check in filemap_fault() With THP, current check of offset: VM_BUG_ON_PAGE(page->index != offset, page); is no longer accurate. Update it to: VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); Link: http://lkml.kernel.org/r/20190801184244.3169074-4-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 0b301103ea80..f4d2971abd7c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2545,7 +2545,7 @@ retry_find: put_page(page); goto retry_find; } - VM_BUG_ON_PAGE(page->index != offset, page); + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); /* * We have a locked page in the page cache, now we need to check From 60fbf0ab5da1c360e02b7f7d882bf1c0d8f7e32a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:54 -0700 Subject: [PATCH 1058/2880] mm,thp: stats for file backed THP In preparation for non-shmem THP, this patch adds a few stats and exposes them in /proc/meminfo, /sys/bus/node/devices//meminfo, and /proc//task//smaps. This patch is mostly a rewrite of Kirill A. Shutemov's earlier version: https://lkml.kernel.org/r/20170126115819.58875-5-kirill.shutemov@linux.intel.com/ Link: http://lkml.kernel.org/r/20190801184244.3169074-5-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 6 ++++++ fs/proc/meminfo.c | 4 ++++ fs/proc/task_mmu.c | 4 +++- include/linux/mmzone.h | 2 ++ mm/vmstat.c | 2 ++ 5 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 257449cf061f..296546ffed6c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -427,6 +427,8 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d AnonHugePages: %8lu kB\n" "Node %d ShmemHugePages: %8lu kB\n" "Node %d ShmemPmdMapped: %8lu kB\n" + "Node %d FileHugePages: %8lu kB\n" + "Node %d FilePmdMapped: %8lu kB\n" #endif , nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), @@ -452,6 +454,10 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * + HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_FILE_THPS) * + HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * HPAGE_PMD_NR) #endif ); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 4bd80e68eb03..ac9247371871 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -132,6 +132,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); + show_val_kb(m, "FileHugePages: ", + global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); + show_val_kb(m, "FilePmdMapped: ", + global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); #endif #ifdef CONFIG_CMA diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ea1630465474..9442631fd4af 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -417,6 +417,7 @@ struct mem_size_stats { unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; + unsigned long file_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; @@ -588,7 +589,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, else if (is_zone_device_page(page)) /* pass */; else - VM_BUG_ON_PAGE(1, page); + mss->file_thp += HPAGE_PMD_SIZE; smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); } #else @@ -809,6 +810,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", mss->private_hugetlb >> 10, 7); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3f38c30d2f13..aafb7c38c627 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -235,6 +235,8 @@ enum node_stat_item { NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, + NR_FILE_THPS, + NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_VMSCAN_WRITE, diff --git a/mm/vmstat.c b/mm/vmstat.c index fd7e16ca6996..6afc892a148a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1158,6 +1158,8 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_shmem_hugepages", "nr_shmem_pmdmapped", + "nr_file_hugepages", + "nr_file_pmdmapped", "nr_anon_transparent_hugepages", "nr_unstable", "nr_vmscan_write", From 579c571e2efdb8e5b50959ae66b6142e05bd704f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:57 -0700 Subject: [PATCH 1059/2880] khugepaged: rename collapse_shmem() and khugepaged_scan_shmem() Next patch will add khugepaged support of non-shmem files. This patch renames these two functions to reflect the new functionality: collapse_shmem() => collapse_file() khugepaged_scan_shmem() => khugepaged_scan_file() Link: http://lkml.kernel.org/r/20190801184244.3169074-6-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 04a54ff5a8ac..7e36cc893f25 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1287,7 +1287,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) } /** - * collapse_shmem - collapse small tmpfs/shmem pages into huge one. + * collapse_file - collapse small tmpfs/shmem pages into huge one. * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; @@ -1304,10 +1304,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_shmem(struct mm_struct *mm, - struct address_space *mapping, pgoff_t start, +static void collapse_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage, int node) { + struct address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; struct mem_cgroup *memcg; @@ -1563,11 +1564,11 @@ out: /* TODO: tracepoints */ } -static void khugepaged_scan_shmem(struct mm_struct *mm, - struct address_space *mapping, - pgoff_t start, struct page **hpage) +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage) { struct page *page = NULL; + struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; @@ -1631,16 +1632,15 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, result = SCAN_EXCEED_NONE_PTE; } else { node = khugepaged_find_target_node(); - collapse_shmem(mm, mapping, start, hpage, node); + collapse_file(mm, file, start, hpage, node); } } /* TODO: tracepoints */ } #else -static void khugepaged_scan_shmem(struct mm_struct *mm, - struct address_space *mapping, - pgoff_t start, struct page **hpage) +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage) { BUILD_BUG(); } @@ -1722,8 +1722,7 @@ skip: file = get_file(vma->vm_file); up_read(&mm->mmap_sem); ret = 1; - khugepaged_scan_shmem(mm, file->f_mapping, - pgoff, hpage); + khugepaged_scan_file(mm, file, pgoff, hpage); fput(file); } else { ret = khugepaged_scan_pmd(mm, vma, From 99cb0dbd47a15d395bf3faa78dc122bc5efe3fc0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:00 -0700 Subject: [PATCH 1060/2880] mm,thp: add read-only THP support for (non-shmem) FS This patch is (hopefully) the first step to enable THP for non-shmem filesystems. This patch enables an application to put part of its text sections to THP via madvise, for example: madvise((void *)0x600000, 0x200000, MADV_HUGEPAGE); We tried to reuse the logic for THP on tmpfs. Currently, write is not supported for non-shmem THP. khugepaged will only process vma with VM_DENYWRITE. sys_mmap() ignores VM_DENYWRITE requests (see ksys_mmap_pgoff). The only way to create vma with VM_DENYWRITE is execve(). This requirement limits non-shmem THP to text sections. The next patch will handle writes, which would only happen when the all the vmas with VM_DENYWRITE are unmapped. An EXPERIMENTAL config, READ_ONLY_THP_FOR_FS, is added to gate this feature. [songliubraving@fb.com: fix build without CONFIG_SHMEM] Link: http://lkml.kernel.org/r/F53407FB-96CC-42E8-9862-105C92CC2B98@fb.com [songliubraving@fb.com: fix double unlock in collapse_file()] Link: http://lkml.kernel.org/r/B960CBFA-8EFC-4DA4-ABC5-1977FFF2CA57@fb.com Link: http://lkml.kernel.org/r/20190801184244.3169074-7-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Stephen Rothwell Cc: Dan Carpenter Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 11 ++++ mm/filemap.c | 4 +- mm/khugepaged.c | 149 ++++++++++++++++++++++++++++++++++-------------- mm/rmap.c | 12 ++-- 4 files changed, 128 insertions(+), 48 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index f88be1cbcfb2..a5dae9a7eb51 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -712,6 +712,17 @@ config GUP_BENCHMARK config GUP_GET_PTE_LOW_HIGH bool +config READ_ONLY_THP_FOR_FS + bool "Read-only THP for filesystems (EXPERIMENTAL)" + depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM + + help + Allow khugepaged to put read-only file-backed pages in THP. + + This is marked experimental because it is a new feature. Write + support of file THPs will be developed in the next few release + cycles. + config ARCH_HAS_PTE_SPECIAL bool diff --git a/mm/filemap.c b/mm/filemap.c index f4d2971abd7c..91fe3a08ca4a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -203,8 +203,8 @@ static void unaccount_page_cache_page(struct address_space *mapping, __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); - } else { - VM_BUG_ON_PAGE(PageTransHuge(page), page); + } else if (PageTransHuge(page)) { + __dec_node_page_state(page, NR_FILE_THPS); } /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7e36cc893f25..8607c77431b3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -48,6 +48,7 @@ enum scan_result { SCAN_CGROUP_CHARGE_FAIL, SCAN_EXCEED_SWAP_PTE, SCAN_TRUNCATED, + SCAN_PAGE_HAS_PRIVATE, }; #define CREATE_TRACE_POINTS @@ -404,7 +405,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, (vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; - if (shmem_file(vma->vm_file)) { + + if (shmem_file(vma->vm_file) || + (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + vma->vm_file && + (vm_flags & VM_DENYWRITE))) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) return false; return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, @@ -456,8 +461,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long hstart, hend; /* - * khugepaged does not yet work on non-shmem files or special - * mappings. And file-private shmem THP is not supported. + * khugepaged only supports read-only files for non-shmem files. + * khugepaged does not yet work on special mappings. And + * file-private shmem THP is not supported. */ if (!hugepage_vma_check(vma, vm_flags)) return 0; @@ -1287,12 +1293,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) } /** - * collapse_file - collapse small tmpfs/shmem pages into huge one. + * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache replacing old pages with the new one - * + swap in pages if necessary; + * + swap/gup in pages if necessary; * + fill in gaps; * + keep old pages around in case rollback is required; * - if replacing succeeds: @@ -1316,7 +1322,9 @@ static void collapse_file(struct mm_struct *mm, LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; + bool is_shmem = shmem_file(file); + VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); /* Only allocate from the target node */ @@ -1348,7 +1356,8 @@ static void collapse_file(struct mm_struct *mm, } while (1); __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); + if (is_shmem) + __SetPageSwapBacked(new_page); new_page->index = start; new_page->mapping = mapping; @@ -1363,41 +1372,75 @@ static void collapse_file(struct mm_struct *mm, struct page *page = xas_next(&xas); VM_BUG_ON(index != xas.xa_index); - if (!page) { - /* - * Stop if extent has been truncated or hole-punched, - * and is now completely empty. - */ - if (index == start) { - if (!xas_next_entry(&xas, end - 1)) { - result = SCAN_TRUNCATED; + if (is_shmem) { + if (!page) { + /* + * Stop if extent has been truncated or + * hole-punched, and is now completely + * empty. + */ + if (index == start) { + if (!xas_next_entry(&xas, end - 1)) { + result = SCAN_TRUNCATED; + goto xa_locked; + } + xas_set(&xas, index); + } + if (!shmem_charge(mapping->host, 1)) { + result = SCAN_FAIL; goto xa_locked; } - xas_set(&xas, index); + xas_store(&xas, new_page); + nr_none++; + continue; } - if (!shmem_charge(mapping->host, 1)) { - result = SCAN_FAIL; + + if (xa_is_value(page) || !PageUptodate(page)) { + xas_unlock_irq(&xas); + /* swap in or instantiate fallocated page */ + if (shmem_getpage(mapping->host, index, &page, + SGP_NOHUGE)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + } else if (trylock_page(page)) { + get_page(page); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; goto xa_locked; } - xas_store(&xas, new_page); - nr_none++; - continue; - } - - if (xa_is_value(page) || !PageUptodate(page)) { - xas_unlock_irq(&xas); - /* swap in or instantiate fallocated page */ - if (shmem_getpage(mapping->host, index, &page, - SGP_NOHUGE)) { + } else { /* !is_shmem */ + if (!page || xa_is_value(page)) { + xas_unlock_irq(&xas); + page_cache_sync_readahead(mapping, &file->f_ra, + file, index, + PAGE_SIZE); + /* drain pagevecs to help isolate_lru_page() */ + lru_add_drain(); + page = find_lock_page(mapping, index); + if (unlikely(page == NULL)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + } else if (!PageUptodate(page)) { + xas_unlock_irq(&xas); + wait_on_page_locked(page); + if (!trylock_page(page)) { + result = SCAN_PAGE_LOCK; + goto xa_unlocked; + } + get_page(page); + } else if (PageDirty(page)) { result = SCAN_FAIL; - goto xa_unlocked; + goto xa_locked; + } else if (trylock_page(page)) { + get_page(page); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; + goto xa_locked; } - } else if (trylock_page(page)) { - get_page(page); - xas_unlock_irq(&xas); - } else { - result = SCAN_PAGE_LOCK; - goto xa_locked; } /* @@ -1426,6 +1469,12 @@ static void collapse_file(struct mm_struct *mm, goto out_unlock; } + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) { + result = SCAN_PAGE_HAS_PRIVATE; + goto out_unlock; + } + if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); @@ -1463,12 +1512,18 @@ out_unlock: goto xa_unlocked; } - __inc_node_page_state(new_page, NR_SHMEM_THPS); + if (is_shmem) + __inc_node_page_state(new_page, NR_SHMEM_THPS); + else + __inc_node_page_state(new_page, NR_FILE_THPS); + if (nr_none) { struct zone *zone = page_zone(new_page); __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); - __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); + if (is_shmem) + __mod_node_page_state(zone->zone_pgdat, + NR_SHMEM, nr_none); } xa_locked: @@ -1506,10 +1561,15 @@ xa_unlocked: SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); + + if (is_shmem) { + set_page_dirty(new_page); + lru_cache_add_anon(new_page); + } else { + lru_cache_add_file(new_page); + } count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); - lru_cache_add_anon(new_page); /* * Remove pte page tables, so we can re-fault the page as huge. @@ -1524,7 +1584,9 @@ xa_unlocked: /* Something went wrong: roll back page cache changes */ xas_lock_irq(&xas); mapping->nrpages -= nr_none; - shmem_uncharge(mapping->host, nr_none); + + if (is_shmem) + shmem_uncharge(mapping->host, nr_none); xas_set(&xas, start); xas_for_each(&xas, page, end - 1) { @@ -1607,7 +1669,8 @@ static void khugepaged_scan_file(struct mm_struct *mm, break; } - if (page_count(page) != 1 + page_mapcount(page)) { + if (page_count(page) != + 1 + page_mapcount(page) + page_has_private(page)) { result = SCAN_PAGE_COUNT; break; } @@ -1713,11 +1776,13 @@ skip: VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (shmem_file(vma->vm_file)) { + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { struct file *file; pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); - if (!shmem_huge_enabled(vma)) + + if (shmem_file(vma->vm_file) + && !shmem_huge_enabled(vma)) goto skip; file = get_file(vma->vm_file); up_read(&mm->mmap_sem); diff --git a/mm/rmap.c b/mm/rmap.c index 26006445c8b5..d9a23bb773bf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1189,8 +1189,10 @@ void page_add_file_rmap(struct page *page, bool compound) } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + if (PageSwapBacked(page)) + __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + else + __inc_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (PageTransCompound(page) && page_mapping(page)) { VM_WARN_ON_ONCE(!PageLocked(page)); @@ -1229,8 +1231,10 @@ static void page_remove_file_rmap(struct page *page, bool compound) } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) goto out; - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + if (PageSwapBacked(page)) + __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + else + __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount)) goto out; From 09d91cda0e8207c1f14ee0d572f61a53dbcdaf85 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:03 -0700 Subject: [PATCH 1061/2880] mm,thp: avoid writes to file with THP in pagecache In previous patch, an application could put part of its text section in THP via madvise(). These THPs will be protected from writes when the application is still running (TXTBSY). However, after the application exits, the file is available for writes. This patch avoids writes to file THP by dropping page cache for the file when the file is open for write. A new counter nr_thps is added to struct address_space. In do_dentry_open(), if the file is open for write and nr_thps is non-zero, we drop page cache for the whole file. Link: http://lkml.kernel.org/r/20190801184244.3169074-8-songliubraving@fb.com Signed-off-by: Song Liu Reported-by: kbuild test robot Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 3 +++ fs/open.c | 8 ++++++++ include/linux/fs.h | 32 ++++++++++++++++++++++++++++++++ mm/filemap.c | 1 + mm/khugepaged.c | 4 +++- 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 64bf28cf05cd..fef457a42882 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -181,6 +181,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->flags = 0; mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_set(&mapping->nr_thps, 0); +#endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; diff --git a/fs/open.c b/fs/open.c index a59abe3c669a..c60cd22cc052 100644 --- a/fs/open.c +++ b/fs/open.c @@ -818,6 +818,14 @@ static int do_dentry_open(struct file *f, if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; } + + /* + * XXX: Huge page cache doesn't support writing yet. Drop all page + * cache for this file before processing writes. + */ + if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping)) + truncate_pagecache(inode, 0); + return 0; cleanup_all: diff --git a/include/linux/fs.h b/include/linux/fs.h index 866268c2c6e3..b0c6b0d34d02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -429,6 +429,7 @@ int pagecache_write_end(struct file *, struct address_space *mapping, * @i_pages: Cached pages. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED mappings. + * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. @@ -446,6 +447,10 @@ struct address_space { struct xarray i_pages; gfp_t gfp_mask; atomic_t i_mmap_writable; +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + /* number of thp, only for non-shmem files */ + atomic_t nr_thps; +#endif struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; @@ -2798,6 +2803,33 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +static inline int filemap_nr_thps(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + return atomic_read(&mapping->nr_thps); +#else + return 0; +#endif +} + +static inline void filemap_nr_thps_inc(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_inc(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + +static inline void filemap_nr_thps_dec(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_dec(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); diff --git a/mm/filemap.c b/mm/filemap.c index 91fe3a08ca4a..1146fcfa3215 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -205,6 +205,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { __dec_node_page_state(page, NR_FILE_THPS); + filemap_nr_thps_dec(mapping); } /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8607c77431b3..e89430ec5267 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1514,8 +1514,10 @@ out_unlock: if (is_shmem) __inc_node_page_state(new_page, NR_SHMEM_THPS); - else + else { __inc_node_page_state(new_page, NR_FILE_THPS); + filemap_nr_thps_inc(mapping); + } if (nr_none) { struct zone *zone = page_zone(new_page); From 364c1eebe453f06f0c1e837eb155a5725c9cd272 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:06 -0700 Subject: [PATCH 1062/2880] mm: thp: extract split_queue_* into a struct Patch series "Make deferred split shrinker memcg aware", v6. Currently THP deferred split shrinker is not memcg aware, this may cause premature OOM with some configuration. For example the below test would run into premature OOM easily: $ cgcreate -g memory:thp $ echo 4G > /sys/fs/cgroup/memory/thp/memory/limit_in_bytes $ cgexec -g memory:thp transhuge-stress 4000 transhuge-stress comes from kernel selftest. It is easy to hit OOM, but there are still a lot THP on the deferred split queue, memcg direct reclaim can't touch them since the deferred split shrinker is not memcg aware. Convert deferred split shrinker memcg aware by introducing per memcg deferred split queue. The THP should be on either per node or per memcg deferred split queue if it belongs to a memcg. When the page is immigrated to the other memcg, it will be immigrated to the target memcg's deferred split queue too. Reuse the second tail page's deferred_list for per memcg list since the same THP can't be on multiple deferred split queues. Make deferred split shrinker not depend on memcg kmem since it is not slab. It doesn't make sense to not shrink THP even though memcg kmem is disabled. With the above change the test demonstrated above doesn't trigger OOM even though with cgroup.memory=nokmem. This patch (of 4): Put split_queue, split_queue_lock and split_queue_len into a struct in order to reduce code duplication when we convert deferred_split to memcg aware in the later patches. Link: http://lkml.kernel.org/r/1565144277-36240-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: "Kirill A . Shutemov" Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++++--- mm/huge_memory.c | 45 +++++++++++++++++++++++------------------- mm/page_alloc.c | 8 +++++--- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aafb7c38c627..bda20282746b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -679,6 +679,14 @@ struct zonelist { extern struct page *mem_map; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct deferred_split { + spinlock_t split_queue_lock; + struct list_head split_queue; + unsigned long split_queue_len; +}; +#endif + /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which @@ -758,9 +766,7 @@ typedef struct pglist_data { #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE - spinlock_t split_queue_lock; - struct list_head split_queue; - unsigned long split_queue_len; + struct deferred_split deferred_split_queue; #endif /* Fields commonly accessed by the page reclaim scanner */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 483b07b2d6ae..c642c0394690 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2691,6 +2691,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; @@ -2777,17 +2778,17 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&pgdata->split_queue_lock); + spin_lock(&ds_queue->split_queue_lock); count = page_count(head); mapcount = total_mapcount(head); if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { - pgdata->split_queue_len--; + ds_queue->split_queue_len--; list_del(page_deferred_list(head)); } if (mapping) __dec_node_page_state(page, NR_SHMEM_THPS); - spin_unlock(&pgdata->split_queue_lock); + spin_unlock(&ds_queue->split_queue_lock); __split_huge_page(page, list, end, flags); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2804,7 +2805,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) dump_page(page, "total_mapcount(head) > 0"); BUG(); } - spin_unlock(&pgdata->split_queue_lock); + spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(&pgdata->lru_lock, flags); @@ -2827,52 +2828,56 @@ out: void free_transhuge_page(struct page *page) { struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(page_deferred_list(page))) { - pgdata->split_queue_len--; + ds_queue->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); - list_add_tail(page_deferred_list(page), &pgdata->split_queue); - pgdata->split_queue_len++; + list_add_tail(page_deferred_list(page), &ds_queue->split_queue); + ds_queue->split_queue_len++; } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); - return READ_ONCE(pgdata->split_queue_len); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + return READ_ONCE(ds_queue->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; LIST_HEAD(list), *pos, *next; struct page *page; int split = 0; - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &pgdata->split_queue) { + list_for_each_safe(pos, next, &ds_queue->split_queue) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); if (get_page_unless_zero(page)) { @@ -2880,12 +2885,12 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } else { /* We lost race with put_compound_page() */ list_del_init(page_deferred_list(page)); - pgdata->split_queue_len--; + ds_queue->split_queue_len--; } if (!--sc->nr_to_scan) break; } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); @@ -2899,15 +2904,15 @@ next: put_page(page); } - spin_lock_irqsave(&pgdata->split_queue_lock, flags); - list_splice_tail(&list, &pgdata->split_queue); - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + list_splice_tail(&list, &ds_queue->split_queue); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. */ - if (!split && list_empty(&pgdata->split_queue)) + if (!split && list_empty(&ds_queue->split_queue)) return SHRINK_STOP; return split; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 591e026534c2..a41436cca563 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6646,9 +6646,11 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages, #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { - spin_lock_init(&pgdat->split_queue_lock); - INIT_LIST_HEAD(&pgdat->split_queue); - pgdat->split_queue_len = 0; + struct deferred_split *ds_queue = &pgdat->deferred_split_queue; + + spin_lock_init(&ds_queue->split_queue_lock); + INIT_LIST_HEAD(&ds_queue->split_queue); + ds_queue->split_queue_len = 0; } #else static void pgdat_init_split_queue(struct pglist_data *pgdat) {} From 7ae88534cdd96235cd775c03b32a75009355740b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:09 -0700 Subject: [PATCH 1063/2880] mm: move mem_cgroup_uncharge out of __page_cache_release() A later patch makes THP deferred split shrinker memcg aware, but it needs page->mem_cgroup information in THP destructor, which is called after mem_cgroup_uncharge() now. So move mem_cgroup_uncharge() from __page_cache_release() to compound page destructor, which is called by both THP and other compound pages except HugeTLB. And call it in __put_single_page() for single order page. Link: http://lkml.kernel.org/r/1565144277-36240-3-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: "Kirill A . Shutemov" Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + mm/swap.c | 2 +- mm/vmscan.c | 6 ++---- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a41436cca563..3334a769eb91 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -670,6 +670,7 @@ out: void free_compound_page(struct page *page) { + mem_cgroup_uncharge(page); __free_pages_ok(page, compound_order(page)); } diff --git a/mm/swap.c b/mm/swap.c index 0226c5346560..784dc1620620 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -71,12 +71,12 @@ static void __page_cache_release(struct page *page) spin_unlock_irqrestore(&pgdat->lru_lock, flags); } __ClearPageWaiters(page); - mem_cgroup_uncharge(page); } static void __put_single_page(struct page *page) { __page_cache_release(page); + mem_cgroup_uncharge(page); free_unref_page(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index c27dd62ed594..c4ef8681637b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1487,10 +1487,9 @@ free_it: * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ - if (unlikely(PageTransHuge(page))) { - mem_cgroup_uncharge(page); + if (unlikely(PageTransHuge(page))) (*get_compound_page_dtor(page))(page); - } else + else list_add(&page->lru, &free_pages); continue; @@ -1911,7 +1910,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&pgdat->lru_lock); } else From 0a432dcbeb32edcd211a5d8f7847d0da7642a8b4 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:12 -0700 Subject: [PATCH 1064/2880] mm: shrinker: make shrinker not depend on memcg kmem Currently shrinker is just allocated and can work when memcg kmem is enabled. But, THP deferred split shrinker is not slab shrinker, it doesn't make too much sense to have such shrinker depend on memcg kmem. It should be able to reclaim THP even though memcg kmem is disabled. Introduce a new shrinker flag, SHRINKER_NONSLAB, for non-slab shrinker. When memcg kmem is disabled, just such shrinkers can be called in shrinking memcg slab. [yang.shi@linux.alibaba.com: add comment] Link: http://lkml.kernel.org/r/1566496227-84952-4-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1565144277-36240-4-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++----- include/linux/shrinker.h | 7 ++++- mm/memcontrol.c | 9 +----- mm/vmscan.c | 60 ++++++++++++++++++++------------------ 4 files changed, 49 insertions(+), 46 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad8f1a397ae4..a3c0a639c824 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -128,9 +128,8 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; -#ifdef CONFIG_MEMCG_KMEM struct memcg_shrinker_map __rcu *shrinker_map; -#endif + struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ @@ -1311,6 +1310,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } while ((memcg = parent_mem_cgroup(memcg))); return false; } + +extern int memcg_expand_shrinker_maps(int new_id); + +extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1319,6 +1323,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } + +static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) +{ +} #endif struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); @@ -1390,10 +1399,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); #else static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) @@ -1435,8 +1440,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) { } #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 9443cafd1969..0f80123650e2 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -69,7 +69,7 @@ struct shrinker { /* These are for internal use */ struct list_head list; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; #endif @@ -81,6 +81,11 @@ struct shrinker { /* Flags */ #define SHRINKER_NUMA_AWARE (1 << 0) #define SHRINKER_MEMCG_AWARE (1 << 1) +/* + * It just makes sense when the shrinker is also MEMCG_AWARE for now, + * non-MEMCG_AWARE shrinker should not have this flag set. + */ +#define SHRINKER_NONSLAB (1 << 2) extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7bb2971dc75e..a385a7cb7d9f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -318,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; +#endif static int memcg_shrinker_map_size; static DEFINE_MUTEX(memcg_shrinker_map_mutex); @@ -441,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) } } -#else /* CONFIG_MEMCG_KMEM */ -static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) -{ - return 0; -} -static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } -#endif /* CONFIG_MEMCG_KMEM */ - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest diff --git a/mm/vmscan.c b/mm/vmscan.c index c4ef8681637b..4911754c93b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -171,11 +171,22 @@ int vm_swappiness = 60; */ unsigned long vm_total_pages; +static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) +{ + /* Check for an overwrite */ + WARN_ON_ONCE(rs && task->reclaim_state); + + /* Check for the nulling of an already-nulled member */ + WARN_ON_ONCE(!rs && !task->reclaim_state); + + task->reclaim_state = rs; +} + static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_MEMCG_KMEM - +#ifdef CONFIG_MEMCG /* * We allow subsystems to populate their shrinker-related * LRU lists before register_shrinker_prepared() is called @@ -227,30 +238,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); up_write(&shrinker_rwsem); } -#else /* CONFIG_MEMCG_KMEM */ -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - return 0; -} -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ -} -#endif /* CONFIG_MEMCG_KMEM */ - -static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) -{ - /* Check for an overwrite */ - WARN_ON_ONCE(rs && task->reclaim_state); - - /* Check for the nulling of an already-nulled member */ - WARN_ON_ONCE(!rs && !task->reclaim_state); - - task->reclaim_state = rs; -} - -#ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; @@ -305,6 +293,15 @@ static bool memcg_congested(pg_data_t *pgdat, } #else +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return 0; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} + static bool global_reclaim(struct scan_control *sc) { return true; @@ -591,7 +588,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { @@ -599,7 +596,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, unsigned long ret, freed = 0; int i; - if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) + if (!mem_cgroup_online(memcg)) return 0; if (!down_read_trylock(&shrinker_rwsem)) @@ -625,6 +622,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, continue; } + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_enabled() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { clear_bit(i, map->map); @@ -661,13 +663,13 @@ unlock: up_read(&shrinker_rwsem); return freed; } -#else /* CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG */ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { return 0; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ /** * shrink_slab - shrink slab caches From 87eaceb3faa59b9b4d940ec9554ce251325d83fe Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:15 -0700 Subject: [PATCH 1065/2880] mm: thp: make deferred split shrinker memcg aware Currently THP deferred split shrinker is not memcg aware, this may cause premature OOM with some configuration. For example the below test would run into premature OOM easily: $ cgcreate -g memory:thp $ echo 4G > /sys/fs/cgroup/memory/thp/memory/limit_in_bytes $ cgexec -g memory:thp transhuge-stress 4000 transhuge-stress comes from kernel selftest. It is easy to hit OOM, but there are still a lot THP on the deferred split queue, memcg direct reclaim can't touch them since the deferred split shrinker is not memcg aware. Convert deferred split shrinker memcg aware by introducing per memcg deferred split queue. The THP should be on either per node or per memcg deferred split queue if it belongs to a memcg. When the page is immigrated to the other memcg, it will be immigrated to the target memcg's deferred split queue too. Reuse the second tail page's deferred_list for per memcg list since the same THP can't be on multiple deferred split queues. [yang.shi@linux.alibaba.com: simplify deferred split queue dereference per Kirill Tkhai] Link: http://lkml.kernel.org/r/1566496227-84952-5-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1565144277-36240-5-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 9 ++++++ include/linux/memcontrol.h | 4 +++ include/linux/mm_types.h | 1 + mm/huge_memory.c | 62 ++++++++++++++++++++++++++++++++------ mm/memcontrol.c | 24 +++++++++++++++ 5 files changed, 91 insertions(+), 9 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 45ede62aa85b..61c9ffd89b05 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -267,6 +267,15 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } +static inline struct list_head *page_deferred_list(struct page *page) +{ + /* + * Global or memcg deferred list in the second tail pages is + * occupied by compound_head. + */ + return &page[2].deferred_list; +} + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3c0a639c824..9b60863429cc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -330,6 +330,10 @@ struct mem_cgroup { struct list_head event_list; spinlock_t event_list_lock; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct deferred_split deferred_split_queue; +#endif + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0b739f360cec..5183e0d77dfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -138,6 +138,7 @@ struct page { struct { /* Second tail page of compound page */ unsigned long _compound_pad_1; /* compound_head */ unsigned long _compound_pad_2; + /* For both global and memcg */ struct list_head deferred_list; }; struct { /* Page table pages */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c642c0394690..73fc517c08d2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -496,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static inline struct list_head *page_deferred_list(struct page *page) +#ifdef CONFIG_MEMCG +static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - /* ->lru in the tail pages is occupied by compound_head. */ - return &page[2].deferred_list; + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + if (memcg) + return &memcg->deferred_split_queue; + else + return &pgdat->deferred_split_queue; } +#else +static inline struct deferred_split *get_deferred_split_queue(struct page *page) +{ + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + return &pgdat->deferred_split_queue; +} +#endif void prep_transhuge_page(struct page *page) { @@ -2691,7 +2705,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue = get_deferred_split_queue(page); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; @@ -2827,8 +2841,7 @@ out: void free_transhuge_page(struct page *page) { - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue = get_deferred_split_queue(page); unsigned long flags; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); @@ -2842,17 +2855,37 @@ void free_transhuge_page(struct page *page) void deferred_split_huge_page(struct page *page) { - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue = get_deferred_split_queue(page); +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; +#endif unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); + /* + * The try_to_unmap() in page reclaim path might reach here too, + * this may cause a race condition to corrupt deferred split queue. + * And, if page reclaim is already handling the same page, it is + * unnecessary to handle it again in shrinker. + * + * Check PageSwapCache to determine if the page is being + * handled by page reclaim since THP swap would add the page into + * swap cache before calling try_to_unmap(). + */ + if (PageSwapCache(page)) + return; + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &ds_queue->split_queue); ds_queue->split_queue_len++; +#ifdef CONFIG_MEMCG + if (memcg) + memcg_set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); +#endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } @@ -2862,6 +2895,11 @@ static unsigned long deferred_split_count(struct shrinker *shrink, { struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif return READ_ONCE(ds_queue->split_queue_len); } @@ -2875,6 +2913,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct page *page; int split = 0; +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_safe(pos, next, &ds_queue->split_queue) { @@ -2921,7 +2964,8 @@ static struct shrinker deferred_split_shrinker = { .count_objects = deferred_split_count, .scan_objects = deferred_split_scan, .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | + SHRINKER_NONSLAB, }; #ifdef CONFIG_DEBUG_FS diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a385a7cb7d9f..2156ef775d04 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5070,6 +5070,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) memcg->cgwb_frn[i].done = __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); + memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; @@ -5449,6 +5454,14 @@ static int mem_cgroup_move_account(struct page *page, __mod_memcg_state(to, NR_WRITEBACK, nr_pages); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (compound && !list_empty(page_deferred_list(page))) { + spin_lock(&from->deferred_split_queue.split_queue_lock); + list_del_init(page_deferred_list(page)); + from->deferred_split_queue.split_queue_len--; + spin_unlock(&from->deferred_split_queue.split_queue_lock); + } +#endif /* * It is safe to change page->mem_cgroup here because the page * is referenced, charged, and isolated - we can't race with @@ -5457,6 +5470,17 @@ static int mem_cgroup_move_account(struct page *page, /* caller should have done css_get */ page->mem_cgroup = to; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (compound && list_empty(page_deferred_list(page))) { + spin_lock(&to->deferred_split_queue.split_queue_lock); + list_add_tail(page_deferred_list(page), + &to->deferred_split_queue.split_queue); + to->deferred_split_queue.split_queue_len++; + spin_unlock(&to->deferred_split_queue.split_queue_lock); + } +#endif + spin_unlock_irqrestore(&from->move_lock, flags); ret = 0; From 010c164a5fa7e169deab0a4d8211611f1930c1cd Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:19 -0700 Subject: [PATCH 1066/2880] mm: move memcmp_pages() and pages_identical() Patch series "THP aware uprobe", v13. This patchset makes uprobe aware of THPs. Currently, when uprobe is attached to text on THP, the page is split by FOLL_SPLIT. As a result, uprobe eliminates the performance benefit of THP. This set makes uprobe THP-aware. Instead of FOLL_SPLIT, we introduces FOLL_SPLIT_PMD, which only split PMD for uprobe. After all uprobes within the THP are removed, the PTE-mapped pages are regrouped as huge PMD. This set (plus a few THP patches) is also available at https://github.com/liu-song-6/linux/tree/uprobe-thp This patch (of 6): Move memcmp_pages() to mm/util.c and pages_identical() to mm.h, so that we can use them in other files. Link: http://lkml.kernel.org/r/20190815164525.1848545-2-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Kirill A. Shutemov Reviewed-by: Oleg Nesterov Cc: Johannes Weiner Cc: Matthew Wilcox Cc: William Kucharski Cc: Srikar Dronamraju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ mm/ksm.c | 18 ------------------ mm/util.c | 13 +++++++++++++ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 57a9fa34f159..0ac87d9ee9d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2868,5 +2868,12 @@ void __init setup_nr_node_ids(void); static inline void setup_nr_node_ids(void) {} #endif +extern int memcmp_pages(struct page *page1, struct page *page2); + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/ksm.c b/mm/ksm.c index 3dc4346411e4..dbee2eb4dd05 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1029,24 +1029,6 @@ static u32 calc_checksum(struct page *page) return checksum; } -static int memcmp_pages(struct page *page1, struct page *page2) -{ - char *addr1, *addr2; - int ret; - - addr1 = kmap_atomic(page1); - addr2 = kmap_atomic(page2); - ret = memcmp(addr1, addr2, PAGE_SIZE); - kunmap_atomic(addr2); - kunmap_atomic(addr1); - return ret; -} - -static inline int pages_identical(struct page *page1, struct page *page2) -{ - return !memcmp_pages(page1, page2); -} - static int write_protect_page(struct vm_area_struct *vma, struct page *page, pte_t *orig_pte) { diff --git a/mm/util.c b/mm/util.c index bab284d69c8c..37f7b6711514 100644 --- a/mm/util.c +++ b/mm/util.c @@ -783,3 +783,16 @@ out_mm: out: return res; } + +int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1); + addr2 = kmap_atomic(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2); + kunmap_atomic(addr1); + return ret; +} From fb4fb04ff4dd377b3132e9b31259263ec37b830a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:22 -0700 Subject: [PATCH 1067/2880] uprobe: use original page when all uprobes are removed Currently, uprobe swaps the target page with a anonymous page in both install_breakpoint() and remove_breakpoint(). When all uprobes on a page are removed, the given mm is still using an anonymous page (not the original page). This patch allows uprobe to use original page when possible (all uprobes on the page are already removed, and the original page is in page cache and uptodate). As suggested by Oleg, we unmap the old_page and let the original page fault in. Link: http://lkml.kernel.org/r/20190815164525.1848545-3-songliubraving@fb.com Signed-off-by: Song Liu Suggested-by: Oleg Nesterov Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 66 +++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 84fa00497c49..648f47553bff 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -143,10 +143,12 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * * @vma: vma that holds the pte pointing to page * @addr: address the old @page is mapped at - * @page: the cowed page we are replacing by kpage - * @kpage: the modified page we replace page by + * @old_page: the page we are replacing by new_page + * @new_page: the modified page we replace page by * - * Returns 0 on success, -EFAULT on failure. + * If @new_page is NULL, only unmap @old_page. + * + * Returns 0 on success, negative error code otherwise. */ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) @@ -166,10 +168,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, - false); - if (err) - return err; + if (new_page) { + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, + &memcg, false); + if (err) + return err; + } /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(old_page); @@ -177,15 +181,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { - mem_cgroup_cancel_charge(new_page, memcg, false); + if (new_page) + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); - lru_cache_add_active_or_unevictable(new_page, vma); + if (new_page) { + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); + } else + /* no new page, just dec_mm_counter for old_page */ + dec_mm_counter(mm, MM_ANONPAGES); if (!PageAnon(old_page)) { dec_mm_counter(mm, mm_counter_file(old_page)); @@ -194,8 +203,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); ptep_clear_flush_notify(vma, addr, pvmw.pte); - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + if (new_page) + set_pte_at_notify(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) @@ -488,6 +498,10 @@ retry: ref_ctr_updated = 1; } + ret = 0; + if (!is_register && !PageAnon(old_page)) + goto put_old; + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -501,8 +515,30 @@ retry: copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (!is_register) { + struct page *orig_page; + pgoff_t index; + + VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); + + index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; + orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, + index); + + if (orig_page) { + if (PageUptodate(orig_page) && + pages_identical(new_page, orig_page)) { + /* let go new_page */ + put_page(new_page); + new_page = NULL; + } + put_page(orig_page); + } + } + ret = __replace_page(vma, vaddr, old_page, new_page); - put_page(new_page); + if (new_page) + put_page(new_page); put_old: put_page(old_page); From bfe7b00de6d1e25fee08484c4fbf1c1ed175be78 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:25 -0700 Subject: [PATCH 1068/2880] mm, thp: introduce FOLL_SPLIT_PMD Introduce a new foll_flag: FOLL_SPLIT_PMD. As the name says FOLL_SPLIT_PMD splits huge pmd for given mm_struct, the underlining huge page stays as-is. FOLL_SPLIT_PMD is useful for cases where we need to use regular pages, but would switch back to huge page and huge pmd on. One of such example is uprobe. The following patches use FOLL_SPLIT_PMD in uprobe. Link: http://lkml.kernel.org/r/20190815164525.1848545-4-songliubraving@fb.com Signed-off-by: Song Liu Reviewed-by: Oleg Nesterov Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/gup.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ac87d9ee9d8..233ad11938db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2591,6 +2591,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ +#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ /* * NOTE on FOLL_LONGTERM: diff --git a/mm/gup.c b/mm/gup.c index 012060efddf1..60c3915c8ee6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -384,7 +384,7 @@ retry_locked: spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - if (flags & FOLL_SPLIT) { + if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) { int ret; page = pmd_page(*pmd); if (is_huge_zero_page(page)) { @@ -393,7 +393,7 @@ retry_locked: split_huge_pmd(vma, pmd, address); if (pmd_trans_unstable(pmd)) ret = -EBUSY; - } else { + } else if (flags & FOLL_SPLIT) { if (unlikely(!try_get_page(page))) { spin_unlock(ptl); return ERR_PTR(-ENOMEM); @@ -405,6 +405,10 @@ retry_locked: put_page(page); if (pmd_none(*pmd)) return no_page_table(vma, flags); + } else { /* flags & FOLL_SPLIT_PMD */ + spin_unlock(ptl); + split_huge_pmd(vma, pmd, address); + ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; } return ret ? ERR_PTR(ret) : From 5a52c9df62b422087d0c88fc79641028e4472a3b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:27 -0700 Subject: [PATCH 1069/2880] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT Use the newly added FOLL_SPLIT_PMD in uprobe. This preserves the huge page when the uprobe is enabled. When the uprobe is disabled, newer instances of the same application could still benefit from huge page. For the next step, we will enable khugepaged to regroup the pmd, so that existing instances of the application could also benefit from huge page after the uprobe is disabled. Link: http://lkml.kernel.org/r/20190815164525.1848545-5-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Kirill A. Shutemov Reviewed-by: Srikar Dronamraju Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 648f47553bff..27b596f14463 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,7 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, { struct mm_struct *mm = vma->vm_mm; struct page_vma_mapped_walk pvmw = { - .page = old_page, + .page = compound_head(old_page), .vma = vma, .address = addr, }; @@ -166,8 +166,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); - VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - if (new_page) { err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); @@ -481,7 +479,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); + FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); if (ret <= 0) return ret; From 27e1f8273113adec0e98bf513e4091636b27cc2a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:30 -0700 Subject: [PATCH 1070/2880] khugepaged: enable collapse pmd for pte-mapped THP khugepaged needs exclusive mmap_sem to access page table. When it fails to lock mmap_sem, the page will fault in as pte-mapped THP. As the page is already a THP, khugepaged will not handle this pmd again. This patch enables the khugepaged to retry collapse the page table. struct mm_slot (in khugepaged.c) is extended with an array, containing addresses of pte-mapped THPs. We use array here for simplicity. We can easily replace it with more advanced data structures when needed. In khugepaged_scan_mm_slot(), if the mm contains pte-mapped THP, we try to collapse the page table. Since collapse may happen at an later time, some pages may already fault in. collapse_pte_mapped_thp() is added to properly handle these pages. collapse_pte_mapped_thp() also double checks whether all ptes in this pmd are mapping to the same THP. This is necessary because some subpage of the THP may be replaced, for example by uprobe. In such cases, it is not possible to collapse the pmd. [kirill.shutemov@linux.intel.com: add comments for retract_page_tables()] Link: http://lkml.kernel.org/r/20190816145443.6ard3iilytc6jlgv@box Link: http://lkml.kernel.org/r/20190815164525.1848545-6-songliubraving@fb.com Signed-off-by: Song Liu Signed-off-by: Kirill A. Shutemov Acked-by: Kirill A. Shutemov Suggested-by: Johannes Weiner Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/khugepaged.h | 12 +++ mm/khugepaged.c | 192 ++++++++++++++++++++++++++++++++++++- 2 files changed, 200 insertions(+), 4 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 082d1d2a5216..bc45ea1efbf7 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,14 @@ extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags); +#ifdef CONFIG_SHMEM +extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); +#else +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ +} +#endif #define khugepaged_enabled() \ (transparent_hugepage_flags & \ @@ -73,6 +81,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, { return 0; } +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e89430ec5267..0a1b4b484ac5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -77,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __read_mostly; +#define MAX_PTE_MAPPED_THP 8 + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -87,6 +89,10 @@ struct mm_slot { struct hlist_node hash; struct list_head mm_node; struct mm_struct *mm; + + /* pte-mapped THP in this mm */ + int nr_pte_mapped_thp; + unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP]; }; /** @@ -1254,6 +1260,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot) } #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +/* + * Notify khugepaged that given addr of the mm is pte-mapped THP. Then + * khugepaged should try to collapse the page table. + */ +static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ + struct mm_slot *mm_slot; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) + mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; + spin_unlock(&khugepaged_mm_lock); + return 0; +} + +/** + * Try to collapse a pte-mapped THP for mm at address haddr. + * + * This function checks whether all the PTEs in the PMD are pointing to the + * right THP. If so, retract the page table so the THP can refault in with + * as pmd-mapped. + */ +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +{ + unsigned long haddr = addr & HPAGE_PMD_MASK; + struct vm_area_struct *vma = find_vma(mm, haddr); + struct page *hpage = NULL; + pte_t *start_pte, *pte; + pmd_t *pmd, _pmd; + spinlock_t *ptl; + int count = 0; + int i; + + if (!vma || !vma->vm_file || + vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) + return; + + /* + * This vm_flags may not have VM_HUGEPAGE if the page was not + * collapsed by this mm. But we can still collapse if the page is + * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() + * will not fail the vma for missing VM_HUGEPAGE + */ + if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) + return; + + pmd = mm_find_pmd(mm, haddr); + if (!pmd) + return; + + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + + /* step 1: check all mapped PTEs are to the right huge page */ + for (i = 0, addr = haddr, pte = start_pte; + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + struct page *page; + + /* empty pte, skip */ + if (pte_none(*pte)) + continue; + + /* page swapped out, abort */ + if (!pte_present(*pte)) + goto abort; + + page = vm_normal_page(vma, addr, *pte); + + if (!page || !PageCompound(page)) + goto abort; + + if (!hpage) { + hpage = compound_head(page); + /* + * The mapping of the THP should not change. + * + * Note that uprobe, debugger, or MAP_PRIVATE may + * change the page table, but the new page will + * not pass PageCompound() check. + */ + if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping)) + goto abort; + } + + /* + * Confirm the page maps to the correct subpage. + * + * Note that uprobe, debugger, or MAP_PRIVATE may change + * the page table, but the new page will not pass + * PageCompound() check. + */ + if (WARN_ON(hpage + i != page)) + goto abort; + count++; + } + + /* step 2: adjust rmap */ + for (i = 0, addr = haddr, pte = start_pte; + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + struct page *page; + + if (pte_none(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + page_remove_rmap(page, false); + } + + pte_unmap_unlock(start_pte, ptl); + + /* step 3: set proper refcount and mm_counters. */ + if (hpage) { + page_ref_sub(hpage, count); + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); + } + + /* step 4: collapse pmd */ + ptl = pmd_lock(vma->vm_mm, pmd); + _pmd = pmdp_collapse_flush(vma, addr, pmd); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(_pmd)); + return; + +abort: + pte_unmap_unlock(start_pte, ptl); +} + +static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + int i; + + if (likely(mm_slot->nr_pte_mapped_thp == 0)) + return 0; + + if (!down_write_trylock(&mm->mmap_sem)) + return -EBUSY; + + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); + +out: + mm_slot->nr_pte_mapped_thp = 0; + up_write(&mm->mmap_sem); + return 0; +} + static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -1262,7 +1421,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - /* probably overkill */ + /* + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that + * got written to. These VMAs are likely not worth investing + * down_write(mmap_sem) as PMD-mapping is likely to be split + * later. + * + * Not that vma->anon_vma check is racy: it can be set up after + * the check but before we took mmap_sem by the fault path. + * But page lock would prevent establishing any new ptes of the + * page, so we are safe. + * + * An alternative would be drop the check, but check that page + * table is clear before calling pmdp_collapse_flush() under + * ptl. It has higher chance to recover THP for the VMA, but + * has higher cost too. + */ if (vma->anon_vma) continue; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); @@ -1275,9 +1449,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) continue; /* * We need exclusive mmap_sem to retract page table. - * If trylock fails we would end up with pte-mapped THP after - * re-fault. Not ideal, but it's more important to not disturb - * the system too much. + * + * We use trylock due to lock inversion: we need to acquire + * mmap_sem while holding page lock. Fault path does it in + * reverse order. Trylock is a way to avoid deadlock. */ if (down_write_trylock(&vma->vm_mm->mmap_sem)) { spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); @@ -1287,6 +1462,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) up_write(&vma->vm_mm->mmap_sem); mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } else { + /* Try again later */ + khugepaged_add_pte_mapped_thp(vma->vm_mm, addr); } } i_mmap_unlock_write(mapping); @@ -1709,6 +1887,11 @@ static void khugepaged_scan_file(struct mm_struct *mm, { BUILD_BUG(); } + +static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +{ + return 0; +} #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, @@ -1733,6 +1916,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); + khugepaged_collapse_pte_mapped_thps(mm_slot); mm = mm_slot->mm; /* From f385cb85a42fc4ba92464c2bfd2e2049d65353d3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:33 -0700 Subject: [PATCH 1071/2880] uprobe: collapse THP pmd after removing all uprobes After all uprobes are removed from the huge page (with PTE pgtable), it is possible to collapse the pmd and benefit from THP again. This patch does the collapse by calling collapse_pte_mapped_thp(). Link: http://lkml.kernel.org/r/20190815164525.1848545-7-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Kirill A. Shutemov Reported-by: kbuild test robot Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 27b596f14463..94d38a39d72e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -472,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct page *old_page, *new_page; struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; + bool orig_page_huge = false; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); @@ -529,6 +531,9 @@ retry: /* let go new_page */ put_page(new_page); new_page = NULL; + + if (PageCompound(orig_page)) + orig_page_huge = true; } put_page(orig_page); } @@ -547,6 +552,10 @@ put_old: if (ret && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); + /* try collapse pmd for compound page */ + if (!ret && orig_page_huge) + collapse_pte_mapped_thp(mm, vaddr); + return ret; } From 649775be63c8b2e0b56ecc5bbc96d38205ec5259 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:37 -0700 Subject: [PATCH 1072/2880] mm, fs: move randomize_stack_top from fs to mm Patch series "Provide generic top-down mmap layout functions", v6. This series introduces generic functions to make top-down mmap layout easily accessible to architectures, in particular riscv which was the initial goal of this series. The generic implementation was taken from arm64 and used successively by arm, mips and finally riscv. Note that in addition the series fixes 2 issues: - stack randomization was taken into account even if not necessary. - [1] fixed an issue with mmap base which did not take into account randomization but did not report it to arm and mips, so by moving arm64 into a generic library, this problem is now fixed for both architectures. This work is an effort to factorize architecture functions to avoid code duplication and oversights as in [1]. [1]: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1429066.html This patch (of 14): This preparatory commit moves this function so that further introduction of generic topdown mmap layout is contained only in mm/util.c. Link: http://lkml.kernel.org/r/20190730055113.23635-2-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Palmer Dabbelt Cc: Albert Ou Cc: Alexander Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 20 -------------------- include/linux/mm.h | 2 ++ mm/util.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d4e11b2e04f6..cec3b4146440 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -670,26 +670,6 @@ out: * libraries. There is no binary dependent code anywhere else. */ -#ifndef STACK_RND_MASK -#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ -#endif - -static unsigned long randomize_stack_top(unsigned long stack_top) -{ - unsigned long random_variable = 0; - - if (current->flags & PF_RANDOMIZE) { - random_variable = get_random_long(); - random_variable &= STACK_RND_MASK; - random_variable <<= PAGE_SHIFT; - } -#ifdef CONFIG_STACK_GROWSUP - return PAGE_ALIGN(stack_top) + random_variable; -#else - return PAGE_ALIGN(stack_top) - random_variable; -#endif -} - static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 233ad11938db..294a67b94147 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2328,6 +2328,8 @@ extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); +unsigned long randomize_stack_top(unsigned long stack_top); + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, diff --git a/mm/util.c b/mm/util.c index 37f7b6711514..bf8af5e07c4a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include @@ -293,6 +295,26 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } +#ifndef STACK_RND_MASK +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ +#endif + +unsigned long randomize_stack_top(unsigned long stack_top) +{ + unsigned long random_variable = 0; + + if (current->flags & PF_RANDOMIZE) { + random_variable = get_random_long(); + random_variable &= STACK_RND_MASK; + random_variable <<= PAGE_SHIFT; + } +#ifdef CONFIG_STACK_GROWSUP + return PAGE_ALIGN(stack_top) + random_variable; +#else + return PAGE_ALIGN(stack_top) - random_variable; +#endif +} + #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { From 28058ed61fc869d7e67916725a3f7e9de50e606b Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:40 -0700 Subject: [PATCH 1073/2880] arm64: make use of is_compat_task instead of hardcoding this test Each architecture has its own way to determine if a task is a compat task, by using is_compat_task in arch_mmap_rnd, it allows more genericity and then it prepares its moving to mm/. Link: http://lkml.kernel.org/r/20190730055113.23635-3-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Catalin Marinas Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index b050641b5139..bb0140afed66 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -43,7 +43,7 @@ unsigned long arch_mmap_rnd(void) unsigned long rnd; #ifdef CONFIG_COMPAT - if (test_thread_flag(TIF_32BIT)) + if (is_compat_task()) rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); else #endif From e8d54b62c55ab6201de6d195fc2c276294c1f6ae Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:43 -0700 Subject: [PATCH 1074/2880] arm64: consider stack randomization for mmap base only when necessary Do not offset mmap base address because of stack randomization if current task does not want randomization. Note that x86 already implements this behaviour. Link: http://lkml.kernel.org/r/20190730055113.23635-4-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Catalin Marinas Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index bb0140afed66..e4acaead67de 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -54,7 +54,11 @@ unsigned long arch_mmap_rnd(void) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) From 67f3977f805b34cf0e41090679800d2091d41d49 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:47 -0700 Subject: [PATCH 1075/2880] arm64, mm: move generic mmap layout functions to mm arm64 handles top-down mmap layout in a way that can be easily reused by other architectures, so make it available in mm. It then introduces a new config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT that can be set by other architectures to benefit from those functions. Note that this new config depends on MMU being enabled, if selected without MMU support, a warning will be thrown. Link: http://lkml.kernel.org/r/20190730055113.23635-5-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Suggested-by: Christoph Hellwig Acked-by: Catalin Marinas Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 10 ++++ arch/arm64/Kconfig | 1 + arch/arm64/include/asm/processor.h | 2 - arch/arm64/mm/mmap.c | 76 ----------------------------- kernel/sysctl.c | 6 ++- mm/util.c | 78 +++++++++++++++++++++++++++++- 6 files changed, 92 insertions(+), 81 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 0fcf8ec1e098..dfce421b8e8a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -706,6 +706,16 @@ config HAVE_ARCH_COMPAT_MMAP_BASES and vice-versa 32-bit applications to call 64-bit mmap(). Required for applications doing different bitness syscalls. +# This allows to use a set of generic functions to determine mmap base +# address by giving priority to top-down scheme only if the process +# is not in legacy mode (compat task, unlimited stack size or +# sysctl_legacy_va_layout). +# Architecture that selects this option can provide its own version of: +# - STACK_RND_MASK +config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT + bool + depends on MMU + config HAVE_COPY_THREAD_TLS bool help diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 37c610963eee..2d4b7044063c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -71,6 +71,7 @@ config ARM64 select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index c67848c55009..5623685c7d13 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -280,8 +280,6 @@ static inline void spin_lock_prefetch(const void *ptr) "nop") : : "p" (ptr)); } -#define HAVE_ARCH_PICK_MMAP_LAYOUT - extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */ extern void __init minsigstksz_setup(void); diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index e4acaead67de..3028bacbc4e9 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -20,82 +20,6 @@ #include -/* - * Leave enough space between the mmap area and the stack to honour ulimit in - * the face of randomisation. - */ -#define MIN_GAP (SZ_128M) -#define MAX_GAP (STACK_TOP/6*5) - -static int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -unsigned long arch_mmap_rnd(void) -{ - unsigned long rnd; - -#ifdef CONFIG_COMPAT - if (is_compat_task()) - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); - else -#endif - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); - return rnd << PAGE_SHIFT; -} - -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = stack_guard_gap; - - /* Account for stack randomization if necessary */ - if (current->flags & PF_RANDOMIZE) - pad += (STACK_RND_MASK << PAGE_SHIFT); - - /* Values close to RLIM_INFINITY can overflow. */ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(STACK_TOP - gap - rnd); -} - -/* - * This function, called very early during the creation of a new process VM - * image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - /* - * Fall back to the standard layout if the personality bit is set, or - * if the expected stack growth is unlimited: - */ - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} - /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 078950d9605b..00fcea236eba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[]; extern struct ctl_table firmware_config_table[]; #endif -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif @@ -1573,7 +1574,8 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec, .extra1 = SYSCTL_ZERO, }, -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, diff --git a/mm/util.c b/mm/util.c index bf8af5e07c4a..7922726f0a8f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -17,7 +17,12 @@ #include #include #include +#include +#include #include +#include +#include +#include #include @@ -315,7 +320,78 @@ unsigned long randomize_stack_top(unsigned long stack_top) #endif } -#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) +#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +#ifdef CONFIG_ARCH_HAS_ELF_RANDOMIZE +unsigned long arch_mmap_rnd(void) +{ + unsigned long rnd; + +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + if (is_compat_task()) + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); + else +#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); + + return rnd << PAGE_SHIFT; +} +#endif /* CONFIG_ARCH_HAS_ELF_RANDOMIZE */ + +static int mmap_is_legacy(struct rlimit *rlim_stack) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (rlim_stack->rlim_cur == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +/* + * Leave enough space between the mmap area and the stack to honour ulimit in + * the face of randomisation. + */ +#define MIN_GAP (SZ_128M) +#define MAX_GAP (STACK_TOP / 6 * 5) + +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) +{ + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return PAGE_ALIGN(STACK_TOP - gap - rnd); +} + +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +{ + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + + if (mmap_is_legacy(rlim_stack)) { + mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; + mm->get_unmapped_area = arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor, rlim_stack); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + } +} +#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; From e7142bf5d231f3ccdf6ea6764d5080999b8e299d Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:50 -0700 Subject: [PATCH 1076/2880] arm64, mm: make randomization selected by generic topdown mmap layout This commits selects ARCH_HAS_ELF_RANDOMIZE when an arch uses the generic topdown mmap layout functions so that this security feature is on by default. Note that this commit also removes the possibility for arm64 to have elf randomization and no MMU: without MMU, the security added by randomization is worth nothing. Link: http://lkml.kernel.org/r/20190730055113.23635-6-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Catalin Marinas Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 1 + arch/arm64/Kconfig | 1 - arch/arm64/kernel/process.c | 8 -------- mm/util.c | 11 +++++++++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index dfce421b8e8a..5f8a5d84dbbe 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -715,6 +715,7 @@ config HAVE_ARCH_COMPAT_MMAP_BASES config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT bool depends on MMU + select ARCH_HAS_ELF_RANDOMIZE config HAVE_COPY_THREAD_TLS bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 2d4b7044063c..866e05882799 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -15,7 +15,6 @@ config ARM64 select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI - select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 03689c0beb34..a47462def04b 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -557,14 +557,6 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ~0xf; } -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - if (is_compat_task()) - return randomize_page(mm->brk, SZ_32M); - else - return randomize_page(mm->brk, SZ_1G); -} - /* * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. */ diff --git a/mm/util.c b/mm/util.c index 7922726f0a8f..3ad6db9a722e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -321,7 +321,15 @@ unsigned long randomize_stack_top(unsigned long stack_top) } #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT -#ifdef CONFIG_ARCH_HAS_ELF_RANDOMIZE +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + /* Is the current task 32bit ? */ + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) + return randomize_page(mm->brk, SZ_32M); + + return randomize_page(mm->brk, SZ_1G); +} + unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -335,7 +343,6 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -#endif /* CONFIG_ARCH_HAS_ELF_RANDOMIZE */ static int mmap_is_legacy(struct rlimit *rlim_stack) { From af0f4297286f13a75edf93677b1fb2fc16c412a7 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:54 -0700 Subject: [PATCH 1077/2880] arm: properly account for stack randomization and stack guard gap This commit takes care of stack randomization and stack guard gap when computing mmap base address and checks if the task asked for randomization. This fixes the problem uncovered and not fixed for arm here: https://lkml.kernel.org/r/20170622200033.25714-1-riel@redhat.com Link: http://lkml.kernel.org/r/20190730055113.23635-7-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mmap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index f866870db749..bff3d00bda5b 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -18,8 +18,9 @@ (((pgoff)<> (PAGE_SHIFT - 12)) static int mmap_is_legacy(struct rlimit *rlim_stack) { @@ -35,6 +36,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; if (gap < MIN_GAP) gap = MIN_GAP; From 86e568e9c0525fc40e76d827212d5e9721cf7504 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:57 -0700 Subject: [PATCH 1078/2880] arm: use STACK_TOP when computing mmap base address mmap base address must be computed wrt stack top address, using TASK_SIZE is wrong since STACK_TOP and TASK_SIZE are not equivalent. Link: http://lkml.kernel.org/r/20190730055113.23635-8-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index bff3d00bda5b..0b94b674aa91 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -19,7 +19,7 @@ /* gap between mmap and stack */ #define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((TASK_SIZE)/6*5) +#define MAX_GAP ((STACK_TOP)/6*5) #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) static int mmap_is_legacy(struct rlimit *rlim_stack) @@ -51,7 +51,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - rnd); + return PAGE_ALIGN(STACK_TOP - gap - rnd); } /* From dba79c3df4a2275132759b0bc04c64b7a510af4a Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:01 -0700 Subject: [PATCH 1079/2880] arm: use generic mmap top-down layout and brk randomization arm uses a top-down mmap layout by default that exactly fits the generic functions, so get rid of arch specific code and use the generic version by selecting ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT. As ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT selects ARCH_HAS_ELF_RANDOMIZE, use the generic version of arch_randomize_brk since it also fits. Note that this commit also removes the possibility for arm to have elf randomization and no MMU: without MMU, the security added by randomization is worth nothing. Note that it is safe to remove STACK_RND_MASK since it matches the default value. Link: http://lkml.kernel.org/r/20190730055113.23635-9-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/arm/include/asm/processor.h | 2 -- arch/arm/kernel/process.c | 5 --- arch/arm/mm/mmap.c | 62 -------------------------------- 4 files changed, 1 insertion(+), 69 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 229f2cdd81ca..8a50efb559f3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -34,6 +34,7 @@ config ARM select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select BINFMT_FLAT_ARGVP_ENVP_ON_STACK select BUILDTIME_EXTABLE_SORT if MMU diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 20c2f42454b8..614bf829e454 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -140,8 +140,6 @@ static inline void prefetchw(const void *ptr) #endif #endif -#define HAVE_ARCH_PICK_MMAP_LAYOUT - #endif #endif /* __ASM_ARM_PROCESSOR_H */ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index f934a6739fc0..9485acc520a4 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -319,11 +319,6 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - return randomize_page(mm->brk, 0x02000000); -} - #ifdef CONFIG_MMU #ifdef CONFIG_KUSER_HELPERS /* diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 0b94b674aa91..b8d912ac9e61 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -17,43 +17,6 @@ ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ (((pgoff)<> (PAGE_SHIFT - 12)) - -static int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = stack_guard_gap; - - /* Account for stack randomization if necessary */ - if (current->flags & PF_RANDOMIZE) - pad += (STACK_RND_MASK << PAGE_SHIFT); - - /* Values close to RLIM_INFINITY can overflow. */ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(STACK_TOP - gap - rnd); -} - /* * We need to ensure that shared mappings are correctly aligned to * avoid aliasing issues with VIPT caches. We need to ensure that @@ -181,31 +144,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } -unsigned long arch_mmap_rnd(void) -{ - unsigned long rnd; - - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); - - return rnd << PAGE_SHIFT; -} - -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} - /* * You really shouldn't be using read() or write() on /dev/mem. This * might go away in the future. From b1f61b5bde3a1f50392c97b4c8513d1b8efb1cf2 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:04 -0700 Subject: [PATCH 1080/2880] mips: properly account for stack randomization and stack guard gap This commit takes care of stack randomization and stack guard gap when computing mmap base address and checks if the task asked for randomization. This fixes the problem uncovered and not fixed for arm here: https://lkml.kernel.org/r/20170622200033.25714-1-riel@redhat.com Link: http://lkml.kernel.org/r/20190730055113.23635-10-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Acked-by: Paul Burton Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/mmap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index d79f2b432318..f5c778113384 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -21,8 +21,9 @@ unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); /* gap between mmap and stack */ -#define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((TASK_SIZE)/6*5) +#define MIN_GAP (128*1024*1024UL) +#define MAX_GAP ((TASK_SIZE)/6*5) +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) static int mmap_is_legacy(struct rlimit *rlim_stack) { @@ -38,6 +39,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; if (gap < MIN_GAP) gap = MIN_GAP; From b5fb861790bf54486b68644fc27d6969bf772dd8 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:07 -0700 Subject: [PATCH 1081/2880] mips: use STACK_TOP when computing mmap base address mmap base address must be computed wrt stack top address, using TASK_SIZE is wrong since STACK_TOP and TASK_SIZE are not equivalent. Link: http://lkml.kernel.org/r/20190730055113.23635-11-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Acked-by: Paul Burton Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index f5c778113384..a7e84b2e71d7 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL(shm_align_mask); /* gap between mmap and stack */ #define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((TASK_SIZE)/6*5) +#define MAX_GAP ((STACK_TOP)/6*5) #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) static int mmap_is_legacy(struct rlimit *rlim_stack) @@ -54,7 +54,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - rnd); + return PAGE_ALIGN(STACK_TOP - gap - rnd); } #define COLOUR_ALIGN(addr, pgoff) \ From e548599fbe310754aa8f687d53c24d9cb5338ac4 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:11 -0700 Subject: [PATCH 1082/2880] mips: adjust brk randomization offset to fit generic version This commit simply bumps up to 32MB and 1GB the random offset of brk, compared to 8MB and 256MB, for 32bit and 64bit respectively. Link: http://lkml.kernel.org/r/20190730055113.23635-12-alex@ghiti.fr Suggested-by: Kees Cook Signed-off-by: Alexandre Ghiti Acked-by: Paul Burton Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/mmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index a7e84b2e71d7..ff6ab87e9c56 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -16,6 +16,7 @@ #include #include #include +#include unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); @@ -189,11 +190,11 @@ static inline unsigned long brk_rnd(void) unsigned long rnd = get_random_long(); rnd = rnd << PAGE_SHIFT; - /* 8MB for 32bit, 256MB for 64bit */ + /* 32MB for 32bit, 1GB for 64bit */ if (TASK_IS_32BIT_ADDR) - rnd = rnd & 0x7ffffful; + rnd = rnd & (SZ_32M - 1); else - rnd = rnd & 0xffffffful; + rnd = rnd & (SZ_1G - 1); return rnd; } From 09036468c8d074b730a840657a896f81c1c92017 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:14 -0700 Subject: [PATCH 1083/2880] mips: replace arch specific way to determine 32bit task with generic version Mips uses TASK_IS_32BIT_ADDR to determine if a task is 32bit, but this define is mips specific and other arches do not have it: instead, use !IS_ENABLED(CONFIG_64BIT) || is_compat_task() condition. Link: http://lkml.kernel.org/r/20190730055113.23635-13-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Paul Burton Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/mmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index ff6ab87e9c56..d5106c26ac6a 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -17,6 +17,7 @@ #include #include #include +#include unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); @@ -191,7 +192,7 @@ static inline unsigned long brk_rnd(void) rnd = rnd << PAGE_SHIFT; /* 32MB for 32bit, 1GB for 64bit */ - if (TASK_IS_32BIT_ADDR) + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) rnd = rnd & (SZ_32M - 1); else rnd = rnd & (SZ_1G - 1); From 9035bd29427921cd32d268a830aff78dcafb945b Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:18 -0700 Subject: [PATCH 1084/2880] mips: use generic mmap top-down layout and brk randomization mips uses a top-down layout by default that exactly fits the generic functions, so get rid of arch specific code and use the generic version by selecting ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT. As ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT selects ARCH_HAS_ELF_RANDOMIZE, use the generic version of arch_randomize_brk since it also fits. Note that this commit also removes the possibility for mips to have elf randomization and no MMU: without MMU, the security added by randomization is worth nothing. Link: http://lkml.kernel.org/r/20190730055113.23635-14-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Paul Burton Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/Kconfig | 2 +- arch/mips/include/asm/processor.h | 5 -- arch/mips/mm/mmap.c | 96 ------------------------------- 3 files changed, 1 insertion(+), 102 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cc8e2b1032a5..a0bd9bdb5f83 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -5,7 +5,6 @@ config MIPS select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT select ARCH_CLOCKSOURCE_DATA - select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_SUPPORTS_UPROBES @@ -13,6 +12,7 @@ config MIPS select ARCH_USE_CMPXCHG_LOCKREF if 64BIT select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index aca909bd7841..fba18d4a9190 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -29,11 +29,6 @@ extern unsigned int vced_count, vcei_count; -/* - * MIPS does have an arch_pick_mmap_layout() - */ -#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 - #ifdef CONFIG_32BIT #ifdef CONFIG_KVM_GUEST /* User space process size is limited to 1GB in KVM Guest Mode */ diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index d5106c26ac6a..00fe90c6db3e 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -16,49 +16,10 @@ #include #include #include -#include -#include unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); -/* gap between mmap and stack */ -#define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((STACK_TOP)/6*5) -#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) - -static int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = stack_guard_gap; - - /* Account for stack randomization if necessary */ - if (current->flags & PF_RANDOMIZE) - pad += (STACK_RND_MASK << PAGE_SHIFT); - - /* Values close to RLIM_INFINITY can overflow. */ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(STACK_TOP - gap - rnd); -} - #define COLOUR_ALIGN(addr, pgoff) \ ((((addr) + shm_align_mask) & ~shm_align_mask) + \ (((pgoff) << PAGE_SHIFT) & shm_align_mask)) @@ -156,63 +117,6 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, addr0, len, pgoff, flags, DOWN); } -unsigned long arch_mmap_rnd(void) -{ - unsigned long rnd; - -#ifdef CONFIG_COMPAT - if (TASK_IS_32BIT_ADDR) - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); - else -#endif /* CONFIG_COMPAT */ - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); - - return rnd << PAGE_SHIFT; -} - -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} - -static inline unsigned long brk_rnd(void) -{ - unsigned long rnd = get_random_long(); - - rnd = rnd << PAGE_SHIFT; - /* 32MB for 32bit, 1GB for 64bit */ - if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) - rnd = rnd & (SZ_32M - 1); - else - rnd = rnd & (SZ_1G - 1); - - return rnd; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long base = mm->brk; - unsigned long ret; - - ret = PAGE_ALIGN(base + brk_rnd()); - - if (ret < mm->brk) - return mm->brk; - - return ret; -} - bool __virt_addr_valid(const volatile void *kaddr) { unsigned long vaddr = (unsigned long)kaddr; From 54c95a11cc1b5e1d578209e027adf5775395dafd Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:21 -0700 Subject: [PATCH 1085/2880] riscv: make mmap allocation top-down by default In order to avoid wasting user address space by using bottom-up mmap allocation scheme, prefer top-down scheme when possible. Before: root@qemuriscv64:~# cat /proc/self/maps 00010000-00016000 r-xp 00000000 fe:00 6389 /bin/cat.coreutils 00016000-00017000 r--p 00005000 fe:00 6389 /bin/cat.coreutils 00017000-00018000 rw-p 00006000 fe:00 6389 /bin/cat.coreutils 00018000-00039000 rw-p 00000000 00:00 0 [heap] 1555556000-155556d000 r-xp 00000000 fe:00 7193 /lib/ld-2.28.so 155556d000-155556e000 r--p 00016000 fe:00 7193 /lib/ld-2.28.so 155556e000-155556f000 rw-p 00017000 fe:00 7193 /lib/ld-2.28.so 155556f000-1555570000 rw-p 00000000 00:00 0 1555570000-1555572000 r-xp 00000000 00:00 0 [vdso] 1555574000-1555576000 rw-p 00000000 00:00 0 1555576000-1555674000 r-xp 00000000 fe:00 7187 /lib/libc-2.28.so 1555674000-1555678000 r--p 000fd000 fe:00 7187 /lib/libc-2.28.so 1555678000-155567a000 rw-p 00101000 fe:00 7187 /lib/libc-2.28.so 155567a000-15556a0000 rw-p 00000000 00:00 0 3fffb90000-3fffbb1000 rw-p 00000000 00:00 0 [stack] After: root@qemuriscv64:~# cat /proc/self/maps 00010000-00016000 r-xp 00000000 fe:00 6389 /bin/cat.coreutils 00016000-00017000 r--p 00005000 fe:00 6389 /bin/cat.coreutils 00017000-00018000 rw-p 00006000 fe:00 6389 /bin/cat.coreutils 2de81000-2dea2000 rw-p 00000000 00:00 0 [heap] 3ff7eb6000-3ff7ed8000 rw-p 00000000 00:00 0 3ff7ed8000-3ff7fd6000 r-xp 00000000 fe:00 7187 /lib/libc-2.28.so 3ff7fd6000-3ff7fda000 r--p 000fd000 fe:00 7187 /lib/libc-2.28.so 3ff7fda000-3ff7fdc000 rw-p 00101000 fe:00 7187 /lib/libc-2.28.so 3ff7fdc000-3ff7fe2000 rw-p 00000000 00:00 0 3ff7fe4000-3ff7fe6000 r-xp 00000000 00:00 0 [vdso] 3ff7fe6000-3ff7ffd000 r-xp 00000000 fe:00 7193 /lib/ld-2.28.so 3ff7ffd000-3ff7ffe000 r--p 00016000 fe:00 7193 /lib/ld-2.28.so 3ff7ffe000-3ff7fff000 rw-p 00017000 fe:00 7193 /lib/ld-2.28.so 3ff7fff000-3ff8000000 rw-p 00000000 00:00 0 3fff888000-3fff8a9000 rw-p 00000000 00:00 0 [stack] [alex@ghiti.fr: v6] Link: http://lkml.kernel.org/r/20190808061756.19712-15-alex@ghiti.fr Link: http://lkml.kernel.org/r/20190730055113.23635-15-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Christoph Hellwig Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Acked-by: Paul Walmsley [arch/riscv] Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/riscv/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 71d29fb4008a..8eebbc8860bb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -59,6 +59,18 @@ config RISCV select ARCH_HAS_GIGANTIC_PAGE select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select SPARSEMEM_STATIC if 32BIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU + select HAVE_ARCH_MMAP_RND_BITS + +config ARCH_MMAP_RND_BITS_MIN + default 18 if 64BIT + default 8 + +# max bits determined by the following formula: +# VA_BITS - PAGE_SHIFT - 3 +config ARCH_MMAP_RND_BITS_MAX + default 24 if 64BIT # SV39 based + default 17 config MMU def_bool y From 73848a9711105836346462e1d3c6b5765b452de1 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 23 Sep 2019 15:39:25 -0700 Subject: [PATCH 1086/2880] mm/mmap.c: refine find_vma_prev() with rb_last() When addr is out of range of the whole rb_tree, pprev will point to the right-most node. rb_tree facility already provides a helper function, rb_last(), to do this task. We can leverage this instead of reimplementing it. This patch refines find_vma_prev() with rb_last() to make it a little nicer to read. [akpm@linux-foundation.org: little cleanup, per Vlastimil] Link: http://lkml.kernel.org/r/20190809001928.4950-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 6bc21fca20bc..e4a8f67aad62 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2274,12 +2274,9 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, if (vma) { *pprev = vma->vm_prev; } else { - struct rb_node *rb_node = mm->mm_rb.rb_node; - *pprev = NULL; - while (rb_node) { - *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); - rb_node = rb_node->rb_right; - } + struct rb_node *rb_node = rb_last(&mm->mm_rb); + + *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; } return vma; } From 76f34950779f3d7847c94615232dff2cdc2f9844 Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Mon, 23 Sep 2019 15:39:28 -0700 Subject: [PATCH 1087/2880] mm: mmap: increase sockets maximum memory size pgoff for 32bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AF_XDP sockets umem mapping interface uses XDP_UMEM_PGOFF_FILL_RING and XDP_UMEM_PGOFF_COMPLETION_RING offsets. These offsets are established already and are part of the configuration interface. But for 32-bit systems, using AF_XDP socket configuration, these values are too large to pass the maximum allowed file size verification. The offsets can be tuned off, but instead of changing the existing interface, let's extend the max allowed file size for sockets. No one has been using this until this patch with 32 bits as without this fix af_xdp sockets can't be used at all, so it unblocks af_xdp socket usage for 32bit systems. All list of mmap cbs for sockets was verified for side effects and all of them contain dummy cb - sock_no_mmap() at this moment, except the following: xsk_mmap() - it's what this fix is needed for. tcp_mmap() - doesn't have obvious issues with pgoff - no any references on it. packet_mmap() - return -EINVAL if it's even set. Link: http://lkml.kernel.org/r/20190812124326.32146-1-ivan.khoronzhuk@linaro.org Signed-off-by: Ivan Khoronzhuk Reviewed-by: Andrew Morton Cc: Björn Töpel Cc: Alexei Starovoitov Cc: Magnus Karlsson Cc: Daniel Borkmann Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index e4a8f67aad62..f1e8c7f93e04 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1358,6 +1358,9 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; + if (S_ISSOCK(inode->i_mode)) + return MAX_LFS_FILESIZE; + /* Special "we do even unsigned file positions" case */ if (file->f_mode & FMODE_UNSIGNED_OFFSET) return 0; From f3bc0dba3154fa7e4a728e0be8f37bd5fb9100fe Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:39:31 -0700 Subject: [PATCH 1088/2880] mm/madvise: reduce code duplication in error handling paths madvise_behavior() converts -ENOMEM to -EAGAIN in several places using identical code. Move that code to a common error handling path. No functional changes. Link: http://lkml.kernel.org/r/1564640896-1210-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Pankaj Gupta Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 52 ++++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 88babcc384b9..68ab988ad433 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -107,28 +107,14 @@ static long madvise_behavior(struct vm_area_struct *vma, case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; break; } @@ -154,15 +140,8 @@ static long madvise_behavior(struct vm_area_struct *vma, goto out; } error = __split_vma(mm, vma, start, 1); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; } if (end != vma->vm_end) { @@ -171,15 +150,8 @@ static long madvise_behavior(struct vm_area_struct *vma, goto out; } error = __split_vma(mm, vma, end, 0); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; } success: @@ -187,6 +159,14 @@ success: * vm_flags is protected by the mmap_sem held in write mode. */ vma->vm_flags = new_flags; + +out_convert_errno: + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; out: return error; } From 28eb3c80871951cae59b9c1b7385262e5d045ac3 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Mon, 23 Sep 2019 15:39:34 -0700 Subject: [PATCH 1089/2880] shmem: fix obsolete comment in shmem_getpage_gfp() Replace "fault_mm" with "vmf" in code comment because commit cfda05267f7b ("userfaultfd: shmem: add userfaultfd hook for shared memory faults") has changed the prototpye of shmem_getpage_gfp() - pass vmf instead of fault_mm to the function. Before: static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); After: static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, struct vm_fault *vmf, vm_fault_t *fault_type); Link: http://lkml.kernel.org/r/20190816100204.9781-1-miles.chen@mediatek.com Signed-off-by: Miles Chen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 57a6aedf6649..30ce722c23fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1734,7 +1734,7 @@ unlock: * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * fault_mm and fault_type are only supplied by shmem_fault: + * vmf and fault_type are only supplied by shmem_fault: * otherwise they are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, From c165f25d23ecb2f9f121ced20435415b931219e2 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Mon, 23 Sep 2019 15:39:37 -0700 Subject: [PATCH 1090/2880] zpool: add malloc_support_movable to zpool_driver As a zpool_driver, zsmalloc can allocate movable memory because it support migate pages. But zbud and z3fold cannot allocate movable memory. Add malloc_support_movable to zpool_driver. If a zpool_driver support allocate movable memory, set it to true. And add zpool_malloc_support_movable check malloc_support_movable to make sure if a zpool support allocate movable memory. Link: http://lkml.kernel.org/r/20190605100630.13293-1-teawaterz@linux.alibaba.com Signed-off-by: Hui Zhu Reviewed-by: Shakeel Butt Cc: Dan Streetman Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 3 +++ mm/zpool.c | 16 ++++++++++++++++ mm/zsmalloc.c | 19 ++++++++++--------- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 7238865e75b0..51bf43076165 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -46,6 +46,8 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); +bool zpool_malloc_support_movable(struct zpool *pool); + int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, unsigned long *handle); @@ -90,6 +92,7 @@ struct zpool_driver { struct zpool *zpool); void (*destroy)(void *pool); + bool malloc_support_movable; int (*malloc)(void *pool, size_t size, gfp_t gfp, unsigned long *handle); void (*free)(void *pool, unsigned long handle); diff --git a/mm/zpool.c b/mm/zpool.c index a2dd9107857d..863669212070 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -238,6 +238,22 @@ const char *zpool_get_type(struct zpool *zpool) return zpool->driver->type; } +/** + * zpool_malloc_support_movable() - Check if the zpool support + * allocate movable memory + * @zpool: The zpool to check + * + * This returns if the zpool support allocate movable memory. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: true if if the zpool support allocate movable memory, false if not + */ +bool zpool_malloc_support_movable(struct zpool *zpool) +{ + return zpool->driver->malloc_support_movable; +} + /** * zpool_malloc() - Allocate memory * @zpool: The zpool to allocate from. diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e98bb6ab4f7e..646858efd468 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -443,15 +443,16 @@ static u64 zs_zpool_total_size(void *pool) } static struct zpool_driver zs_zpool_driver = { - .type = "zsmalloc", - .owner = THIS_MODULE, - .create = zs_zpool_create, - .destroy = zs_zpool_destroy, - .malloc = zs_zpool_malloc, - .free = zs_zpool_free, - .map = zs_zpool_map, - .unmap = zs_zpool_unmap, - .total_size = zs_zpool_total_size, + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc_support_movable = true, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, }; MODULE_ALIAS("zpool-zsmalloc"); From d2fcd82bb83aab47c6d63aa8c960cd5edb578065 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Mon, 23 Sep 2019 15:39:40 -0700 Subject: [PATCH 1091/2880] zswap: use movable memory if zpool support allocate movable memory This is the third version that was updated according to the comments from Sergey Senozhatsky https://lkml.org/lkml/2019/5/29/73 and Shakeel Butt https://lkml.org/lkml/2019/6/4/973 zswap compresses swap pages into a dynamically allocated RAM-based memory pool. The memory pool should be zbud, z3fold or zsmalloc. All of them will allocate unmovable pages. It will increase the number of unmovable page blocks that will bad for anti-fragment. zsmalloc support page migration if request movable page: handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); And commit "zpool: Add malloc_support_movable to zpool_driver" add zpool_malloc_support_movable check malloc_support_movable to make sure if a zpool support allocate movable memory. This commit let zswap allocate block with gfp __GFP_HIGHMEM | __GFP_MOVABLE if zpool support allocate movable memory. Following part is test log in a pc that has 8G memory and 2G swap. Without this commit: ~# echo lz4 > /sys/module/zswap/parameters/compressor ~# echo zsmalloc > /sys/module/zswap/parameters/zpool ~# echo 1 > /sys/module/zswap/parameters/enabled ~# swapon /swapfile ~# cd /home/teawater/kernel/vm-scalability/ /home/teawater/kernel/vm-scalability# export unit_size=$((9 * 1024 * 1024 * 1024)) /home/teawater/kernel/vm-scalability# ./case-anon-w-seq 2717908992 bytes / 4826062 usecs = 549973 KB/s 2717908992 bytes / 4864201 usecs = 545661 KB/s 2717908992 bytes / 4867015 usecs = 545346 KB/s 2717908992 bytes / 4915485 usecs = 539968 KB/s 397853 usecs to free memory 357820 usecs to free memory 421333 usecs to free memory 420454 usecs to free memory /home/teawater/kernel/vm-scalability# cat /proc/pagetypeinfo Page block order: 9 Pages per block: 512 Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 Node 0, zone DMA, type Unmovable 1 1 1 0 2 1 1 0 1 0 0 Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 1 3 Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type Unmovable 6 5 8 6 6 5 4 1 1 1 0 Node 0, zone DMA32, type Movable 25 20 20 19 22 15 14 11 11 5 767 Node 0, zone DMA32, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type Unmovable 4753 5588 5159 4613 3712 2520 1448 594 188 11 0 Node 0, zone Normal, type Movable 16 3 457 2648 2143 1435 860 459 223 224 296 Node 0, zone Normal, type Reclaimable 0 0 44 38 11 2 0 0 0 0 0 Node 0, zone Normal, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Number of blocks type Unmovable Movable Reclaimable HighAtomic CMA Isolate Node 0, zone DMA 1 7 0 0 0 0 Node 0, zone DMA32 4 1652 0 0 0 0 Node 0, zone Normal 931 1485 15 0 0 0 With this commit: ~# echo lz4 > /sys/module/zswap/parameters/compressor ~# echo zsmalloc > /sys/module/zswap/parameters/zpool ~# echo 1 > /sys/module/zswap/parameters/enabled ~# swapon /swapfile ~# cd /home/teawater/kernel/vm-scalability/ /home/teawater/kernel/vm-scalability# export unit_size=$((9 * 1024 * 1024 * 1024)) /home/teawater/kernel/vm-scalability# ./case-anon-w-seq 2717908992 bytes / 4689240 usecs = 566020 KB/s 2717908992 bytes / 4760605 usecs = 557535 KB/s 2717908992 bytes / 4803621 usecs = 552543 KB/s 2717908992 bytes / 5069828 usecs = 523530 KB/s 431546 usecs to free memory 383397 usecs to free memory 456454 usecs to free memory 224487 usecs to free memory /home/teawater/kernel/vm-scalability# cat /proc/pagetypeinfo Page block order: 9 Pages per block: 512 Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 Node 0, zone DMA, type Unmovable 1 1 1 0 2 1 1 0 1 0 0 Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 1 3 Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type Unmovable 10 8 10 9 10 4 3 2 3 0 0 Node 0, zone DMA32, type Movable 18 12 14 16 16 11 9 5 5 6 775 Node 0, zone DMA32, type Reclaimable 0 0 0 0 0 0 0 0 0 0 1 Node 0, zone DMA32, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type Unmovable 2669 1236 452 118 37 14 4 1 2 3 0 Node 0, zone Normal, type Movable 3850 6086 5274 4327 3510 2494 1520 934 438 220 470 Node 0, zone Normal, type Reclaimable 56 93 155 124 47 31 17 7 3 0 0 Node 0, zone Normal, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Number of blocks type Unmovable Movable Reclaimable HighAtomic CMA Isolate Node 0, zone DMA 1 7 0 0 0 0 Node 0, zone DMA32 4 1650 2 0 0 0 Node 0, zone Normal 79 2326 26 0 0 0 You can see that the number of unmovable page blocks is decreased when the kernel has this commit. Link: http://lkml.kernel.org/r/20190605100630.13293-2-teawaterz@linux.alibaba.com Signed-off-by: Hui Zhu Reviewed-by: Shakeel Butt Cc: Dan Streetman Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0e22744a76cb..08b6cefae5d8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -997,6 +997,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, char *buf; u8 *src, *dst; struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; + gfp_t gfp; /* THP isn't supported */ if (PageTransHuge(page)) { @@ -1070,9 +1071,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; - ret = zpool_malloc(entry->pool->zpool, hlen + dlen, - __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, - &handle); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(entry->pool->zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; From 068619e32ff6229a09407d267e36ea7710b96ea1 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 23 Sep 2019 15:39:43 -0700 Subject: [PATCH 1092/2880] zswap: do not map same object twice zswap_writeback_entry() maps a handle to read swpentry first, and then in the most common case it would map the same handle again. This is ok when zbud is the backend since its mapping callback is plain and simple, but it slows things down for z3fold. Since there's hardly a point in unmapping a handle _that_ fast as zswap_writeback_entry() does when it reads swpentry, the suggestion is to keep the handle mapped till the end. Link: http://lkml.kernel.org/r/20190916004640.b453167d3556c4093af4cf7d@gmail.com Signed-off-by: Vitaly Wool Reviewed-by: Dan Streetman Cc: Shakeel Butt Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 08b6cefae5d8..46a322316e52 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -856,7 +856,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* extract swpentry from data */ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ - zpool_unmap_handle(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); @@ -866,6 +865,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) if (!entry) { /* entry was invalidated */ spin_unlock(&tree->lock); + zpool_unmap_handle(pool, handle); return 0; } spin_unlock(&tree->lock); @@ -886,15 +886,13 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, - ZPOOL_MM_RO) + sizeof(struct zswap_header); + src = (u8 *)zhdr + sizeof(struct zswap_header); dst = kmap_atomic(page); tfm = *get_cpu_ptr(entry->pool->tfm); ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); put_cpu_ptr(entry->pool->tfm); kunmap_atomic(dst); - zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -940,6 +938,7 @@ fail: spin_unlock(&tree->lock); end: + zpool_unmap_handle(pool, handle); return ret; } From 2b38d01b4de8b1bbda7f5f7e91252609557635fc Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:39:46 -0700 Subject: [PATCH 1093/2880] mm/zsmalloc.c: fix a -Wunused-function warning set_zspage_inuse() was introduced in the commit 4f42047bbde0 ("zsmalloc: use accessor") but all the users of it were removed later by the commits, bdb0af7ca8f0 ("zsmalloc: factor page chain functionality out") 3783689a1aa8 ("zsmalloc: introduce zspage structure") so the function can be safely removed now. Link: http://lkml.kernel.org/r/1568658408-19374-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 646858efd468..2b2b9aae8a3c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -477,10 +477,6 @@ static inline int get_zspage_inuse(struct zspage *zspage) return zspage->inuse; } -static inline void set_zspage_inuse(struct zspage *zspage, int val) -{ - zspage->inuse = val; -} static inline void mod_zspage_inuse(struct zspage *zspage, int val) { From 981c107cbb420ee028f8ecd155352cfd6351c246 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 10 Sep 2019 21:11:37 +0100 Subject: [PATCH 1094/2880] selftests/tpm2: Add the missing TEST_FILES assignment The Python files required by the selftests are not packaged because of the missing assignment to TEST_FILES. Add the assignment. Cc: stable@vger.kernel.org Fixes: 6ea3dfe1e073 ("selftests: add TPM 2.0 tests") Signed-off-by: Jarkko Sakkinen Reviewed-by: Petr Vorel --- tools/testing/selftests/tpm2/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/tpm2/Makefile b/tools/testing/selftests/tpm2/Makefile index 9dd848427a7b..bf401f725eef 100644 --- a/tools/testing/selftests/tpm2/Makefile +++ b/tools/testing/selftests/tpm2/Makefile @@ -2,3 +2,4 @@ include ../lib.mk TEST_PROGS := test_smoke.sh test_space.sh +TEST_FILES := tpm2.py tpm2_tests.py From 34cd83bb8a468b24d9a13adbfb5ad645c552b8e2 Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Wed, 11 Sep 2019 11:34:42 +0200 Subject: [PATCH 1095/2880] selftests/tpm2: Add log and *.pyc to .gitignore Fixes: 6ea3dfe1e073 ("selftests: add TPM 2.0 tests") Signed-off-by: Petr Vorel Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- tools/testing/selftests/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/.gitignore b/tools/testing/selftests/.gitignore index 8059ce834247..61df01cdf0b2 100644 --- a/tools/testing/selftests/.gitignore +++ b/tools/testing/selftests/.gitignore @@ -2,3 +2,5 @@ gpiogpio-event-mon gpiogpio-hammer gpioinclude/ gpiolsgpio +tpm2/SpaceTest.log +tpm2/*.pyc From 9f75c82246313d4c2a6bc77e947b45655b3b5ad5 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 13 Sep 2019 20:51:36 +0200 Subject: [PATCH 1096/2880] KEYS: trusted: correctly initialize digests and fix locking issue Commit 0b6cf6b97b7e ("tpm: pass an array of tpm_extend_digest structures to tpm_pcr_extend()") modifies tpm_pcr_extend() to accept a digest for each PCR bank. After modification, tpm_pcr_extend() expects that digests are passed in the same order as the algorithms set in chip->allocated_banks. This patch fixes two issues introduced in the last iterations of the patch set: missing initialization of the TPM algorithm ID in the tpm_digest structures passed to tpm_pcr_extend() by the trusted key module, and unreleased locks in the TPM driver due to returning from tpm_pcr_extend() without calling tpm_put_ops(). Cc: stable@vger.kernel.org Fixes: 0b6cf6b97b7e ("tpm: pass an array of tpm_extend_digest structures to tpm_pcr_extend()") Signed-off-by: Roberto Sassu Suggested-by: Jarkko Sakkinen Reviewed-by: Jerry Snitselaar Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-interface.c | 14 +++++++++----- security/keys/trusted.c | 5 +++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index 1b4f95c13e00..d9ace5480665 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -320,18 +320,22 @@ int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, if (!chip) return -ENODEV; - for (i = 0; i < chip->nr_allocated_banks; i++) - if (digests[i].alg_id != chip->allocated_banks[i].alg_id) - return -EINVAL; + for (i = 0; i < chip->nr_allocated_banks; i++) { + if (digests[i].alg_id != chip->allocated_banks[i].alg_id) { + rc = EINVAL; + goto out; + } + } if (chip->flags & TPM_CHIP_FLAG_TPM2) { rc = tpm2_pcr_extend(chip, pcr_idx, digests); - tpm_put_ops(chip); - return rc; + goto out; } rc = tpm1_pcr_extend(chip, pcr_idx, digests[0].digest, "attempting extend a PCR value"); + +out: tpm_put_ops(chip); return rc; } diff --git a/security/keys/trusted.c b/security/keys/trusted.c index ade699131065..1fbd77816610 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -1228,11 +1228,16 @@ hashalg_fail: static int __init init_digests(void) { + int i; + digests = kcalloc(chip->nr_allocated_banks, sizeof(*digests), GFP_KERNEL); if (!digests) return -ENOMEM; + for (i = 0; i < chip->nr_allocated_banks; i++) + digests[i].alg_id = chip->allocated_banks[i].alg_id; + return 0; } From c980ecff4761f3c446f817430547781298815aa9 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 16 Aug 2019 01:12:00 +0300 Subject: [PATCH 1097/2880] MAINTAINERS: keys: Update path to trusted.h Update MAINTAINERS record to reflect that trusted.h was moved to a different directory in commit 22447981fc05 ("KEYS: Move trusted.h to include/keys [ver #2]"). Cc: Denis Kenzior Cc: James Bottomley Cc: Jarkko Sakkinen Cc: Mimi Zohar Cc: linux-integrity@vger.kernel.org Signed-off-by: Denis Efremov Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 54f1286087e9..ec807d446854 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9050,7 +9050,7 @@ S: Supported F: Documentation/security/keys/trusted-encrypted.rst F: include/keys/trusted-type.h F: security/keys/trusted.c -F: security/keys/trusted.h +F: include/keys/trusted.h KEYS/KEYRINGS: M: David Howells From e13cd21ffd50a07b55dcc4d8c38cedf27f28eaa1 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 16 Sep 2019 11:38:34 +0300 Subject: [PATCH 1098/2880] tpm: Wrap the buffer from the caller to tpm_buf in tpm_send() tpm_send() does not give anymore the result back to the caller. This would require another memcpy(), which kind of tells that the whole approach is somewhat broken. Instead, as Mimi suggested, this commit just wraps the data to the tpm_buf, and thus the result will not go to the garbage. Obviously this assumes from the caller that it passes large enough buffer, which makes the whole API somewhat broken because it could be different size than @buflen but since trusted keys is the only module using this API right now I think that this fix is sufficient for the moment. In the near future the plan is to replace the parameters with a tpm_buf created by the caller. Reported-by: Mimi Zohar Suggested-by: Mimi Zohar Cc: stable@vger.kernel.org Fixes: 412eb585587a ("use tpm_buf in tpm_transmit_cmd() as the IO parameter") Signed-off-by: Jarkko Sakkinen Reviewed-by: Jerry Snitselaar --- drivers/char/tpm/tpm-interface.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index d9ace5480665..d7a3888ad80f 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -358,14 +358,9 @@ int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen) if (!chip) return -ENODEV; - rc = tpm_buf_init(&buf, 0, 0); - if (rc) - goto out; - - memcpy(buf.data, cmd, buflen); + buf.data = cmd; rc = tpm_transmit_cmd(chip, &buf, 0, "attempting to a send a command"); - tpm_buf_destroy(&buf); -out: + tpm_put_ops(chip); return rc; } From 131ea1ed3322c6ec06eb8f276f226c8a1f3bbf1b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 24 Sep 2019 23:27:34 -0500 Subject: [PATCH 1099/2880] smb3: Add missing reparse tags Additional reparse tags were described for WSL and file sync. Add missing defines for these tags. Some will be useful for POSIX extensions (as discussed at Storage Developer Conference). Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/smbfsctl.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 08628e6a42ac..1ff28529cf4b 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -144,6 +144,17 @@ #define IO_REPARSE_APPXSTREAM 0xC0000014 /* NFS symlinks, Win 8/SMB3 and later */ #define IO_REPARSE_TAG_NFS 0x80000014 +/* + * AzureFileSync - see + * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering + */ +#define IO_REPARSE_TAG_AZ_FILE_SYNC 0x8000001e +/* WSL reparse tags */ +#define IO_REPARSE_TAG_LX_SYMLINK 0xA000001D +#define IO_REPARSE_TAG_AF_UNIX 0x80000023 +#define IO_REPARSE_TAG_LX_FIFO 0x80000024 +#define IO_REPARSE_TAG_LX_CHR 0x80000025 +#define IO_REPARSE_TAG_LX_BLK 0x80000026 /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ From cb063a83ca321fbf0cb2b4044186f241d89f3dc1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 28 Aug 2019 15:03:18 +0200 Subject: [PATCH 1100/2880] thermal: db8500: Finalize device tree conversion At some point there was an attempt to convert the DB8500 thermal sensor to device tree: a probe path was added and the device tree was augmented for the Snowball board. The switchover was never completed: instead the thermal devices came from from the PRCMU MFD device and the probe on the Snowball was confused as another set of configuration appeared from the device tree. Move over to a device-tree only approach, as we fixed up the device trees. Cc: Vincent Guittot Acked-by: Lee Jones Reviewed-by: Daniel Lezcano Signed-off-by: Linus Walleij Signed-off-by: Eduardo Valentin --- drivers/mfd/db8500-prcmu.c | 53 +------------------- drivers/thermal/Kconfig | 2 +- drivers/thermal/db8500_thermal.c | 30 +++++------ include/linux/platform_data/db8500_thermal.h | 29 ----------- 4 files changed, 17 insertions(+), 97 deletions(-) delete mode 100644 include/linux/platform_data/db8500_thermal.h diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index 90e0f21bc49c..518439c79826 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -36,7 +36,6 @@ #include #include #include -#include #include "dbx500-prcmu-regs.h" /* Index of different voltages to be used when accessing AVSData */ @@ -2984,53 +2983,6 @@ static struct ux500_wdt_data db8500_wdt_pdata = { .timeout = 600, /* 10 minutes */ .has_28_bits_resolution = true, }; -/* - * Thermal Sensor - */ - -static struct resource db8500_thsens_resources[] = { - { - .name = "IRQ_HOTMON_LOW", - .start = IRQ_PRCMU_HOTMON_LOW, - .end = IRQ_PRCMU_HOTMON_LOW, - .flags = IORESOURCE_IRQ, - }, - { - .name = "IRQ_HOTMON_HIGH", - .start = IRQ_PRCMU_HOTMON_HIGH, - .end = IRQ_PRCMU_HOTMON_HIGH, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct db8500_thsens_platform_data db8500_thsens_data = { - .trip_points[0] = { - .temp = 70000, - .type = THERMAL_TRIP_ACTIVE, - .cdev_name = { - [0] = "thermal-cpufreq-0", - }, - }, - .trip_points[1] = { - .temp = 75000, - .type = THERMAL_TRIP_ACTIVE, - .cdev_name = { - [0] = "thermal-cpufreq-0", - }, - }, - .trip_points[2] = { - .temp = 80000, - .type = THERMAL_TRIP_ACTIVE, - .cdev_name = { - [0] = "thermal-cpufreq-0", - }, - }, - .trip_points[3] = { - .temp = 85000, - .type = THERMAL_TRIP_CRITICAL, - }, - .num_trips = 4, -}; static const struct mfd_cell common_prcmu_devs[] = { { @@ -3054,10 +3006,7 @@ static const struct mfd_cell db8500_prcmu_devs[] = { }, { .name = "db8500-thermal", - .num_resources = ARRAY_SIZE(db8500_thsens_resources), - .resources = db8500_thsens_resources, - .platform_data = &db8500_thsens_data, - .pdata_size = sizeof(db8500_thsens_data), + .of_compatible = "stericsson,db8500-thermal", }, }; diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 9966364a6deb..001a21abcc28 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -310,7 +310,7 @@ config DOVE_THERMAL config DB8500_THERMAL tristate "DB8500 thermal management" - depends on MFD_DB8500_PRCMU + depends on MFD_DB8500_PRCMU && OF default y help Adds DB8500 thermal management implementation according to the thermal diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index b71a999d17d6..d650ae5fdf2a 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -13,13 +13,24 @@ #include #include #include -#include #include #include #include #define PRCMU_DEFAULT_MEASURE_TIME 0xFFF #define PRCMU_DEFAULT_LOW_TEMP 0 +#define COOLING_DEV_MAX 8 + +struct db8500_trip_point { + unsigned long temp; + enum thermal_trip_type type; + char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH]; +}; + +struct db8500_thsens_platform_data { + struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS]; + int num_trips; +}; struct db8500_thermal_zone { struct thermal_zone_device *therm_dev; @@ -301,7 +312,6 @@ static void db8500_thermal_work(struct work_struct *work) dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n"); } -#ifdef CONFIG_OF static struct db8500_thsens_platform_data* db8500_thermal_parse_dt(struct platform_device *pdev) { @@ -370,13 +380,6 @@ err_parse_dt: dev_err(&pdev->dev, "Parsing device tree data error.\n"); return NULL; } -#else -static inline struct db8500_thsens_platform_data* - db8500_thermal_parse_dt(struct platform_device *pdev) -{ - return NULL; -} -#endif static int db8500_thermal_probe(struct platform_device *pdev) { @@ -386,11 +389,10 @@ static int db8500_thermal_probe(struct platform_device *pdev) int low_irq, high_irq, ret = 0; unsigned long dft_low, dft_high; - if (np) - ptrips = db8500_thermal_parse_dt(pdev); - else - ptrips = dev_get_platdata(&pdev->dev); + if (!np) + return -EINVAL; + ptrips = db8500_thermal_parse_dt(pdev); if (!ptrips) return -EINVAL; @@ -498,13 +500,11 @@ static int db8500_thermal_resume(struct platform_device *pdev) return 0; } -#ifdef CONFIG_OF static const struct of_device_id db8500_thermal_match[] = { { .compatible = "stericsson,db8500-thermal" }, {}, }; MODULE_DEVICE_TABLE(of, db8500_thermal_match); -#endif static struct platform_driver db8500_thermal_driver = { .driver = { diff --git a/include/linux/platform_data/db8500_thermal.h b/include/linux/platform_data/db8500_thermal.h deleted file mode 100644 index 55e55750a165..000000000000 --- a/include/linux/platform_data/db8500_thermal.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * db8500_thermal.h - DB8500 Thermal Management Implementation - * - * Copyright (C) 2012 ST-Ericsson - * Copyright (C) 2012 Linaro Ltd. - * - * Author: Hongbo Zhang - */ - -#ifndef _DB8500_THERMAL_H_ -#define _DB8500_THERMAL_H_ - -#include - -#define COOLING_DEV_MAX 8 - -struct db8500_trip_point { - unsigned long temp; - enum thermal_trip_type type; - char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH]; -}; - -struct db8500_thsens_platform_data { - struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS]; - int num_trips; -}; - -#endif /* _DB8500_THERMAL_H_ */ From 3de9e4dff889531d517c8808898972e96fa507b2 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 28 Aug 2019 15:03:19 +0200 Subject: [PATCH 1101/2880] thermal: db8500: Use dev helper variable The code gets easier to read like this. Cc: Vincent Guittot Reviewed-by: Daniel Lezcano Signed-off-by: Linus Walleij Signed-off-by: Eduardo Valentin --- drivers/thermal/db8500_thermal.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index d650ae5fdf2a..d250bcf3c10c 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -313,16 +313,16 @@ static void db8500_thermal_work(struct work_struct *work) } static struct db8500_thsens_platform_data* - db8500_thermal_parse_dt(struct platform_device *pdev) + db8500_thermal_parse_dt(struct device *dev) { struct db8500_thsens_platform_data *ptrips; - struct device_node *np = pdev->dev.of_node; + struct device_node *np = dev->of_node; char prop_name[32]; const char *tmp_str; u32 tmp_data; int i, j; - ptrips = devm_kzalloc(&pdev->dev, sizeof(*ptrips), GFP_KERNEL); + ptrips = devm_kzalloc(dev, sizeof(*ptrips), GFP_KERNEL); if (!ptrips) return NULL; @@ -377,7 +377,7 @@ static struct db8500_thsens_platform_data* return ptrips; err_parse_dt: - dev_err(&pdev->dev, "Parsing device tree data error.\n"); + dev_err(dev, "Parsing device tree data error.\n"); return NULL; } @@ -385,18 +385,19 @@ static int db8500_thermal_probe(struct platform_device *pdev) { struct db8500_thermal_zone *pzone = NULL; struct db8500_thsens_platform_data *ptrips = NULL; - struct device_node *np = pdev->dev.of_node; + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; int low_irq, high_irq, ret = 0; unsigned long dft_low, dft_high; if (!np) return -EINVAL; - ptrips = db8500_thermal_parse_dt(pdev); + ptrips = db8500_thermal_parse_dt(dev); if (!ptrips) return -EINVAL; - pzone = devm_kzalloc(&pdev->dev, sizeof(*pzone), GFP_KERNEL); + pzone = devm_kzalloc(dev, sizeof(*pzone), GFP_KERNEL); if (!pzone) return -ENOMEM; @@ -410,31 +411,31 @@ static int db8500_thermal_probe(struct platform_device *pdev) low_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_LOW"); if (low_irq < 0) { - dev_err(&pdev->dev, "Get IRQ_HOTMON_LOW failed.\n"); + dev_err(dev, "Get IRQ_HOTMON_LOW failed.\n"); ret = low_irq; goto out_unlock; } - ret = devm_request_threaded_irq(&pdev->dev, low_irq, NULL, + ret = devm_request_threaded_irq(dev, low_irq, NULL, prcmu_low_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, "dbx500_temp_low", pzone); if (ret < 0) { - dev_err(&pdev->dev, "Failed to allocate temp low irq.\n"); + dev_err(dev, "Failed to allocate temp low irq.\n"); goto out_unlock; } high_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_HIGH"); if (high_irq < 0) { - dev_err(&pdev->dev, "Get IRQ_HOTMON_HIGH failed.\n"); + dev_err(dev, "Get IRQ_HOTMON_HIGH failed.\n"); ret = high_irq; goto out_unlock; } - ret = devm_request_threaded_irq(&pdev->dev, high_irq, NULL, + ret = devm_request_threaded_irq(dev, high_irq, NULL, prcmu_high_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, "dbx500_temp_high", pzone); if (ret < 0) { - dev_err(&pdev->dev, "Failed to allocate temp high irq.\n"); + dev_err(dev, "Failed to allocate temp high irq.\n"); goto out_unlock; } @@ -442,11 +443,11 @@ static int db8500_thermal_probe(struct platform_device *pdev) ptrips->num_trips, 0, pzone, &thdev_ops, NULL, 0, 0); if (IS_ERR(pzone->therm_dev)) { - dev_err(&pdev->dev, "Register thermal zone device failed.\n"); + dev_err(dev, "Register thermal zone device failed.\n"); ret = PTR_ERR(pzone->therm_dev); goto out_unlock; } - dev_info(&pdev->dev, "Thermal zone device registered.\n"); + dev_info(dev, "Thermal zone device registered.\n"); dft_low = PRCMU_DEFAULT_LOW_TEMP; dft_high = ptrips->trip_points[0].temp; From 6c375eccded41df8033ed55a1b785531b304fc67 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 28 Aug 2019 15:03:20 +0200 Subject: [PATCH 1102/2880] thermal: db8500: Rewrite to be a pure OF sensor This patch rewrites the DB8500 thermal sensor to be a pure OF sensor, so that it can be used with thermal zones defined in the device tree. This driver was initially merged before we had generic thermal zone device tree bindings, and now it gets modernized to the way we do things these days. The old driver depended on a set of trigger points provided in the device tree or platform data to interpolate the current temperature between trigger points depending on whether the trend was rising or falling. This was bad because the trigger points should be used for defining temperature zone policies and bind to cooling devices. As the PRCMU (power reset control management unit) can only issue IRQs when we pass temperature trigger points upward or downward We instead define a number of temperature points inside the driver ranging from 15 to 100 degrees celsius. The effect is that when we register the device we quickly trigger 15, 20 ... up to the room temperature in succession and then we get continous event IRQs also under normal operating conditions, and the temperature of the system is now reported more accurately (+/- 2.5 degrees celsius) while in the past the first trigger point was at 70 degrees and the average temperature was simply reported as 35 degrees celsius (between 70 degrees and 0) until we passed 70 degrees which didn't accurately represent the temperature of the system. As a result of dropping all the trigger points from the driver and reusing the core DT thermal zone management code we reduce the code footprint quite a bit. Cc: Vincent Guittot Suggested-by: Daniel Lezcano Signed-off-by: Linus Walleij Reviewed-by: Daniel Lezcano Signed-off-by: Eduardo Valentin --- drivers/thermal/db8500_thermal.c | 477 +++++++------------------------ 1 file changed, 107 insertions(+), 370 deletions(-) diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index d250bcf3c10c..372dbbaaafb8 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -3,9 +3,9 @@ * db8500_thermal.c - DB8500 Thermal Management Implementation * * Copyright (C) 2012 ST-Ericsson - * Copyright (C) 2012 Linaro Ltd. + * Copyright (C) 2012-2019 Linaro Ltd. * - * Author: Hongbo Zhang + * Authors: Hongbo Zhang, Linus Walleij */ #include @@ -19,458 +19,202 @@ #define PRCMU_DEFAULT_MEASURE_TIME 0xFFF #define PRCMU_DEFAULT_LOW_TEMP 0 -#define COOLING_DEV_MAX 8 -struct db8500_trip_point { - unsigned long temp; - enum thermal_trip_type type; - char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH]; -}; - -struct db8500_thsens_platform_data { - struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS]; - int num_trips; +/** + * db8500_thermal_points - the interpolation points that trigger + * interrupts + */ +static const unsigned long db8500_thermal_points[] = { + 15000, + 20000, + 25000, + 30000, + 35000, + 40000, + 45000, + 50000, + 55000, + 60000, + 65000, + 70000, + 75000, + 80000, + /* + * This is where things start to get really bad for the + * SoC and the thermal zones should be set up to trigger + * critical temperature at 85000 mC so we don't get above + * this point. + */ + 85000, + 90000, + 95000, + 100000, }; struct db8500_thermal_zone { - struct thermal_zone_device *therm_dev; - struct mutex th_lock; - struct work_struct therm_work; - struct db8500_thsens_platform_data *trip_tab; - enum thermal_device_mode mode; + struct thermal_zone_device *tz; enum thermal_trend trend; - unsigned long cur_temp_pseudo; + unsigned long interpolated_temp; unsigned int cur_index; }; -/* Local function to check if thermal zone matches cooling devices */ -static int db8500_thermal_match_cdev(struct thermal_cooling_device *cdev, - struct db8500_trip_point *trip_point) -{ - int i; - - if (!strlen(cdev->type)) - return -EINVAL; - - for (i = 0; i < COOLING_DEV_MAX; i++) { - if (!strcmp(trip_point->cdev_name[i], cdev->type)) - return 0; - } - - return -ENODEV; -} - -/* Callback to bind cooling device to thermal zone */ -static int db8500_cdev_bind(struct thermal_zone_device *thermal, - struct thermal_cooling_device *cdev) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned long max_state, upper, lower; - int i, ret = -EINVAL; - - cdev->ops->get_max_state(cdev, &max_state); - - for (i = 0; i < ptrips->num_trips; i++) { - if (db8500_thermal_match_cdev(cdev, &ptrips->trip_points[i])) - continue; - - upper = lower = i > max_state ? max_state : i; - - ret = thermal_zone_bind_cooling_device(thermal, i, cdev, - upper, lower, THERMAL_WEIGHT_DEFAULT); - - dev_info(&cdev->device, "%s bind to %d: %d-%s\n", cdev->type, - i, ret, ret ? "fail" : "succeed"); - } - - return ret; -} - -/* Callback to unbind cooling device from thermal zone */ -static int db8500_cdev_unbind(struct thermal_zone_device *thermal, - struct thermal_cooling_device *cdev) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - int i, ret = -EINVAL; - - for (i = 0; i < ptrips->num_trips; i++) { - if (db8500_thermal_match_cdev(cdev, &ptrips->trip_points[i])) - continue; - - ret = thermal_zone_unbind_cooling_device(thermal, i, cdev); - - dev_info(&cdev->device, "%s unbind from %d: %s\n", cdev->type, - i, ret ? "fail" : "succeed"); - } - - return ret; -} - /* Callback to get current temperature */ -static int db8500_sys_get_temp(struct thermal_zone_device *thermal, int *temp) +static int db8500_thermal_get_temp(void *data, int *temp) { - struct db8500_thermal_zone *pzone = thermal->devdata; + struct db8500_thermal_zone *th = data; /* * TODO: There is no PRCMU interface to get temperature data currently, * so a pseudo temperature is returned , it works for thermal framework * and this will be fixed when the PRCMU interface is available. */ - *temp = pzone->cur_temp_pseudo; + *temp = th->interpolated_temp; return 0; } /* Callback to get temperature changing trend */ -static int db8500_sys_get_trend(struct thermal_zone_device *thermal, - int trip, enum thermal_trend *trend) +static int db8500_thermal_get_trend(void *data, int trip, enum thermal_trend *trend) { - struct db8500_thermal_zone *pzone = thermal->devdata; + struct db8500_thermal_zone *th = data; - *trend = pzone->trend; + *trend = th->trend; return 0; } -/* Callback to get thermal zone mode */ -static int db8500_sys_get_mode(struct thermal_zone_device *thermal, - enum thermal_device_mode *mode) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - - mutex_lock(&pzone->th_lock); - *mode = pzone->mode; - mutex_unlock(&pzone->th_lock); - - return 0; -} - -/* Callback to set thermal zone mode */ -static int db8500_sys_set_mode(struct thermal_zone_device *thermal, - enum thermal_device_mode mode) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - - mutex_lock(&pzone->th_lock); - - pzone->mode = mode; - if (mode == THERMAL_DEVICE_ENABLED) - schedule_work(&pzone->therm_work); - - mutex_unlock(&pzone->th_lock); - - return 0; -} - -/* Callback to get trip point type */ -static int db8500_sys_get_trip_type(struct thermal_zone_device *thermal, - int trip, enum thermal_trip_type *type) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - - if (trip >= ptrips->num_trips) - return -EINVAL; - - *type = ptrips->trip_points[trip].type; - - return 0; -} - -/* Callback to get trip point temperature */ -static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal, - int trip, int *temp) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - - if (trip >= ptrips->num_trips) - return -EINVAL; - - *temp = ptrips->trip_points[trip].temp; - - return 0; -} - -/* Callback to get critical trip point temperature */ -static int db8500_sys_get_crit_temp(struct thermal_zone_device *thermal, - int *temp) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - int i; - - for (i = ptrips->num_trips - 1; i > 0; i--) { - if (ptrips->trip_points[i].type == THERMAL_TRIP_CRITICAL) { - *temp = ptrips->trip_points[i].temp; - return 0; - } - } - - return -EINVAL; -} - -static struct thermal_zone_device_ops thdev_ops = { - .bind = db8500_cdev_bind, - .unbind = db8500_cdev_unbind, - .get_temp = db8500_sys_get_temp, - .get_trend = db8500_sys_get_trend, - .get_mode = db8500_sys_get_mode, - .set_mode = db8500_sys_set_mode, - .get_trip_type = db8500_sys_get_trip_type, - .get_trip_temp = db8500_sys_get_trip_temp, - .get_crit_temp = db8500_sys_get_crit_temp, +static struct thermal_zone_of_device_ops thdev_ops = { + .get_temp = db8500_thermal_get_temp, + .get_trend = db8500_thermal_get_trend, }; -static void db8500_thermal_update_config(struct db8500_thermal_zone *pzone, - unsigned int idx, enum thermal_trend trend, - unsigned long next_low, unsigned long next_high) +static void db8500_thermal_update_config(struct db8500_thermal_zone *th, + unsigned int idx, + enum thermal_trend trend, + unsigned long next_low, + unsigned long next_high) { prcmu_stop_temp_sense(); - pzone->cur_index = idx; - pzone->cur_temp_pseudo = (next_low + next_high)/2; - pzone->trend = trend; + th->cur_index = idx; + th->interpolated_temp = (next_low + next_high)/2; + th->trend = trend; + /* + * The PRCMU accept absolute temperatures in celsius so divide + * down the millicelsius with 1000 + */ prcmu_config_hotmon((u8)(next_low/1000), (u8)(next_high/1000)); prcmu_start_temp_sense(PRCMU_DEFAULT_MEASURE_TIME); } static irqreturn_t prcmu_low_irq_handler(int irq, void *irq_data) { - struct db8500_thermal_zone *pzone = irq_data; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned int idx = pzone->cur_index; + struct db8500_thermal_zone *th = irq_data; + unsigned int idx = th->cur_index; unsigned long next_low, next_high; - if (unlikely(idx == 0)) + if (idx == 0) /* Meaningless for thermal management, ignoring it */ return IRQ_HANDLED; if (idx == 1) { - next_high = ptrips->trip_points[0].temp; + next_high = db8500_thermal_points[0]; next_low = PRCMU_DEFAULT_LOW_TEMP; } else { - next_high = ptrips->trip_points[idx-1].temp; - next_low = ptrips->trip_points[idx-2].temp; + next_high = db8500_thermal_points[idx - 1]; + next_low = db8500_thermal_points[idx - 2]; } idx -= 1; - db8500_thermal_update_config(pzone, idx, THERMAL_TREND_DROPPING, - next_low, next_high); - - dev_dbg(&pzone->therm_dev->device, + db8500_thermal_update_config(th, idx, THERMAL_TREND_DROPPING, + next_low, next_high); + dev_dbg(&th->tz->device, "PRCMU set max %ld, min %ld\n", next_high, next_low); - schedule_work(&pzone->therm_work); + thermal_zone_device_update(th->tz, THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } static irqreturn_t prcmu_high_irq_handler(int irq, void *irq_data) { - struct db8500_thermal_zone *pzone = irq_data; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned int idx = pzone->cur_index; + struct db8500_thermal_zone *th = irq_data; + unsigned int idx = th->cur_index; unsigned long next_low, next_high; + int num_points = ARRAY_SIZE(db8500_thermal_points); - if (idx < ptrips->num_trips - 1) { - next_high = ptrips->trip_points[idx+1].temp; - next_low = ptrips->trip_points[idx].temp; + if (idx < num_points - 1) { + next_high = db8500_thermal_points[idx+1]; + next_low = db8500_thermal_points[idx]; idx += 1; - db8500_thermal_update_config(pzone, idx, THERMAL_TREND_RAISING, - next_low, next_high); + db8500_thermal_update_config(th, idx, THERMAL_TREND_RAISING, + next_low, next_high); - dev_dbg(&pzone->therm_dev->device, - "PRCMU set max %ld, min %ld\n", next_high, next_low); - } else if (idx == ptrips->num_trips - 1) - pzone->cur_temp_pseudo = ptrips->trip_points[idx].temp + 1; + dev_info(&th->tz->device, + "PRCMU set max %ld, min %ld\n", next_high, next_low); + } else if (idx == num_points - 1) + /* So we roof out 1 degree over the max point */ + th->interpolated_temp = db8500_thermal_points[idx] + 1; - schedule_work(&pzone->therm_work); + thermal_zone_device_update(th->tz, THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } -static void db8500_thermal_work(struct work_struct *work) -{ - enum thermal_device_mode cur_mode; - struct db8500_thermal_zone *pzone; - - pzone = container_of(work, struct db8500_thermal_zone, therm_work); - - mutex_lock(&pzone->th_lock); - cur_mode = pzone->mode; - mutex_unlock(&pzone->th_lock); - - if (cur_mode == THERMAL_DEVICE_DISABLED) - return; - - thermal_zone_device_update(pzone->therm_dev, THERMAL_EVENT_UNSPECIFIED); - dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n"); -} - -static struct db8500_thsens_platform_data* - db8500_thermal_parse_dt(struct device *dev) -{ - struct db8500_thsens_platform_data *ptrips; - struct device_node *np = dev->of_node; - char prop_name[32]; - const char *tmp_str; - u32 tmp_data; - int i, j; - - ptrips = devm_kzalloc(dev, sizeof(*ptrips), GFP_KERNEL); - if (!ptrips) - return NULL; - - if (of_property_read_u32(np, "num-trips", &tmp_data)) - goto err_parse_dt; - - if (tmp_data > THERMAL_MAX_TRIPS) - goto err_parse_dt; - - ptrips->num_trips = tmp_data; - - for (i = 0; i < ptrips->num_trips; i++) { - sprintf(prop_name, "trip%d-temp", i); - if (of_property_read_u32(np, prop_name, &tmp_data)) - goto err_parse_dt; - - ptrips->trip_points[i].temp = tmp_data; - - sprintf(prop_name, "trip%d-type", i); - if (of_property_read_string(np, prop_name, &tmp_str)) - goto err_parse_dt; - - if (!strcmp(tmp_str, "active")) - ptrips->trip_points[i].type = THERMAL_TRIP_ACTIVE; - else if (!strcmp(tmp_str, "passive")) - ptrips->trip_points[i].type = THERMAL_TRIP_PASSIVE; - else if (!strcmp(tmp_str, "hot")) - ptrips->trip_points[i].type = THERMAL_TRIP_HOT; - else if (!strcmp(tmp_str, "critical")) - ptrips->trip_points[i].type = THERMAL_TRIP_CRITICAL; - else - goto err_parse_dt; - - sprintf(prop_name, "trip%d-cdev-num", i); - if (of_property_read_u32(np, prop_name, &tmp_data)) - goto err_parse_dt; - - if (tmp_data > COOLING_DEV_MAX) - goto err_parse_dt; - - for (j = 0; j < tmp_data; j++) { - sprintf(prop_name, "trip%d-cdev-name%d", i, j); - if (of_property_read_string(np, prop_name, &tmp_str)) - goto err_parse_dt; - - if (strlen(tmp_str) >= THERMAL_NAME_LENGTH) - goto err_parse_dt; - - strcpy(ptrips->trip_points[i].cdev_name[j], tmp_str); - } - } - return ptrips; - -err_parse_dt: - dev_err(dev, "Parsing device tree data error.\n"); - return NULL; -} - static int db8500_thermal_probe(struct platform_device *pdev) { - struct db8500_thermal_zone *pzone = NULL; - struct db8500_thsens_platform_data *ptrips = NULL; + struct db8500_thermal_zone *th = NULL; struct device *dev = &pdev->dev; - struct device_node *np = dev->of_node; int low_irq, high_irq, ret = 0; - unsigned long dft_low, dft_high; - if (!np) - return -EINVAL; - - ptrips = db8500_thermal_parse_dt(dev); - if (!ptrips) - return -EINVAL; - - pzone = devm_kzalloc(dev, sizeof(*pzone), GFP_KERNEL); - if (!pzone) + th = devm_kzalloc(dev, sizeof(*th), GFP_KERNEL); + if (!th) return -ENOMEM; - mutex_init(&pzone->th_lock); - mutex_lock(&pzone->th_lock); - - pzone->mode = THERMAL_DEVICE_DISABLED; - pzone->trip_tab = ptrips; - - INIT_WORK(&pzone->therm_work, db8500_thermal_work); - low_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_LOW"); if (low_irq < 0) { - dev_err(dev, "Get IRQ_HOTMON_LOW failed.\n"); - ret = low_irq; - goto out_unlock; + dev_err(dev, "Get IRQ_HOTMON_LOW failed\n"); + return low_irq; } ret = devm_request_threaded_irq(dev, low_irq, NULL, prcmu_low_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, - "dbx500_temp_low", pzone); + "dbx500_temp_low", th); if (ret < 0) { - dev_err(dev, "Failed to allocate temp low irq.\n"); - goto out_unlock; + dev_err(dev, "failed to allocate temp low irq\n"); + return ret; } high_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_HIGH"); if (high_irq < 0) { - dev_err(dev, "Get IRQ_HOTMON_HIGH failed.\n"); - ret = high_irq; - goto out_unlock; + dev_err(dev, "Get IRQ_HOTMON_HIGH failed\n"); + return high_irq; } ret = devm_request_threaded_irq(dev, high_irq, NULL, prcmu_high_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, - "dbx500_temp_high", pzone); + "dbx500_temp_high", th); if (ret < 0) { - dev_err(dev, "Failed to allocate temp high irq.\n"); - goto out_unlock; + dev_err(dev, "failed to allocate temp high irq\n"); + return ret; } - pzone->therm_dev = thermal_zone_device_register("db8500_thermal_zone", - ptrips->num_trips, 0, pzone, &thdev_ops, NULL, 0, 0); - - if (IS_ERR(pzone->therm_dev)) { - dev_err(dev, "Register thermal zone device failed.\n"); - ret = PTR_ERR(pzone->therm_dev); - goto out_unlock; + /* register of thermal sensor and get info from DT */ + th->tz = devm_thermal_zone_of_sensor_register(dev, 0, th, &thdev_ops); + if (IS_ERR(th->tz)) { + dev_err(dev, "register thermal zone sensor failed\n"); + return PTR_ERR(th->tz); } - dev_info(dev, "Thermal zone device registered.\n"); + dev_info(dev, "thermal zone sensor registered\n"); - dft_low = PRCMU_DEFAULT_LOW_TEMP; - dft_high = ptrips->trip_points[0].temp; + /* Start measuring at the lowest point */ + db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE, + PRCMU_DEFAULT_LOW_TEMP, + db8500_thermal_points[0]); - db8500_thermal_update_config(pzone, 0, THERMAL_TREND_STABLE, - dft_low, dft_high); - - platform_set_drvdata(pdev, pzone); - pzone->mode = THERMAL_DEVICE_ENABLED; - -out_unlock: - mutex_unlock(&pzone->th_lock); - - return ret; -} - -static int db8500_thermal_remove(struct platform_device *pdev) -{ - struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev); - - thermal_zone_device_unregister(pzone->therm_dev); - cancel_work_sync(&pzone->therm_work); - mutex_destroy(&pzone->th_lock); + platform_set_drvdata(pdev, th); return 0; } @@ -478,9 +222,6 @@ static int db8500_thermal_remove(struct platform_device *pdev) static int db8500_thermal_suspend(struct platform_device *pdev, pm_message_t state) { - struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev); - - flush_work(&pzone->therm_work); prcmu_stop_temp_sense(); return 0; @@ -488,15 +229,12 @@ static int db8500_thermal_suspend(struct platform_device *pdev, static int db8500_thermal_resume(struct platform_device *pdev) { - struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev); - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned long dft_low, dft_high; + struct db8500_thermal_zone *th = platform_get_drvdata(pdev); - dft_low = PRCMU_DEFAULT_LOW_TEMP; - dft_high = ptrips->trip_points[0].temp; - - db8500_thermal_update_config(pzone, 0, THERMAL_TREND_STABLE, - dft_low, dft_high); + /* Resume and start measuring at the lowest point */ + db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE, + PRCMU_DEFAULT_LOW_TEMP, + db8500_thermal_points[0]); return 0; } @@ -515,7 +253,6 @@ static struct platform_driver db8500_thermal_driver = { .probe = db8500_thermal_probe, .suspend = db8500_thermal_suspend, .resume = db8500_thermal_resume, - .remove = db8500_thermal_remove, }; module_platform_driver(db8500_thermal_driver); From 2b481835cf4e7384b80d7762074b32a45b792d99 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 21 Sep 2019 09:01:45 +0300 Subject: [PATCH 1103/2880] wil6210: use after free in wil_netif_rx_any() The debug code dereferences "skb" to print "skb->len" so we have to print the message before we free "skb". Fixes: f99fe49ff372 ("wil6210: add wil_netif_rx() helper function") Signed-off-by: Dan Carpenter Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/wil6210/txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c index cb13652491ad..598c1fba9dac 100644 --- a/drivers/net/wireless/ath/wil6210/txrx.c +++ b/drivers/net/wireless/ath/wil6210/txrx.c @@ -1012,11 +1012,11 @@ void wil_netif_rx_any(struct sk_buff *skb, struct net_device *ndev) skb_orphan(skb); if (security && (wil->txrx_ops.rx_crypto_check(wil, skb) != 0)) { + wil_dbg_txrx(wil, "Rx drop %d bytes\n", skb->len); dev_kfree_skb(skb); ndev->stats.rx_dropped++; stats->rx_replay++; stats->rx_dropped++; - wil_dbg_txrx(wil, "Rx drop %d bytes\n", skb->len); return; } From 21a045e430e56725628f361dede8a882e92469b9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 20 Sep 2019 21:45:33 +0200 Subject: [PATCH 1104/2880] ieee802154: mcr20a: simplify a bit 'mcr20a_handle_rx_read_buf_complete()' Use a 'skb_put_data()' variant instead of rewritting it. The __skb_put_data variant is safe here. It is obvious that the skb can not overflow. It has just been allocated a few lines above with the same 'len'. Signed-off-by: Christophe JAILLET Acked-by: Xue Liu Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mcr20a.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/mcr20a.c b/drivers/net/ieee802154/mcr20a.c index 17f2300e63ee..8dc04e2590b1 100644 --- a/drivers/net/ieee802154/mcr20a.c +++ b/drivers/net/ieee802154/mcr20a.c @@ -800,7 +800,7 @@ mcr20a_handle_rx_read_buf_complete(void *context) if (!skb) return; - memcpy(skb_put(skb, len), lp->rx_buf, len); + __skb_put_data(skb, lp->rx_buf, len); ieee802154_rx_irqsafe(lp->hw, skb, lp->rx_lqi[0]); print_hex_dump_debug("mcr20a rx: ", DUMP_PREFIX_OFFSET, 16, 1, From 61aa258ab1a50b834267f5df6cabef0d10f8955a Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Fri, 20 Sep 2019 06:49:03 +0800 Subject: [PATCH 1105/2880] pwm: mediatek: Remove the has_clks field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can use fixed clocks to repair mt7628 PWM during configure from userspace. The SoC is legacy MIPS and has no complex clock tree. Because we can get the clock frequency for period calculation from fixed clocks specified in DT, we can remove the has_clock field, and directly use devm_clk_get() and clk_get_rate(). Signed-off-by: Ryder Lee Signed-off-by: Sam Shih Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 459b2ccd0b75..9394735b0762 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -57,7 +57,6 @@ static const char * const mtk_pwm_clk_name[MTK_CLK_MAX] = { struct mtk_pwm_platform_data { unsigned int num_pwms; bool pwm45_fixup; - bool has_clks; }; /** @@ -87,9 +86,6 @@ static int mtk_pwm_clk_enable(struct pwm_chip *chip, struct pwm_device *pwm) struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); int ret; - if (!pc->soc->has_clks) - return 0; - ret = clk_prepare_enable(pc->clks[MTK_CLK_TOP]); if (ret < 0) return ret; @@ -116,9 +112,6 @@ static void mtk_pwm_clk_disable(struct pwm_chip *chip, struct pwm_device *pwm) { struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); - if (!pc->soc->has_clks) - return; - clk_disable_unprepare(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]); clk_disable_unprepare(pc->clks[MTK_CLK_MAIN]); clk_disable_unprepare(pc->clks[MTK_CLK_TOP]); @@ -242,7 +235,7 @@ static int mtk_pwm_probe(struct platform_device *pdev) if (IS_ERR(pc->regs)) return PTR_ERR(pc->regs); - for (i = 0; i < pc->soc->num_pwms + 2 && pc->soc->has_clks; i++) { + for (i = 0; i < pc->soc->num_pwms + 2; i++) { pc->clks[i] = devm_clk_get(&pdev->dev, mtk_pwm_clk_name[i]); if (IS_ERR(pc->clks[i])) { dev_err(&pdev->dev, "clock: %s fail: %ld\n", @@ -277,31 +270,26 @@ static int mtk_pwm_remove(struct platform_device *pdev) static const struct mtk_pwm_platform_data mt2712_pwm_data = { .num_pwms = 8, .pwm45_fixup = false, - .has_clks = true, }; static const struct mtk_pwm_platform_data mt7622_pwm_data = { .num_pwms = 6, .pwm45_fixup = false, - .has_clks = true, }; static const struct mtk_pwm_platform_data mt7623_pwm_data = { .num_pwms = 5, .pwm45_fixup = true, - .has_clks = true, }; static const struct mtk_pwm_platform_data mt7628_pwm_data = { .num_pwms = 4, .pwm45_fixup = true, - .has_clks = false, }; static const struct mtk_pwm_platform_data mt8516_pwm_data = { .num_pwms = 5, .pwm45_fixup = false, - .has_clks = true, }; static const struct of_device_id mtk_pwm_of_match[] = { From efecdeb82f21d4100566ce85bda2d7ffb9c9edff Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Fri, 20 Sep 2019 06:49:04 +0800 Subject: [PATCH 1106/2880] pwm: mediatek: Allocate the clks array dynamically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using fixed size of arrays, allocate the memory for them based on the number of PWMs specified for each SoC generation. Signed-off-by: Ryder Lee Signed-off-by: Sam Shih Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 79 +++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 9394735b0762..db986b77e556 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -35,25 +35,6 @@ #define PWM_CLK_DIV_MAX 7 -enum { - MTK_CLK_MAIN = 0, - MTK_CLK_TOP, - MTK_CLK_PWM1, - MTK_CLK_PWM2, - MTK_CLK_PWM3, - MTK_CLK_PWM4, - MTK_CLK_PWM5, - MTK_CLK_PWM6, - MTK_CLK_PWM7, - MTK_CLK_PWM8, - MTK_CLK_MAX, -}; - -static const char * const mtk_pwm_clk_name[MTK_CLK_MAX] = { - "main", "top", "pwm1", "pwm2", "pwm3", "pwm4", "pwm5", "pwm6", "pwm7", - "pwm8" -}; - struct mtk_pwm_platform_data { unsigned int num_pwms; bool pwm45_fixup; @@ -63,12 +44,17 @@ struct mtk_pwm_platform_data { * struct mtk_pwm_chip - struct representing PWM chip * @chip: linux PWM chip representation * @regs: base address of PWM chip - * @clks: list of clocks + * @clk_top: the top clock generator + * @clk_main: the clock used by PWM core + * @clk_pwms: the clock used by each PWM channel + * @clk_freq: the fix clock frequency of legacy MIPS SoC */ struct mtk_pwm_chip { struct pwm_chip chip; void __iomem *regs; - struct clk *clks[MTK_CLK_MAX]; + struct clk *clk_top; + struct clk *clk_main; + struct clk **clk_pwms; const struct mtk_pwm_platform_data *soc; }; @@ -86,24 +72,24 @@ static int mtk_pwm_clk_enable(struct pwm_chip *chip, struct pwm_device *pwm) struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); int ret; - ret = clk_prepare_enable(pc->clks[MTK_CLK_TOP]); + ret = clk_prepare_enable(pc->clk_top); if (ret < 0) return ret; - ret = clk_prepare_enable(pc->clks[MTK_CLK_MAIN]); + ret = clk_prepare_enable(pc->clk_main); if (ret < 0) goto disable_clk_top; - ret = clk_prepare_enable(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]); + ret = clk_prepare_enable(pc->clk_pwms[pwm->hwpwm]); if (ret < 0) goto disable_clk_main; return 0; disable_clk_main: - clk_disable_unprepare(pc->clks[MTK_CLK_MAIN]); + clk_disable_unprepare(pc->clk_main); disable_clk_top: - clk_disable_unprepare(pc->clks[MTK_CLK_TOP]); + clk_disable_unprepare(pc->clk_top); return ret; } @@ -112,9 +98,9 @@ static void mtk_pwm_clk_disable(struct pwm_chip *chip, struct pwm_device *pwm) { struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); - clk_disable_unprepare(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]); - clk_disable_unprepare(pc->clks[MTK_CLK_MAIN]); - clk_disable_unprepare(pc->clks[MTK_CLK_TOP]); + clk_disable_unprepare(pc->clk_pwms[pwm->hwpwm]); + clk_disable_unprepare(pc->clk_main); + clk_disable_unprepare(pc->clk_top); } static inline u32 mtk_pwm_readl(struct mtk_pwm_chip *chip, unsigned int num, @@ -134,7 +120,7 @@ static int mtk_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); - struct clk *clk = pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]; + struct clk *clk = pc->clk_pwms[pwm->hwpwm]; u32 clkdiv = 0, cnt_period, cnt_duty, reg_width = PWMDWIDTH, reg_thres = PWMTHRES; u64 resolution; @@ -235,12 +221,35 @@ static int mtk_pwm_probe(struct platform_device *pdev) if (IS_ERR(pc->regs)) return PTR_ERR(pc->regs); - for (i = 0; i < pc->soc->num_pwms + 2; i++) { - pc->clks[i] = devm_clk_get(&pdev->dev, mtk_pwm_clk_name[i]); - if (IS_ERR(pc->clks[i])) { + pc->clk_pwms = devm_kcalloc(&pdev->dev, pc->soc->num_pwms, + sizeof(*pc->clk_pwms), GFP_KERNEL); + if (!pc->clk_pwms) + return -ENOMEM; + + pc->clk_top = devm_clk_get(&pdev->dev, "top"); + if (IS_ERR(pc->clk_top)) { + dev_err(&pdev->dev, "clock: top fail: %ld\n", + PTR_ERR(pc->clk_top)); + return PTR_ERR(pc->clk_top); + } + + pc->clk_main = devm_clk_get(&pdev->dev, "main"); + if (IS_ERR(pc->clk_main)) { + dev_err(&pdev->dev, "clock: main fail: %ld\n", + PTR_ERR(pc->clk_main)); + return PTR_ERR(pc->clk_main); + } + + for (i = 0; i < pc->soc->num_pwms; i++) { + char name[8]; + + snprintf(name, sizeof(name), "pwm%d", i + 1); + + pc->clk_pwms[i] = devm_clk_get(&pdev->dev, name); + if (IS_ERR(pc->clk_pwms[i])) { dev_err(&pdev->dev, "clock: %s fail: %ld\n", - mtk_pwm_clk_name[i], PTR_ERR(pc->clks[i])); - return PTR_ERR(pc->clks[i]); + name, PTR_ERR(pc->clk_pwms[i])); + return PTR_ERR(pc->clk_pwms[i]); } } From 2503781c97fa66f1ee7ffac21f8c5c330b82b5eb Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Fri, 20 Sep 2019 06:49:05 +0800 Subject: [PATCH 1107/2880] pwm: mediatek: Use pwm_mediatek as common prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use pwm_mediatek as common prefix to match the filename. No functional change intended. Signed-off-by: Ryder Lee Signed-off-by: Sam Shih Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 117 +++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 57 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index db986b77e556..6b9a5857b5b6 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -35,13 +35,13 @@ #define PWM_CLK_DIV_MAX 7 -struct mtk_pwm_platform_data { +struct pwm_mediatek_of_data { unsigned int num_pwms; bool pwm45_fixup; }; /** - * struct mtk_pwm_chip - struct representing PWM chip + * struct pwm_mediatek_chip - struct representing PWM chip * @chip: linux PWM chip representation * @regs: base address of PWM chip * @clk_top: the top clock generator @@ -49,27 +49,29 @@ struct mtk_pwm_platform_data { * @clk_pwms: the clock used by each PWM channel * @clk_freq: the fix clock frequency of legacy MIPS SoC */ -struct mtk_pwm_chip { +struct pwm_mediatek_chip { struct pwm_chip chip; void __iomem *regs; struct clk *clk_top; struct clk *clk_main; struct clk **clk_pwms; - const struct mtk_pwm_platform_data *soc; + const struct pwm_mediatek_of_data *soc; }; -static const unsigned int mtk_pwm_reg_offset[] = { +static const unsigned int pwm_mediatek_reg_offset[] = { 0x0010, 0x0050, 0x0090, 0x00d0, 0x0110, 0x0150, 0x0190, 0x0220 }; -static inline struct mtk_pwm_chip *to_mtk_pwm_chip(struct pwm_chip *chip) +static inline struct pwm_mediatek_chip * +to_pwm_mediatek_chip(struct pwm_chip *chip) { - return container_of(chip, struct mtk_pwm_chip, chip); + return container_of(chip, struct pwm_mediatek_chip, chip); } -static int mtk_pwm_clk_enable(struct pwm_chip *chip, struct pwm_device *pwm) +static int pwm_mediatek_clk_enable(struct pwm_chip *chip, + struct pwm_device *pwm) { - struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); + struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); int ret; ret = clk_prepare_enable(pc->clk_top); @@ -94,45 +96,46 @@ disable_clk_top: return ret; } -static void mtk_pwm_clk_disable(struct pwm_chip *chip, struct pwm_device *pwm) +static void pwm_mediatek_clk_disable(struct pwm_chip *chip, + struct pwm_device *pwm) { - struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); + struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); clk_disable_unprepare(pc->clk_pwms[pwm->hwpwm]); clk_disable_unprepare(pc->clk_main); clk_disable_unprepare(pc->clk_top); } -static inline u32 mtk_pwm_readl(struct mtk_pwm_chip *chip, unsigned int num, - unsigned int offset) +static inline u32 pwm_mediatek_readl(struct pwm_mediatek_chip *chip, + unsigned int num, unsigned int offset) { - return readl(chip->regs + mtk_pwm_reg_offset[num] + offset); + return readl(chip->regs + pwm_mediatek_reg_offset[num] + offset); } -static inline void mtk_pwm_writel(struct mtk_pwm_chip *chip, - unsigned int num, unsigned int offset, - u32 value) +static inline void pwm_mediatek_writel(struct pwm_mediatek_chip *chip, + unsigned int num, unsigned int offset, + u32 value) { - writel(value, chip->regs + mtk_pwm_reg_offset[num] + offset); + writel(value, chip->regs + pwm_mediatek_reg_offset[num] + offset); } -static int mtk_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) +static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, + int duty_ns, int period_ns) { - struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); - struct clk *clk = pc->clk_pwms[pwm->hwpwm]; + struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); u32 clkdiv = 0, cnt_period, cnt_duty, reg_width = PWMDWIDTH, reg_thres = PWMTHRES; u64 resolution; int ret; - ret = mtk_pwm_clk_enable(chip, pwm); + ret = pwm_mediatek_clk_enable(chip, pwm); + if (ret < 0) return ret; /* Using resolution in picosecond gets accuracy higher */ resolution = (u64)NSEC_PER_SEC * 1000; - do_div(resolution, clk_get_rate(clk)); + do_div(resolution, clk_get_rate(pc->clk_pwms[pwm->hwpwm])); cnt_period = DIV_ROUND_CLOSEST_ULL((u64)period_ns * 1000, resolution); while (cnt_period > 8191) { @@ -143,7 +146,7 @@ static int mtk_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, } if (clkdiv > PWM_CLK_DIV_MAX) { - mtk_pwm_clk_disable(chip, pwm); + pwm_mediatek_clk_disable(chip, pwm); dev_err(chip->dev, "period %d not supported\n", period_ns); return -EINVAL; } @@ -158,22 +161,22 @@ static int mtk_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, } cnt_duty = DIV_ROUND_CLOSEST_ULL((u64)duty_ns * 1000, resolution); - mtk_pwm_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | clkdiv); - mtk_pwm_writel(pc, pwm->hwpwm, reg_width, cnt_period); - mtk_pwm_writel(pc, pwm->hwpwm, reg_thres, cnt_duty); + pwm_mediatek_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | clkdiv); + pwm_mediatek_writel(pc, pwm->hwpwm, reg_width, cnt_period); + pwm_mediatek_writel(pc, pwm->hwpwm, reg_thres, cnt_duty); - mtk_pwm_clk_disable(chip, pwm); + pwm_mediatek_clk_disable(chip, pwm); return 0; } -static int mtk_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) +static int pwm_mediatek_enable(struct pwm_chip *chip, struct pwm_device *pwm) { - struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); + struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); u32 value; int ret; - ret = mtk_pwm_clk_enable(chip, pwm); + ret = pwm_mediatek_clk_enable(chip, pwm); if (ret < 0) return ret; @@ -184,28 +187,28 @@ static int mtk_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) return 0; } -static void mtk_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) +static void pwm_mediatek_disable(struct pwm_chip *chip, struct pwm_device *pwm) { - struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); + struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); u32 value; value = readl(pc->regs); value &= ~BIT(pwm->hwpwm); writel(value, pc->regs); - mtk_pwm_clk_disable(chip, pwm); + pwm_mediatek_clk_disable(chip, pwm); } -static const struct pwm_ops mtk_pwm_ops = { - .config = mtk_pwm_config, - .enable = mtk_pwm_enable, - .disable = mtk_pwm_disable, +static const struct pwm_ops pwm_mediatek_ops = { + .config = pwm_mediatek_config, + .enable = pwm_mediatek_enable, + .disable = pwm_mediatek_disable, .owner = THIS_MODULE, }; -static int mtk_pwm_probe(struct platform_device *pdev) +static int pwm_mediatek_probe(struct platform_device *pdev) { - struct mtk_pwm_chip *pc; + struct pwm_mediatek_chip *pc; struct resource *res; unsigned int i; int ret; @@ -256,7 +259,7 @@ static int mtk_pwm_probe(struct platform_device *pdev) platform_set_drvdata(pdev, pc); pc->chip.dev = &pdev->dev; - pc->chip.ops = &mtk_pwm_ops; + pc->chip.ops = &pwm_mediatek_ops; pc->chip.base = -1; pc->chip.npwm = pc->soc->num_pwms; @@ -269,39 +272,39 @@ static int mtk_pwm_probe(struct platform_device *pdev) return 0; } -static int mtk_pwm_remove(struct platform_device *pdev) +static int pwm_mediatek_remove(struct platform_device *pdev) { - struct mtk_pwm_chip *pc = platform_get_drvdata(pdev); + struct pwm_mediatek_chip *pc = platform_get_drvdata(pdev); return pwmchip_remove(&pc->chip); } -static const struct mtk_pwm_platform_data mt2712_pwm_data = { +static const struct pwm_mediatek_of_data mt2712_pwm_data = { .num_pwms = 8, .pwm45_fixup = false, }; -static const struct mtk_pwm_platform_data mt7622_pwm_data = { +static const struct pwm_mediatek_of_data mt7622_pwm_data = { .num_pwms = 6, .pwm45_fixup = false, }; -static const struct mtk_pwm_platform_data mt7623_pwm_data = { +static const struct pwm_mediatek_of_data mt7623_pwm_data = { .num_pwms = 5, .pwm45_fixup = true, }; -static const struct mtk_pwm_platform_data mt7628_pwm_data = { +static const struct pwm_mediatek_of_data mt7628_pwm_data = { .num_pwms = 4, .pwm45_fixup = true, }; -static const struct mtk_pwm_platform_data mt8516_pwm_data = { +static const struct pwm_mediatek_of_data mt8516_pwm_data = { .num_pwms = 5, .pwm45_fixup = false, }; -static const struct of_device_id mtk_pwm_of_match[] = { +static const struct of_device_id pwm_mediatek_of_match[] = { { .compatible = "mediatek,mt2712-pwm", .data = &mt2712_pwm_data }, { .compatible = "mediatek,mt7622-pwm", .data = &mt7622_pwm_data }, { .compatible = "mediatek,mt7623-pwm", .data = &mt7623_pwm_data }, @@ -309,17 +312,17 @@ static const struct of_device_id mtk_pwm_of_match[] = { { .compatible = "mediatek,mt8516-pwm", .data = &mt8516_pwm_data }, { }, }; -MODULE_DEVICE_TABLE(of, mtk_pwm_of_match); +MODULE_DEVICE_TABLE(of, pwm_mediatek_of_match); -static struct platform_driver mtk_pwm_driver = { +static struct platform_driver pwm_mediatek_driver = { .driver = { - .name = "mtk-pwm", - .of_match_table = mtk_pwm_of_match, + .name = "pwm-mediatek", + .of_match_table = pwm_mediatek_of_match, }, - .probe = mtk_pwm_probe, - .remove = mtk_pwm_remove, + .probe = pwm_mediatek_probe, + .remove = pwm_mediatek_remove, }; -module_platform_driver(mtk_pwm_driver); +module_platform_driver(pwm_mediatek_driver); MODULE_AUTHOR("John Crispin "); MODULE_LICENSE("GPL"); From 4bea6dd5be7e2ab8c2368ee9724ef88793b2421b Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Fri, 20 Sep 2019 06:49:06 +0800 Subject: [PATCH 1108/2880] pwm: mediatek: Update license and switch to SPDX tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SPDX identifiers to pwm-mediatek.c. Update MODULE_LICENSE to correctly reflect the GNU General Public License v2.0. Signed-off-by: Ryder Lee Signed-off-by: Sam Shih Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 6b9a5857b5b6..7782fadbc116 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * Mediatek Pulse Width Modulator driver + * MediaTek Pulse Width Modulator driver * * Copyright (C) 2015 John Crispin * Copyright (C) 2017 Zhi Mao * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. */ #include @@ -325,4 +323,4 @@ static struct platform_driver pwm_mediatek_driver = { module_platform_driver(pwm_mediatek_driver); MODULE_AUTHOR("John Crispin "); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("GPL v2"); From 1c00ad6ebf36aa3b0fa598a48b8ae59782be4121 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Fri, 20 Sep 2019 06:49:10 +0800 Subject: [PATCH 1109/2880] dt-bindings: pwm: Update bindings for MT7629 SoC This updates bindings for MT7629 PWM controller. Signed-off-by: Ryder Lee Signed-off-by: Sam Shih Reviewed-by: Rob Herring Reviewed-by: Matthias Brugger Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/pwm-mediatek.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt index 9152bf5afe56..c8501530173c 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt @@ -6,6 +6,7 @@ Required properties: - "mediatek,mt7622-pwm": found on mt7622 SoC. - "mediatek,mt7623-pwm": found on mt7623 SoC. - "mediatek,mt7628-pwm": found on mt7628 SoC. + - "mediatek,mt7629-pwm", "mediatek,mt7622-pwm": found on mt7629 SoC. - "mediatek,mt8516-pwm": found on mt8516 SoC. - reg: physical base address and length of the controller's registers. - #pwm-cells: must be 2. See pwm.txt in this directory for a description of From 89340d0935c9296c7b8222b6eab30e67cb57ab82 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 9 Sep 2019 09:40:28 +0800 Subject: [PATCH 1110/2880] Revert "locking/pvqspinlock: Don't wait if vCPU is preempted" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch reverts commit 75437bb304b20 (locking/pvqspinlock: Don't wait if vCPU is preempted). A large performance regression was caused by this commit. on over-subscription scenarios. The test was run on a Xeon Skylake box, 2 sockets, 40 cores, 80 threads, with three VMs of 80 vCPUs each. The score of ebizzy -M is reduced from 13000-14000 records/s to 1700-1800 records/s: Host Guest score vanilla w/o kvm optimizations upstream 1700-1800 records/s vanilla w/o kvm optimizations revert 13000-14000 records/s vanilla w/ kvm optimizations upstream 4500-5000 records/s vanilla w/ kvm optimizations revert 14000-15500 records/s Exit from aggressive wait-early mechanism can result in premature yield and extra scheduling latency. Actually, only 6% of wait_early events are caused by vcpu_is_preempted() being true. However, when one vCPU voluntarily releases its vCPU, all the subsequently waiters in the queue will do the same and the cascading effect leads to bad performance. kvm optimizations: [1] commit d73eb57b80b (KVM: Boost vCPUs that are delivering interrupts) [2] commit 266e85a5ec9 (KVM: X86: Boost queue head vCPU to mitigate lock waiter preemption) Tested-by: loobinliu@tencent.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Waiman Long Cc: Paolo Bonzini Cc: Radim Krčmář Cc: loobinliu@tencent.com Cc: stable@vger.kernel.org Fixes: 75437bb304b20 (locking/pvqspinlock: Don't wait if vCPU is preempted) Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 89bab079e7a4..e84d21aa0722 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -269,7 +269,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); + return READ_ONCE(prev->state) != vcpu_running; } /* From 20ff1cb506727f81acba59acab8a0f37e1a13e43 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 24 Sep 2019 07:40:06 +0900 Subject: [PATCH 1111/2880] netfilter: ebtables: use __u8 instead of uint8_t in uapi header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_UAPI_HEADER_TEST=y, exported headers are compile-tested to make sure they can be included from user-space. Currently, linux/netfilter_bridge/ebtables.h is excluded from the test coverage. To make it join the compile-test, we need to fix the build errors attached below. For a case like this, we decided to use __u{8,16,32,64} variable types in this discussion: https://lkml.org/lkml/2019/6/5/18 Build log: CC usr/include/linux/netfilter_bridge/ebtables.h.s In file included from :32:0: ./usr/include/linux/netfilter_bridge/ebtables.h:126:4: error: unknown type name ‘uint8_t’ uint8_t revision; ^~~~~~~ ./usr/include/linux/netfilter_bridge/ebtables.h:139:4: error: unknown type name ‘uint8_t’ uint8_t revision; ^~~~~~~ ./usr/include/linux/netfilter_bridge/ebtables.h:152:4: error: unknown type name ‘uint8_t’ uint8_t revision; ^~~~~~~ Signed-off-by: Masahiro Yamada Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebtables.h | 6 +++--- usr/include/Makefile | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 3b86c14ea49d..8076c940ffeb 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -123,7 +123,7 @@ struct ebt_entry_match { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; - uint8_t revision; + __u8 revision; }; struct xt_match *match; } u; @@ -136,7 +136,7 @@ struct ebt_entry_watcher { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; - uint8_t revision; + __u8 revision; }; struct xt_target *watcher; } u; @@ -149,7 +149,7 @@ struct ebt_entry_target { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; - uint8_t revision; + __u8 revision; }; struct xt_target *target; } u; diff --git a/usr/include/Makefile b/usr/include/Makefile index 1fb6abe29b2f..379cc5abc162 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -38,7 +38,6 @@ header-test- += linux/ivtv.h header-test- += linux/jffs2.h header-test- += linux/kexec.h header-test- += linux/matroxfb.h -header-test- += linux/netfilter_bridge/ebtables.h header-test- += linux/netfilter_ipv4/ipt_LOG.h header-test- += linux/netfilter_ipv6/ip6t_LOG.h header-test- += linux/nfc.h From 9b05b6e11d5e93a3a517cadc12b9836e0470c255 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 24 Sep 2019 14:42:44 +0200 Subject: [PATCH 1112/2880] netfilter: nf_tables: bogus EBUSY when deleting flowtable after flush The deletion of a flowtable after a flush in the same transaction results in EBUSY. This patch adds an activation and deactivation of flowtables in order to update the _use_ counter. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 16 ++++++++++++++++ net/netfilter/nft_flow_offload.c | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a26d64056fc8..001d294edf57 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1183,6 +1183,10 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); +void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, + struct nft_flowtable *flowtable, + enum nft_trans_phase phase); + void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6dc46f9b5f7b..d481f9baca2f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5598,6 +5598,22 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nft_flowtable_lookup); +void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, + struct nft_flowtable *flowtable, + enum nft_trans_phase phase) +{ + switch (phase) { + case NFT_TRANS_PREPARE: + case NFT_TRANS_ABORT: + case NFT_TRANS_RELEASE: + flowtable->use--; + /* fall through */ + default: + return; + } +} +EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable); + static struct nft_flowtable * nft_flowtable_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 22cf236eb5d5..f29bbc74c4bf 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -177,6 +177,23 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } +static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + enum nft_trans_phase phase) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); +} + +static void nft_flow_offload_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + priv->flowtable->use++; +} + static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -205,6 +222,8 @@ static const struct nft_expr_ops nft_flow_offload_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, + .activate = nft_flow_offload_activate, + .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, From f8d7ab2bded897607bff6324d5c6ea6b4aecca0c Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 24 Sep 2019 17:19:06 +0530 Subject: [PATCH 1113/2880] tracing/probe: Fix same probe event argument matching Commit fe60b0ce8e73 ("tracing/probe: Reject exactly same probe event") tries to reject a event which matches an already existing probe. However it currently continues to match arguments and rejects adding a probe even when the arguments don't match. Fix this by only rejecting a probe if and only if all the arguments match. Link: http://lkml.kernel.org/r/20190924114906.14038-1-srikar@linux.vnet.ibm.com Fixes: fe60b0ce8e73 ("tracing/probe: Reject exactly same probe event") Acked-by: Masami Hiramatsu Signed-off-by: Srikar Dronamraju Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 5 +++-- kernel/trace/trace_uprobe.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a6697e28ddda..402dc3ce88d3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -549,10 +549,11 @@ static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, for (i = 0; i < orig->tp.nr_args; i++) { if (strcmp(orig->tp.args[i].comm, comp->tp.args[i].comm)) - continue; + break; } - return true; + if (i == orig->tp.nr_args) + return true; } return false; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 34dd6d0016a3..dd884341f5c5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -431,10 +431,11 @@ static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, for (i = 0; i < orig->tp.nr_args; i++) { if (strcmp(orig->tp.args[i].comm, comp->tp.args[i].comm)) - continue; + break; } - return true; + if (i == orig->tp.nr_args) + return true; } return false; From b27507bb59ed504d7fa4d6a35f25a8cc39903b54 Mon Sep 17 00:00:00 2001 From: Juliet Kim Date: Fri, 20 Sep 2019 16:11:22 -0400 Subject: [PATCH 1114/2880] net/ibmvnic: unlock rtnl_lock in reset so linkwatch_event can run Commit a5681e20b541 ("net/ibmnvic: Fix deadlock problem in reset") made the change to hold the RTNL lock during a reset to avoid deadlock but linkwatch_event is fired during the reset and needs the RTNL lock. That keeps linkwatch_event process from proceeding until the reset is complete. The reset process cannot tolerate the linkwatch_event processing after reset completes, so release the RTNL lock during the process to allow a chance for linkwatch_event to run during reset. This does not guarantee that the linkwatch_event will be processed as soon as link state changes, but is an improvement over the current code where linkwatch_event processing is always delayed, which prevents transmissions on the device from being deactivated leading transmit watchdog timer to time-out. Release the RTNL lock before link state change and re-acquire after the link state change to allow linkwatch_event to grab the RTNL lock and run during the reset. Fixes: a5681e20b541 ("net/ibmnvic: Fix deadlock problem in reset") Signed-off-by: Juliet Kim Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 226 ++++++++++++++++++++--------- drivers/net/ethernet/ibm/ibmvnic.h | 1 + 2 files changed, 158 insertions(+), 69 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 3816fff75bb5..d7db5cc51f6a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1723,6 +1723,86 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p) return rc; } +/** + * do_change_param_reset returns zero if we are able to keep processing reset + * events, or non-zero if we hit a fatal error and must halt. + */ +static int do_change_param_reset(struct ibmvnic_adapter *adapter, + struct ibmvnic_rwi *rwi, + u32 reset_state) +{ + struct net_device *netdev = adapter->netdev; + int i, rc; + + netdev_dbg(adapter->netdev, "Change param resetting driver (%d)\n", + rwi->reset_reason); + + netif_carrier_off(netdev); + adapter->reset_reason = rwi->reset_reason; + + ibmvnic_cleanup(netdev); + + if (reset_state == VNIC_OPEN) { + rc = __ibmvnic_close(netdev); + if (rc) + return rc; + } + + release_resources(adapter); + release_sub_crqs(adapter, 1); + release_crq_queue(adapter); + + adapter->state = VNIC_PROBED; + + rc = init_crq_queue(adapter); + + if (rc) { + netdev_err(adapter->netdev, + "Couldn't initialize crq. rc=%d\n", rc); + return rc; + } + + rc = ibmvnic_reset_init(adapter); + if (rc) + return IBMVNIC_INIT_FAILED; + + /* If the adapter was in PROBE state prior to the reset, + * exit here. + */ + if (reset_state == VNIC_PROBED) + return 0; + + rc = ibmvnic_login(netdev); + if (rc) { + adapter->state = reset_state; + return rc; + } + + rc = init_resources(adapter); + if (rc) + return rc; + + ibmvnic_disable_irqs(adapter); + + adapter->state = VNIC_CLOSED; + + if (reset_state == VNIC_CLOSED) + return 0; + + rc = __ibmvnic_open(netdev); + if (rc) + return IBMVNIC_OPEN_FAILED; + + /* refresh device's multicast list */ + ibmvnic_set_multi(netdev); + + /* kick napi */ + for (i = 0; i < adapter->req_rx_queues; i++) + napi_schedule(&adapter->napi[i]); + + return 0; +} + /** * do_reset returns zero if we are able to keep processing reset events, or * non-zero if we hit a fatal error and must halt. @@ -1738,6 +1818,8 @@ static int do_reset(struct ibmvnic_adapter *adapter, netdev_dbg(adapter->netdev, "Re-setting driver (%d)\n", rwi->reset_reason); + rtnl_lock(); + netif_carrier_off(netdev); adapter->reset_reason = rwi->reset_reason; @@ -1751,16 +1833,25 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (reset_state == VNIC_OPEN && adapter->reset_reason != VNIC_RESET_MOBILITY && adapter->reset_reason != VNIC_RESET_FAILOVER) { - rc = __ibmvnic_close(netdev); - if (rc) - return rc; - } + adapter->state = VNIC_CLOSING; - if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM || - adapter->wait_for_reset) { - release_resources(adapter); - release_sub_crqs(adapter, 1); - release_crq_queue(adapter); + /* Release the RTNL lock before link state change and + * re-acquire after the link state change to allow + * linkwatch_event to grab the RTNL lock and run during + * a reset. + */ + rtnl_unlock(); + rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); + rtnl_lock(); + if (rc) + goto out; + + if (adapter->state != VNIC_CLOSING) { + rc = -1; + goto out; + } + + adapter->state = VNIC_CLOSED; } if (adapter->reset_reason != VNIC_RESET_NON_FATAL) { @@ -1769,9 +1860,7 @@ static int do_reset(struct ibmvnic_adapter *adapter, */ adapter->state = VNIC_PROBED; - if (adapter->wait_for_reset) { - rc = init_crq_queue(adapter); - } else if (adapter->reset_reason == VNIC_RESET_MOBILITY) { + if (adapter->reset_reason == VNIC_RESET_MOBILITY) { rc = ibmvnic_reenable_crq_queue(adapter); release_sub_crqs(adapter, 1); } else { @@ -1783,36 +1872,35 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (rc) { netdev_err(adapter->netdev, "Couldn't initialize crq. rc=%d\n", rc); - return rc; + goto out; } rc = ibmvnic_reset_init(adapter); - if (rc) - return IBMVNIC_INIT_FAILED; + if (rc) { + rc = IBMVNIC_INIT_FAILED; + goto out; + } /* If the adapter was in PROBE state prior to the reset, * exit here. */ - if (reset_state == VNIC_PROBED) - return 0; + if (reset_state == VNIC_PROBED) { + rc = 0; + goto out; + } rc = ibmvnic_login(netdev); if (rc) { adapter->state = reset_state; - return rc; + goto out; } - if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM || - adapter->wait_for_reset) { - rc = init_resources(adapter); - if (rc) - return rc; - } else if (adapter->req_rx_queues != old_num_rx_queues || - adapter->req_tx_queues != old_num_tx_queues || - adapter->req_rx_add_entries_per_subcrq != - old_num_rx_slots || - adapter->req_tx_entries_per_subcrq != - old_num_tx_slots) { + if (adapter->req_rx_queues != old_num_rx_queues || + adapter->req_tx_queues != old_num_tx_queues || + adapter->req_rx_add_entries_per_subcrq != + old_num_rx_slots || + adapter->req_tx_entries_per_subcrq != + old_num_tx_slots) { release_rx_pools(adapter); release_tx_pools(adapter); release_napi(adapter); @@ -1820,32 +1908,30 @@ static int do_reset(struct ibmvnic_adapter *adapter, rc = init_resources(adapter); if (rc) - return rc; + goto out; } else { rc = reset_tx_pools(adapter); if (rc) - return rc; + goto out; rc = reset_rx_pools(adapter); if (rc) - return rc; + goto out; } ibmvnic_disable_irqs(adapter); } adapter->state = VNIC_CLOSED; - if (reset_state == VNIC_CLOSED) - return 0; + if (reset_state == VNIC_CLOSED) { + rc = 0; + goto out; + } rc = __ibmvnic_open(netdev); if (rc) { - if (list_empty(&adapter->rwi_list)) - adapter->state = VNIC_CLOSED; - else - adapter->state = reset_state; - - return 0; + rc = IBMVNIC_OPEN_FAILED; + goto out; } /* refresh device's multicast list */ @@ -1855,11 +1941,15 @@ static int do_reset(struct ibmvnic_adapter *adapter, for (i = 0; i < adapter->req_rx_queues; i++) napi_schedule(&adapter->napi[i]); - if (adapter->reset_reason != VNIC_RESET_FAILOVER && - adapter->reset_reason != VNIC_RESET_CHANGE_PARAM) + if (adapter->reset_reason != VNIC_RESET_FAILOVER) call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, netdev); - return 0; + rc = 0; + +out: + rtnl_unlock(); + + return rc; } static int do_hard_reset(struct ibmvnic_adapter *adapter, @@ -1919,14 +2009,8 @@ static int do_hard_reset(struct ibmvnic_adapter *adapter, return 0; rc = __ibmvnic_open(netdev); - if (rc) { - if (list_empty(&adapter->rwi_list)) - adapter->state = VNIC_CLOSED; - else - adapter->state = reset_state; - - return 0; - } + if (rc) + return IBMVNIC_OPEN_FAILED; return 0; } @@ -1965,20 +2049,11 @@ static void __ibmvnic_reset(struct work_struct *work) { struct ibmvnic_rwi *rwi; struct ibmvnic_adapter *adapter; - bool we_lock_rtnl = false; u32 reset_state; int rc = 0; adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); - /* netif_set_real_num_xx_queues needs to take rtnl lock here - * unless wait_for_reset is set, in which case the rtnl lock - * has already been taken before initializing the reset - */ - if (!adapter->wait_for_reset) { - rtnl_lock(); - we_lock_rtnl = true; - } reset_state = adapter->state; rwi = get_next_rwi(adapter); @@ -1990,14 +2065,32 @@ static void __ibmvnic_reset(struct work_struct *work) break; } - if (adapter->force_reset_recovery) { - adapter->force_reset_recovery = false; - rc = do_hard_reset(adapter, rwi, reset_state); + if (rwi->reset_reason == VNIC_RESET_CHANGE_PARAM) { + /* CHANGE_PARAM requestor holds rtnl_lock */ + rc = do_change_param_reset(adapter, rwi, reset_state); + } else if (adapter->force_reset_recovery) { + /* Transport event occurred during previous reset */ + if (adapter->wait_for_reset) { + /* Previous was CHANGE_PARAM; caller locked */ + adapter->force_reset_recovery = false; + rc = do_hard_reset(adapter, rwi, reset_state); + } else { + rtnl_lock(); + adapter->force_reset_recovery = false; + rc = do_hard_reset(adapter, rwi, reset_state); + rtnl_unlock(); + } } else { rc = do_reset(adapter, rwi, reset_state); } kfree(rwi); - if (rc && rc != IBMVNIC_INIT_FAILED && + if (rc == IBMVNIC_OPEN_FAILED) { + if (list_empty(&adapter->rwi_list)) + adapter->state = VNIC_CLOSED; + else + adapter->state = reset_state; + rc = 0; + } else if (rc && rc != IBMVNIC_INIT_FAILED && !adapter->force_reset_recovery) break; @@ -2005,7 +2098,6 @@ static void __ibmvnic_reset(struct work_struct *work) } if (adapter->wait_for_reset) { - adapter->wait_for_reset = false; adapter->reset_done_rc = rc; complete(&adapter->reset_done); } @@ -2016,8 +2108,6 @@ static void __ibmvnic_reset(struct work_struct *work) } adapter->resetting = false; - if (we_lock_rtnl) - rtnl_unlock(); } static int ibmvnic_reset(struct ibmvnic_adapter *adapter, @@ -2078,8 +2168,6 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, return 0; err: - if (adapter->wait_for_reset) - adapter->wait_for_reset = false; return -ret; } diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 70bd286f8932..9d3d35cc91d6 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -20,6 +20,7 @@ #define IBMVNIC_INVALID_MAP -1 #define IBMVNIC_STATS_TIMEOUT 1 #define IBMVNIC_INIT_FAILED 2 +#define IBMVNIC_OPEN_FAILED 3 /* basic structures plus 100 2k buffers */ #define IBMVNIC_IO_ENTITLEMENT_DEFAULT 610305 From 7ed5b31f4a6695a21f617df07646e9b15c6c1d29 Mon Sep 17 00:00:00 2001 From: Juliet Kim Date: Fri, 20 Sep 2019 16:11:23 -0400 Subject: [PATCH 1115/2880] net/ibmvnic: prevent more than one thread from running in reset The current code allows more than one thread to run in reset. This can corrupt struct adapter data. Check adapter->resetting before performing a reset, if there is another reset running delay (100 msec) before trying again. Signed-off-by: Juliet Kim Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 40 ++++++++++++++++++++++-------- drivers/net/ethernet/ibm/ibmvnic.h | 5 +++- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index d7db5cc51f6a..2b073a3c0b84 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1207,7 +1207,7 @@ static void ibmvnic_cleanup(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); /* ensure that transmissions are stopped if called by do_reset */ - if (adapter->resetting) + if (test_bit(0, &adapter->resetting)) netif_tx_disable(netdev); else netif_tx_stop_all_queues(netdev); @@ -1428,7 +1428,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) u8 proto = 0; netdev_tx_t ret = NETDEV_TX_OK; - if (adapter->resetting) { + if (test_bit(0, &adapter->resetting)) { if (!netif_subqueue_stopped(netdev, skb)) netif_stop_subqueue(netdev, queue_num); dev_kfree_skb_any(skb); @@ -2054,6 +2054,12 @@ static void __ibmvnic_reset(struct work_struct *work) adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); + if (test_and_set_bit_lock(0, &adapter->resetting)) { + schedule_delayed_work(&adapter->ibmvnic_delayed_reset, + IBMVNIC_RESET_DELAY); + return; + } + reset_state = adapter->state; rwi = get_next_rwi(adapter); @@ -2095,6 +2101,10 @@ static void __ibmvnic_reset(struct work_struct *work) break; rwi = get_next_rwi(adapter); + + if (rwi && (rwi->reset_reason == VNIC_RESET_FAILOVER || + rwi->reset_reason == VNIC_RESET_MOBILITY)) + adapter->force_reset_recovery = true; } if (adapter->wait_for_reset) { @@ -2107,7 +2117,16 @@ static void __ibmvnic_reset(struct work_struct *work) free_all_rwi(adapter); } - adapter->resetting = false; + clear_bit_unlock(0, &adapter->resetting); +} + +static void __ibmvnic_delayed_reset(struct work_struct *work) +{ + struct ibmvnic_adapter *adapter; + + adapter = container_of(work, struct ibmvnic_adapter, + ibmvnic_delayed_reset.work); + __ibmvnic_reset(&adapter->ibmvnic_reset); } static int ibmvnic_reset(struct ibmvnic_adapter *adapter, @@ -2162,7 +2181,6 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); spin_unlock_irqrestore(&adapter->rwi_lock, flags); - adapter->resetting = true; netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason); schedule_work(&adapter->ibmvnic_reset); @@ -2207,7 +2225,7 @@ restart_poll: u16 offset; u8 flags = 0; - if (unlikely(adapter->resetting && + if (unlikely(test_bit(0, &adapter->resetting) && adapter->reset_reason != VNIC_RESET_NON_FATAL)) { enable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]); napi_complete_done(napi, frames_processed); @@ -2858,7 +2876,7 @@ static int enable_scrq_irq(struct ibmvnic_adapter *adapter, return 1; } - if (adapter->resetting && + if (test_bit(0, &adapter->resetting) && adapter->reset_reason == VNIC_RESET_MOBILITY) { u64 val = (0xff000000) | scrq->hw_irq; @@ -3408,7 +3426,7 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter, if (rc) { if (rc == H_CLOSED) { dev_warn(dev, "CRQ Queue closed\n"); - if (adapter->resetting) + if (test_bit(0, &adapter->resetting)) ibmvnic_reset(adapter, VNIC_RESET_FATAL); } @@ -4484,7 +4502,7 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, case IBMVNIC_CRQ_XPORT_EVENT: netif_carrier_off(netdev); adapter->crq.active = false; - if (adapter->resetting) + if (test_bit(0, &adapter->resetting)) adapter->force_reset_recovery = true; if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) { dev_info(dev, "Migrated, re-enabling adapter\n"); @@ -4822,7 +4840,7 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter) return -1; } - if (adapter->resetting && !adapter->wait_for_reset && + if (test_bit(0, &adapter->resetting) && !adapter->wait_for_reset && adapter->reset_reason != VNIC_RESET_MOBILITY) { if (adapter->req_rx_queues != old_num_rx_queues || adapter->req_tx_queues != old_num_tx_queues) { @@ -4934,10 +4952,12 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) spin_lock_init(&adapter->stats_lock); INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset); + INIT_DELAYED_WORK(&adapter->ibmvnic_delayed_reset, + __ibmvnic_delayed_reset); INIT_LIST_HEAD(&adapter->rwi_list); spin_lock_init(&adapter->rwi_lock); init_completion(&adapter->init_done); - adapter->resetting = false; + clear_bit(0, &adapter->resetting); do { rc = init_crq_queue(adapter); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 9d3d35cc91d6..ebc39248b334 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -39,6 +39,8 @@ #define IBMVNIC_MAX_LTB_SIZE ((1 << (MAX_ORDER - 1)) * PAGE_SIZE) #define IBMVNIC_BUFFER_HLEN 500 +#define IBMVNIC_RESET_DELAY 100 + static const char ibmvnic_priv_flags[][ETH_GSTRING_LEN] = { #define IBMVNIC_USE_SERVER_MAXES 0x1 "use-server-maxes" @@ -1077,7 +1079,8 @@ struct ibmvnic_adapter { spinlock_t rwi_lock; struct list_head rwi_list; struct work_struct ibmvnic_reset; - bool resetting; + struct delayed_work ibmvnic_delayed_reset; + unsigned long resetting; bool napi_enabled, from_passive_init; bool failover_pending; From 4c247de564f1ff614d11b3bb5313fb70d7b9598b Mon Sep 17 00:00:00 2001 From: Takeshi Misawa Date: Sun, 22 Sep 2019 16:45:31 +0900 Subject: [PATCH 1116/2880] ppp: Fix memory leak in ppp_write When ppp is closing, __ppp_xmit_process() failed to enqueue skb and skb allocated in ppp_write() is leaked. syzbot reported : BUG: memory leak unreferenced object 0xffff88812a17bc00 (size 224): comm "syz-executor673", pid 6952, jiffies 4294942888 (age 13.040s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000d110fff9>] kmemleak_alloc_recursive include/linux/kmemleak.h:43 [inline] [<00000000d110fff9>] slab_post_alloc_hook mm/slab.h:522 [inline] [<00000000d110fff9>] slab_alloc_node mm/slab.c:3262 [inline] [<00000000d110fff9>] kmem_cache_alloc_node+0x163/0x2f0 mm/slab.c:3574 [<000000002d616113>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:197 [<000000000167fc45>] alloc_skb include/linux/skbuff.h:1055 [inline] [<000000000167fc45>] ppp_write+0x48/0x120 drivers/net/ppp/ppp_generic.c:502 [<000000009ab42c0b>] __vfs_write+0x43/0xa0 fs/read_write.c:494 [<00000000086b2e22>] vfs_write fs/read_write.c:558 [inline] [<00000000086b2e22>] vfs_write+0xee/0x210 fs/read_write.c:542 [<00000000a2b70ef9>] ksys_write+0x7c/0x130 fs/read_write.c:611 [<00000000ce5e0fdd>] __do_sys_write fs/read_write.c:623 [inline] [<00000000ce5e0fdd>] __se_sys_write fs/read_write.c:620 [inline] [<00000000ce5e0fdd>] __x64_sys_write+0x1e/0x30 fs/read_write.c:620 [<00000000d9d7b370>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:296 [<0000000006e6d506>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by freeing skb, if ppp is closing. Fixes: 6d066734e9f0 ("ppp: avoid loop in xmit recursion detection code") Reported-and-tested-by: syzbot+d9c8bf24e56416d7ce2c@syzkaller.appspotmail.com Signed-off-by: Takeshi Misawa Reviewed-by: Guillaume Nault Tested-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_generic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index a30e41a56085..9a1b006904a7 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1415,6 +1415,8 @@ static void __ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb) netif_wake_queue(ppp->dev); else netif_stop_queue(ppp->dev); + } else { + kfree_skb(skb); } ppp_xmit_unlock(ppp); } From 5c94ad1793f1c8743388f329d55a61e5001dc58e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 22 Sep 2019 13:42:16 +0200 Subject: [PATCH 1117/2880] atm: he: clean up an indentation issue There is a statement that is indented one level too many, remove the extraneous tab. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/atm/he.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/atm/he.c b/drivers/atm/he.c index 70b00ae4ec38..8af793f5e811 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -1690,7 +1690,7 @@ he_service_rbrq(struct he_dev *he_dev, int group) if (RBRQ_HBUF_ERR(he_dev->rbrq_head)) { hprintk("HBUF_ERR! (cid 0x%x)\n", cid); - atomic_inc(&vcc->stats->rx_drop); + atomic_inc(&vcc->stats->rx_drop); goto return_host_buffers; } From 9f5c44cf61a7cf29d85d2c0d2065ab2d62764a41 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 14:16:03 +0800 Subject: [PATCH 1118/2880] gianfar: Make reset_gfar static Fix sparse warning: drivers/net/ethernet/freescale/gianfar.c:2070:6: warning: symbol 'reset_gfar' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 24bf7f68375f..51ad86417cb1 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -2067,7 +2067,7 @@ static int gfar_change_mtu(struct net_device *dev, int new_mtu) return 0; } -void reset_gfar(struct net_device *ndev) +static void reset_gfar(struct net_device *ndev) { struct gfar_private *priv = netdev_priv(ndev); From b0ce902febef24f917c33d5a5982030dac53141d Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 23 Sep 2019 09:49:08 +0200 Subject: [PATCH 1119/2880] net: stmmac: selftests: Flow Control test can also run with ASYM Pause The Flow Control selftest is also available with ASYM Pause. Lets add this check to the test and fix eventual false positive failures. Fixes: 091810dbded9 ("net: stmmac: Introduce selftests support") Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c index 9c8d210b2d6a..5f66f6161629 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c @@ -670,7 +670,7 @@ static int stmmac_test_flowctrl(struct stmmac_priv *priv) unsigned int pkt_count; int i, ret = 0; - if (!phydev || !phydev->pause) + if (!phydev || (!phydev->pause && !phydev->asym_pause)) return -EOPNOTSUPP; tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); From 99dcb8432af062fffd5eb81692966a0de9ccc072 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 23 Sep 2019 14:03:51 +0530 Subject: [PATCH 1120/2880] net: macb: Remove dead code macb_64b_desc is always called when HW_DMA_CAP_64B is defined. So the return NULL can never be reached. Remove the dead code. Signed-off-by: Shubhrajyoti Datta Reviewed-by: Claudiu Beznea Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 35b59b5edf0f..8e8d557901a9 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -165,9 +165,8 @@ static unsigned int macb_adj_dma_desc_idx(struct macb *bp, unsigned int desc_idx #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT static struct macb_dma_desc_64 *macb_64b_desc(struct macb *bp, struct macb_dma_desc *desc) { - if (bp->hw_dma_cap & HW_DMA_CAP_64B) - return (struct macb_dma_desc_64 *)((void *)desc + sizeof(struct macb_dma_desc)); - return NULL; + return (struct macb_dma_desc_64 *)((void *)desc + + sizeof(struct macb_dma_desc)); } #endif From faef87494139cf2cc4d188d5730251ade9b2022d Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Thu, 19 Sep 2019 15:43:02 -0500 Subject: [PATCH 1121/2880] perf vendor events amd: Add L3 cache events for Family 17h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow users to symbolically specify L3 events for Family 17h processors using the existing AMD Uncore driver. Source of events descriptions are from section 2.1.15.4.1 "L3 Cache PMC Events" of the latest Family 17h PPR, available here: https://www.amd.com/system/files/TechDocs/55570-B1_PUB.zip Opnly BriefDescriptions added, since they show with and without the -v and --details flags. Tested with: # perf stat -e l3_request_g1.caching_l3_cache_accesses,amd_l3/event=0x01,umask=0x80/,l3_comb_clstr_state.request_miss,amd_l3/event=0x06,umask=0x01/ perf bench mem memcpy -s 4mb -l 100 -f default ... 7,006,831 l3_request_g1.caching_l3_cache_accesses 7,006,830 amd_l3/event=0x01,umask=0x80/ 366,530 l3_comb_clstr_state.request_miss 366,568 amd_l3/event=0x06,umask=0x01/ Signed-off-by: Kim Phillips Reviewed-by: Andi Kleen Cc: Alexander Shishkin Cc: Andi Kleen Cc: Borislav Petkov Cc: Janakarajan Natarajan Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Luke Mujica Cc: Martin Liška Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190919204306.12598-1-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/amdfam17h/cache.json | 42 +++++++++++++++++++ tools/perf/pmu-events/jevents.c | 1 + 2 files changed, 43 insertions(+) diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json b/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json index fad4af9142cb..6221a840fcea 100644 --- a/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json +++ b/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json @@ -283,5 +283,47 @@ "BriefDescription": "Total cycles spent with one or more fill requests in flight from L2.", "PublicDescription": "Total cycles spent with one or more fill requests in flight from L2.", "UMask": "0x1" + }, + { + "EventName": "l3_request_g1.caching_l3_cache_accesses", + "EventCode": "0x01", + "BriefDescription": "Caching: L3 cache accesses", + "UMask": "0x80", + "Unit": "L3PMC" + }, + { + "EventName": "l3_lookup_state.all_l3_req_typs", + "EventCode": "0x04", + "BriefDescription": "All L3 Request Types", + "UMask": "0xff", + "Unit": "L3PMC" + }, + { + "EventName": "l3_comb_clstr_state.other_l3_miss_typs", + "EventCode": "0x06", + "BriefDescription": "Other L3 Miss Request Types", + "UMask": "0xfe", + "Unit": "L3PMC" + }, + { + "EventName": "l3_comb_clstr_state.request_miss", + "EventCode": "0x06", + "BriefDescription": "L3 cache misses", + "UMask": "0x01", + "Unit": "L3PMC" + }, + { + "EventName": "xi_sys_fill_latency", + "EventCode": "0x90", + "BriefDescription": "L3 Cache Miss Latency. Total cycles for all transactions divided by 16. Ignores SliceMask and ThreadMask.", + "UMask": "0x00", + "Unit": "L3PMC" + }, + { + "EventName": "xi_ccx_sdp_req1.all_l3_miss_req_typs", + "EventCode": "0x9a", + "BriefDescription": "All L3 Miss Request Types. Ignores SliceMask and ThreadMask.", + "UMask": "0x3f", + "Unit": "L3PMC" } ] diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index d413761621b0..9e37287da924 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -239,6 +239,7 @@ static struct map { { "hisi_sccl,ddrc", "hisi_sccl,ddrc" }, { "hisi_sccl,hha", "hisi_sccl,hha" }, { "hisi_sccl,l3c", "hisi_sccl,l3c" }, + { "L3PMC", "amd_l3" }, {} }; From 0c03d3aa255b5d3a7b64051a79f6e9f487194a9f Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Thu, 19 Sep 2019 15:43:03 -0500 Subject: [PATCH 1122/2880] perf vendor events amd: Remove redundant '[' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the redundant '['. 'perf list' output before: ex_ret_brn [[Retired Branch Instructions] 'perf list' output after: ex_ret_brn [Retired Branch Instructions] Fixes: 98c07a8f74f8 ("perf vendor events amd: perf PMU events for AMD Family 17h") Signed-off-by: Kim Phillips Reviewed-by: Andi Kleen Cc: Alexander Shishkin Cc: Borislav Petkov Cc: Janakarajan Natarajan Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Luke Mujica Cc: Martin Liška Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190919204306.12598-2-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/amdfam17h/core.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/core.json b/tools/perf/pmu-events/arch/x86/amdfam17h/core.json index 7b285b0a7f35..1079544eeed5 100644 --- a/tools/perf/pmu-events/arch/x86/amdfam17h/core.json +++ b/tools/perf/pmu-events/arch/x86/amdfam17h/core.json @@ -13,7 +13,7 @@ { "EventName": "ex_ret_brn", "EventCode": "0xc2", - "BriefDescription": "[Retired Branch Instructions.", + "BriefDescription": "Retired Branch Instructions.", "PublicDescription": "The number of branch instructions retired. This includes all types of architectural control flow changes, including exceptions and interrupts." }, { From 93125562ce385fc0322e010003767d656677bdef Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Thu, 19 Sep 2019 15:43:04 -0500 Subject: [PATCH 1123/2880] perf vendor events: Minor fixes to the README MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some grammatical fixes, and updates to some path references that have since changed. Signed-off-by: Kim Phillips Reviewed-by: Andi Kleen Cc: Alexander Shishkin Cc: Borislav Petkov Cc: Janakarajan Natarajan Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Luke Mujica Cc: Martin Liška Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190919204306.12598-3-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/README | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/pmu-events/README b/tools/perf/pmu-events/README index e62b09b6a844..de7efa2cebd1 100644 --- a/tools/perf/pmu-events/README +++ b/tools/perf/pmu-events/README @@ -30,9 +30,9 @@ the topic. Eg: "Floating-point.json". All the topic JSON files for a CPU model/family should be in a separate sub directory. Thus for the Silvermont X86 CPU: - $ ls tools/perf/pmu-events/arch/x86/Silvermont_core - Cache.json Memory.json Virtual-Memory.json - Frontend.json Pipeline.json + $ ls tools/perf/pmu-events/arch/x86/silvermont + cache.json memory.json virtual-memory.json + frontend.json pipeline.json The JSONs folder for a CPU model/family may be placed in the root arch folder, or may be placed in a vendor sub-folder under the arch folder @@ -94,7 +94,7 @@ users to specify events by their name: where 'pm_1plus_ppc_cmpl' is a Power8 PMU event. -However some errors in processing may cause the perf build to fail. +However some errors in processing may cause the alias build to fail. Mapfile format =============== @@ -119,7 +119,7 @@ where: Header line The header line is the first line in the file, which is - always _IGNORED_. It can empty. + always _IGNORED_. It can be empty. CPUID: CPUID is an arch-specific char string, that can be used @@ -138,15 +138,15 @@ where: files, relative to the directory containing the mapfile.csv Type: - indicates whether the events or "core" or "uncore" events. + indicates whether the events are "core" or "uncore" events. Eg: - $ grep Silvermont tools/perf/pmu-events/arch/x86/mapfile.csv - GenuineIntel-6-37,V13,Silvermont_core,core - GenuineIntel-6-4D,V13,Silvermont_core,core - GenuineIntel-6-4C,V13,Silvermont_core,core + $ grep silvermont tools/perf/pmu-events/arch/x86/mapfile.csv + GenuineIntel-6-37,v13,silvermont,core + GenuineIntel-6-4D,v13,silvermont,core + GenuineIntel-6-4C,v13,silvermont,core i.e the three CPU models use the JSON files (i.e PMU events) listed - in the directory 'tools/perf/pmu-events/arch/x86/Silvermont_core'. + in the directory 'tools/perf/pmu-events/arch/x86/silvermont'. From 8788d36950dae26074dc63b46eab3debb4b210b4 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Thu, 19 Sep 2019 15:43:05 -0500 Subject: [PATCH 1124/2880] perf list: Allow plurals for metric, metricgroup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhance usability by allowing the same plurality used in the output title, for the command line parameter. BEFORE, perf deceitfully acts as if there are no metrics to be had: $ perf list metrics List of pre-defined events (to be used in -e): Metric Groups: $ But singular 'metric' shows a list of metrics: $ perf list metric List of pre-defined events (to be used in -e): Metrics: IPC [Instructions Per Cycle (per logical thread)] UPI [Uops Per Instruction] AFTER, when asking for 'metrics', we actually see the metrics get listed: $ perf list metrics List of pre-defined events (to be used in -e): Metrics: IPC [Instructions Per Cycle (per logical thread)] UPI [Uops Per Instruction] Fixes: 71b0acce78d1 ("perf list: Add metric groups to perf list") Signed-off-by: Kim Phillips Reviewed-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Borislav Petkov Cc: Janakarajan Natarajan Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Luke Mujica Cc: Martin Liška Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190919204306.12598-4-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index e290f6b348d8..08e62ae9d37e 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -81,9 +81,9 @@ int cmd_list(int argc, const char **argv) long_desc_flag, details_flag); else if (strcmp(argv[i], "sdt") == 0) print_sdt_events(NULL, NULL, raw_dump); - else if (strcmp(argv[i], "metric") == 0) + else if (strcmp(argv[i], "metric") == 0 || strcmp(argv[i], "metrics") == 0) metricgroup__print(true, false, NULL, raw_dump, details_flag); - else if (strcmp(argv[i], "metricgroup") == 0) + else if (strcmp(argv[i], "metricgroup") == 0 || strcmp(argv[i], "metricgroups") == 0) metricgroup__print(false, true, NULL, raw_dump, details_flag); else if ((sep = strchr(argv[i], ':')) != NULL) { int sep_idx; From 5c8da72dc21eff32deb00638e547e2c247ebbfb0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Sep 2019 16:51:19 -0400 Subject: [PATCH 1125/2880] libtraceevent: Round up in tep_print_event() time precision When testing the output of the old trace-cmd compared to the one that uses the updated tep_print_event() logic, it was different in that the time stamp precision in the old format would round up to the nearest precision, where as the new logic truncates. Bring back the old method of rounding up. Signed-off-by: Steven Rostedt (VMware) Cc: Jiri Olsa Cc: Tzvetomir Stoyanov Cc: linux trace devel Link: http://lore.kernel.org/lkml/20190919165119.5efa5de6@gandalf.local.home Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 6f842af4550b..d948475585ce 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -5527,8 +5527,10 @@ static void print_event_time(struct tep_handle *tep, struct trace_seq *s, if (divstr && isdigit(*(divstr + 1))) div = atoi(divstr + 1); time = record->ts; - if (div) + if (div) { + time += div / 2; time /= div; + } pr = prec; while (pr--) p10 *= 10; From 4ab91deacc9b8611fdae9055f926c8f99dde8411 Mon Sep 17 00:00:00 2001 From: Tzvetomir Stoyanov Date: Thu, 19 Sep 2019 17:23:36 -0400 Subject: [PATCH 1126/2880] libtraceevent: Man pages for libtraceevent event print related API Added new man page, describing tep_print_event() libtraceevent API. Signed-off-by: Tzvetomir Stoyanov Cc: Andrew Morton Cc: Jiri Olsa Cc: Namhyung Kim Cc: linux-trace-devel@vger.kernel.org Link: http://lore.kernel.org/linux-trace-devel/20190801075012.22098-1-tz.stoyanov@gmail.com Link: http://lore.kernel.org/lkml/20190919212541.553160178@goodmis.org Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Arnaldo Carvalho de Melo --- .../libtraceevent-event_print.txt | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 tools/lib/traceevent/Documentation/libtraceevent-event_print.txt diff --git a/tools/lib/traceevent/Documentation/libtraceevent-event_print.txt b/tools/lib/traceevent/Documentation/libtraceevent-event_print.txt new file mode 100644 index 000000000000..2c6a61811118 --- /dev/null +++ b/tools/lib/traceevent/Documentation/libtraceevent-event_print.txt @@ -0,0 +1,130 @@ +libtraceevent(3) +================ + +NAME +---- +tep_print_event - Writes event information into a trace sequence. + +SYNOPSIS +-------- +[verse] +-- +*#include * +*#include * + +void *tep_print_event*(struct tep_handle pass:[*]_tep_, struct trace_seqpass:[*]_s_, struct tep_record pass:[*]_record_, const char pass:[*]_fmt_, _..._) +-- + +DESCRIPTION +----------- + +The _tep_print_event()_ function parses the event information of the given +_record_ and writes it into the trace sequence _s_, according to the format +string _fmt_. The desired information is specified after the format string. +The _fmt_ is printf-like format string, following arguments are supported: +[verse] +-- + TEP_PRINT_PID, "%d" - PID of the event. + TEP_PRINT_CPU, "%d" - Event CPU. + TEP_PRINT_COMM, "%s" - Event command string. + TEP_PRINT_NAME, "%s" - Event name. + TEP_PRINT_LATENCY, "%s" - Latency of the event. It prints 4 or more + fields - interrupt state, scheduling state, + current context, and preemption count. + Field 1 is the interrupt enabled state: + d : Interrupts are disabled + . : Interrupts are enabled + X : The architecture does not support this + information + Field 2 is the "need resched" state. + N : The task is set to call the scheduler when + possible, as another higher priority task + may need to be scheduled in. + . : The task is not set to call the scheduler. + Field 3 is the context state. + . : Normal context + s : Soft interrupt context + h : Hard interrupt context + H : Hard interrupt context which triggered + during soft interrupt context. + z : NMI context + Z : NMI context which triggered during hard + interrupt context + Field 4 is the preemption count. + . : The preempt count is zero. + On preemptible kernels (where the task can be scheduled + out in arbitrary locations while in kernel context), the + preempt count, when non zero, will prevent the kernel + from scheduling out the current task. The preempt count + number is displayed when it is not zero. + Depending on the kernel, it may show other fields + (lock depth, or migration disabled, which are unique to + specialized kernels). + TEP_PRINT_TIME, %d - event time stamp. A divisor and precision can be + specified as part of this format string: + "%precision.divisord". Example: + "%3.1000d" - divide the time by 1000 and print the first + 3 digits before the dot. Thus, the time stamp + "123456000" will be printed as "123.456" + TEP_PRINT_INFO, "%s" - event information. + TEP_PRINT_INFO_RAW, "%s" - event information, in raw format. + +-- +EXAMPLE +------- +[source,c] +-- +#include +#include +... +struct trace_seq seq; +trace_seq_init(&seq); +struct tep_handle *tep = tep_alloc(); +... +void print_my_event(struct tep_record *record) +{ + trace_seq_reset(&seq); + tep_print_event(tep, s, record, "%16s-%-5d [%03d] %s %6.1000d %s %s", + TEP_PRINT_COMM, TEP_PRINT_PID, TEP_PRINT_CPU, + TEP_PRINT_LATENCY, TEP_PRINT_TIME, TEP_PRINT_NAME, + TEP_PRINT_INFO); +} +... +-- + +FILES +----- +[verse] +-- +*event-parse.h* + Header file to include in order to have access to the library APIs. +*trace-seq.h* + Header file to include in order to have access to trace sequences related APIs. + Trace sequences are used to allow a function to call several other functions + to create a string of data to use. +*-ltraceevent* + Linker switch to add when building a program that uses the library. +-- + +SEE ALSO +-------- +_libtraceevent(3)_, _trace-cmd(1)_ + +AUTHOR +------ +[verse] +-- +*Steven Rostedt* , author of *libtraceevent*. +*Tzvetomir Stoyanov* , author of this man page. +-- +REPORTING BUGS +-------------- +Report bugs to + +LICENSE +------- +libtraceevent is Free Software licensed under the GNU LGPL 2.1 + +RESOURCES +--------- +https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git From 38f76c31368b7a8b88659c178f9e620b02660a70 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Sep 2019 17:23:37 -0400 Subject: [PATCH 1127/2880] libtraceevent: Man pages fix, rename tep_ref_get() to tep_get_ref() The tep_ref_get() was renamed to tep_get_ref(), to be more consistent with the other tep_ref_* APIs. However, in the man pages the API is still with the old name. The documentation is fixed to reflect the actual name of the API. Signed-off-by: Tzvetomir Stoyanov (VMware) Cc: Andrew Morton Cc: Jiri Olsa Cc: Namhyung Kim Cc: Tzvetomir Stoyanov (VMware) Cc: linux-trace-devel@vger.kernel.org Link: http://lore.kernel.org/linux-trace-devel/20190808113636.13299-2-tz.stoyanov@gmail.com Link: http://lore.kernel.org/lkml/20190919212541.697034573@goodmis.org Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Arnaldo Carvalho de Melo --- .../lib/traceevent/Documentation/libtraceevent-handle.txt | 8 ++++---- tools/lib/traceevent/Documentation/libtraceevent.txt | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/lib/traceevent/Documentation/libtraceevent-handle.txt b/tools/lib/traceevent/Documentation/libtraceevent-handle.txt index 8d568316847d..45b20172e262 100644 --- a/tools/lib/traceevent/Documentation/libtraceevent-handle.txt +++ b/tools/lib/traceevent/Documentation/libtraceevent-handle.txt @@ -3,7 +3,7 @@ libtraceevent(3) NAME ---- -tep_alloc, tep_free,tep_ref, tep_unref,tep_ref_get - Create, destroy, manage +tep_alloc, tep_free,tep_ref, tep_unref,tep_get_ref - Create, destroy, manage references of trace event parser context. SYNOPSIS @@ -16,7 +16,7 @@ struct tep_handle pass:[*]*tep_alloc*(void); void *tep_free*(struct tep_handle pass:[*]_tep_); void *tep_ref*(struct tep_handle pass:[*]_tep_); void *tep_unref*(struct tep_handle pass:[*]_tep_); -int *tep_ref_get*(struct tep_handle pass:[*]_tep_); +int *tep_get_ref*(struct tep_handle pass:[*]_tep_); -- DESCRIPTION @@ -57,9 +57,9 @@ EXAMPLE ... struct tep_handle *tep = tep_alloc(); ... -int ref = tep_ref_get(tep); +int ref = tep_get_ref(tep); tep_ref(tep); -if ( (ref+1) != tep_ref_get(tep)) { +if ( (ref+1) != tep_get_ref(tep)) { /* Something wrong happened, the counter is not incremented by 1 */ } tep_unref(tep); diff --git a/tools/lib/traceevent/Documentation/libtraceevent.txt b/tools/lib/traceevent/Documentation/libtraceevent.txt index fbd977b47de1..00519503c8de 100644 --- a/tools/lib/traceevent/Documentation/libtraceevent.txt +++ b/tools/lib/traceevent/Documentation/libtraceevent.txt @@ -16,7 +16,7 @@ Management of tep handler data structure and access of its members: void *tep_free*(struct tep_handle pass:[*]_tep_); void *tep_ref*(struct tep_handle pass:[*]_tep_); void *tep_unref*(struct tep_handle pass:[*]_tep_); - int *tep_ref_get*(struct tep_handle pass:[*]_tep_); + int *tep_get_ref*(struct tep_handle pass:[*]_tep_); void *tep_set_flag*(struct tep_handle pass:[*]_tep_, enum tep_flag _flag_); void *tep_clear_flag*(struct tep_handle pass:[*]_tep_, enum tep_flag _flag_); bool *tep_test_flag*(struct tep_handle pass:[*]_tep_, enum tep_flag _flags_); From f8d16387fffba1c7c88b5570b09891ba8a26cfd5 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Sep 2019 17:23:38 -0400 Subject: [PATCH 1128/2880] libtraceevent: Man pages fix, changes in event printing APIs APIs for printing various trace event information were redesigned to be more simple. However, the main libtraceevent man page was not updated with those changes. The documentation is updated to describe the new event print API. Signed-off-by: Tzvetomir Stoyanov (VMware) Cc: Andrew Morton Cc: Jiri Olsa Cc: Namhyung Kim Cc: Tzvetomir Stoyanov (VMware) Cc: linux-trace-devel@vger.kernel.org Link: http://lore.kernel.org/linux-trace-devel/20190808113636.13299-3-tz.stoyanov@gmail.com Link: http://lore.kernel.org/lkml/20190919212541.869643036@goodmis.org Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Arnaldo Carvalho de Melo --- .../lib/traceevent/Documentation/libtraceevent.txt | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tools/lib/traceevent/Documentation/libtraceevent.txt b/tools/lib/traceevent/Documentation/libtraceevent.txt index 00519503c8de..d530a7ce8fb2 100644 --- a/tools/lib/traceevent/Documentation/libtraceevent.txt +++ b/tools/lib/traceevent/Documentation/libtraceevent.txt @@ -26,15 +26,12 @@ Management of tep handler data structure and access of its members: void *tep_set_long_size*(struct tep_handle pass:[*]_tep_, int _long_size_); int *tep_get_page_size*(struct tep_handle pass:[*]_tep_); void *tep_set_page_size*(struct tep_handle pass:[*]_tep_, int _page_size_); - bool *tep_is_latency_format*(struct tep_handle pass:[*]_tep_); - void *tep_set_latency_format*(struct tep_handle pass:[*]_tep_, int _lat_); int *tep_get_header_page_size*(struct tep_handle pass:[*]_tep_); int *tep_get_header_timestamp_size*(struct tep_handle pass:[*]_tep_); bool *tep_is_old_format*(struct tep_handle pass:[*]_tep_); int *tep_strerror*(struct tep_handle pass:[*]_tep_, enum tep_errno _errnum_, char pass:[*]_buf_, size_t _buflen_); Register / unregister APIs: - int *tep_register_trace_clock*(struct tep_handle pass:[*]_tep_, const char pass:[*]_trace_clock_); int *tep_register_function*(struct tep_handle pass:[*]_tep_, char pass:[*]_name_, unsigned long long _addr_, char pass:[*]_mod_); int *tep_register_event_handler*(struct tep_handle pass:[*]_tep_, int _id_, const char pass:[*]_sys_name_, const char pass:[*]_event_name_, tep_event_handler_func _func_, void pass:[*]_context_); int *tep_unregister_event_handler*(struct tep_handle pass:[*]tep, int id, const char pass:[*]sys_name, const char pass:[*]event_name, tep_event_handler_func func, void pass:[*]_context_); @@ -57,14 +54,7 @@ Event related APIs: int *tep_get_events_count*(struct tep_handle pass:[*]_tep_); struct tep_event pass:[*]pass:[*]*tep_list_events*(struct tep_handle pass:[*]_tep_, enum tep_event_sort_type _sort_type_); struct tep_event pass:[*]pass:[*]*tep_list_events_copy*(struct tep_handle pass:[*]_tep_, enum tep_event_sort_type _sort_type_); - -Event printing: - void *tep_print_event*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_record pass:[*]_record_, bool _use_trace_clock_); - void *tep_print_event_data*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_event pass:[*]_event_, struct tep_record pass:[*]_record_); - void *tep_event_info*(struct trace_seq pass:[*]_s_, struct tep_event pass:[*]_event_, struct tep_record pass:[*]_record_); - void *tep_print_event_task*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_event pass:[*]_event_, struct tep_record pass:[*]_record_); - void *tep_print_event_time*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_event pass:[*]_event_, struct tep_record pass:[*]record, bool _use_trace_clock_); - void *tep_set_print_raw*(struct tep_handle pass:[*]_tep_, int _print_raw_); + void *tep_print_event*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_record pass:[*]_record_, const char pass:[*]_fmt_, _..._); Event finding: struct tep_event pass:[*]*tep_find_event*(struct tep_handle pass:[*]_tep_, int _id_); @@ -116,7 +106,6 @@ Filter management: int *tep_filter_compare*(struct tep_event_filter pass:[*]_filter1_, struct tep_event_filter pass:[*]_filter2_); Parsing various data from the records: - void *tep_data_latency_format*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_record pass:[*]_record_); int *tep_data_type*(struct tep_handle pass:[*]_tep_, struct tep_record pass:[*]_rec_); int *tep_data_pid*(struct tep_handle pass:[*]_tep_, struct tep_record pass:[*]_rec_); int *tep_data_preempt_count*(struct tep_handle pass:[*]_tep_, struct tep_record pass:[*]_rec_); From d69094f364d012f6d0be712969e6a6a355b69e84 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Sep 2019 17:23:39 -0400 Subject: [PATCH 1129/2880] libtraceevent: Add tep_get_event() in event-parse.h The tep_get_event() function is an official libtracevent API, described in the library man pages. However, it cannot be used by the library users because it is not declared in the event-parse.h file, where all libtracevent APIs are. The function declaration is added in event-parse.h file. Signed-off-by: Tzvetomir Stoyanov (VMware) Cc: Andrew Morton Cc: Jiri Olsa Cc: Namhyung Kim Cc: Tzvetomir Stoyanov (VMware) Cc: linux-trace-devel@vger.kernel.org Link: http://lore.kernel.org/linux-trace-devel/20190808113721.13539-1-tz.stoyanov@gmail.com Link: http://lore.kernel.org/lkml/20190919212542.058025937@goodmis.org Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index d438ee44289f..b77837f75a0d 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -441,6 +441,8 @@ int tep_register_print_string(struct tep_handle *tep, const char *fmt, unsigned long long addr); bool tep_is_pid_registered(struct tep_handle *tep, int pid); +struct tep_event *tep_get_event(struct tep_handle *tep, int index); + #define TEP_PRINT_INFO "INFO" #define TEP_PRINT_INFO_RAW "INFO_RAW" #define TEP_PRINT_COMM "COMM" From 077faf3dc7cc13c3bd784613304bf38696b591da Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Sep 2019 17:23:41 -0400 Subject: [PATCH 1130/2880] libtraceevent: Move traceevent plugins in its own subdirectory All traceevent plugins code is moved to tools/lib/traceevent/plugins subdirectory. It makes traceevent implementation in trace-cmd and in kernel tree consistent. There is no changes in the way libtraceevent and plugins are compiled and installed. Committer notes: Applied fixup provided by Steven, fixing the tools/perf/Makefile.perf target for the plugin dynamic list file. Problem noticed when cross building to aarch64 from a Ubuntu 19.04 container. Suggested-by: Steven Rostedt (VMware) Signed-off-by: Tzvetomir Stoyanov (VMware) Cc: Andrew Morton Cc: Jiri Olsa Cc: Namhyung Kim Cc: Tzvetomir Stoyanov (VMware) Cc: linux-trace-devel@vger.kernel.org Link: http://lore.kernel.org/lkml/20190923115929.453b68f1@oasis.local.home Link: http://lore.kernel.org/lkml/20190919212542.377333393@goodmis.org Link: http://lore.kernel.org/linux-trace-devel/20190917105055.18983-1-tz.stoyanov@gmail.com Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/Build | 11 - tools/lib/traceevent/Makefile | 94 ++------ tools/lib/traceevent/plugins/Build | 10 + tools/lib/traceevent/plugins/Makefile | 222 ++++++++++++++++++ .../{ => plugins}/plugin_cfg80211.c | 0 .../{ => plugins}/plugin_function.c | 0 .../traceevent/{ => plugins}/plugin_hrtimer.c | 0 .../traceevent/{ => plugins}/plugin_jbd2.c | 0 .../traceevent/{ => plugins}/plugin_kmem.c | 0 .../lib/traceevent/{ => plugins}/plugin_kvm.c | 0 .../{ => plugins}/plugin_mac80211.c | 0 .../{ => plugins}/plugin_sched_switch.c | 0 .../traceevent/{ => plugins}/plugin_scsi.c | 0 .../lib/traceevent/{ => plugins}/plugin_xen.c | 0 tools/perf/Makefile.perf | 4 +- 15 files changed, 250 insertions(+), 91 deletions(-) create mode 100644 tools/lib/traceevent/plugins/Build create mode 100644 tools/lib/traceevent/plugins/Makefile rename tools/lib/traceevent/{ => plugins}/plugin_cfg80211.c (100%) rename tools/lib/traceevent/{ => plugins}/plugin_function.c (100%) rename tools/lib/traceevent/{ => plugins}/plugin_hrtimer.c (100%) rename tools/lib/traceevent/{ => plugins}/plugin_jbd2.c (100%) rename tools/lib/traceevent/{ => plugins}/plugin_kmem.c (100%) rename tools/lib/traceevent/{ => plugins}/plugin_kvm.c (100%) rename tools/lib/traceevent/{ => plugins}/plugin_mac80211.c (100%) rename tools/lib/traceevent/{ => plugins}/plugin_sched_switch.c (100%) rename tools/lib/traceevent/{ => plugins}/plugin_scsi.c (100%) rename tools/lib/traceevent/{ => plugins}/plugin_xen.c (100%) diff --git a/tools/lib/traceevent/Build b/tools/lib/traceevent/Build index ba54bfce0b0b..f9a5d79578f5 100644 --- a/tools/lib/traceevent/Build +++ b/tools/lib/traceevent/Build @@ -6,14 +6,3 @@ libtraceevent-y += parse-utils.o libtraceevent-y += kbuffer-parse.o libtraceevent-y += tep_strerror.o libtraceevent-y += event-parse-api.o - -plugin_jbd2-y += plugin_jbd2.o -plugin_hrtimer-y += plugin_hrtimer.o -plugin_kmem-y += plugin_kmem.o -plugin_kvm-y += plugin_kvm.o -plugin_mac80211-y += plugin_mac80211.o -plugin_sched_switch-y += plugin_sched_switch.o -plugin_function-y += plugin_function.o -plugin_xen-y += plugin_xen.o -plugin_scsi-y += plugin_scsi.o -plugin_cfg80211-y += plugin_cfg80211.o diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile index a39cdd0d890d..5315f3787f8d 100644 --- a/tools/lib/traceevent/Makefile +++ b/tools/lib/traceevent/Makefile @@ -58,30 +58,6 @@ export man_dir man_dir_SQ INSTALL export DESTDIR DESTDIR_SQ export EVENT_PARSE_VERSION -set_plugin_dir := 1 - -# Set plugin_dir to preffered global plugin location -# If we install under $HOME directory we go under -# $(HOME)/.local/lib/traceevent/plugins -# -# We dont set PLUGIN_DIR in case we install under $HOME -# directory, because by default the code looks under: -# $(HOME)/.local/lib/traceevent/plugins by default. -# -ifeq ($(plugin_dir),) -ifeq ($(prefix),$(HOME)) -override plugin_dir = $(HOME)/.local/lib/traceevent/plugins -set_plugin_dir := 0 -else -override plugin_dir = $(libdir)/traceevent/plugins -endif -endif - -ifeq ($(set_plugin_dir),1) -PLUGIN_DIR = -DPLUGIN_DIR="$(plugin_dir)" -PLUGIN_DIR_SQ = '$(subst ','\'',$(PLUGIN_DIR))' -endif - include ../../scripts/Makefile.include # copy a bit from Linux kbuild @@ -105,7 +81,6 @@ export prefix libdir src obj # Shell quotes libdir_SQ = $(subst ','\'',$(libdir)) libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) -plugin_dir_SQ = $(subst ','\'',$(plugin_dir)) CONFIG_INCLUDES = CONFIG_LIBS = @@ -151,29 +126,14 @@ MAKEOVERRIDES= export srctree OUTPUT CC LD CFLAGS V build := -f $(srctree)/tools/build/Makefile.build dir=. obj -PLUGINS = plugin_jbd2.so -PLUGINS += plugin_hrtimer.so -PLUGINS += plugin_kmem.so -PLUGINS += plugin_kvm.so -PLUGINS += plugin_mac80211.so -PLUGINS += plugin_sched_switch.so -PLUGINS += plugin_function.so -PLUGINS += plugin_xen.so -PLUGINS += plugin_scsi.so -PLUGINS += plugin_cfg80211.so - -PLUGINS := $(addprefix $(OUTPUT),$(PLUGINS)) -PLUGINS_IN := $(PLUGINS:.so=-in.o) - TE_IN := $(OUTPUT)libtraceevent-in.o LIB_TARGET := $(addprefix $(OUTPUT),$(LIB_TARGET)) -DYNAMIC_LIST_FILE := $(OUTPUT)libtraceevent-dynamic-list -CMD_TARGETS = $(LIB_TARGET) $(PLUGINS) $(DYNAMIC_LIST_FILE) +CMD_TARGETS = $(LIB_TARGET) TARGETS = $(CMD_TARGETS) -all: all_cmd +all: all_cmd plugins all_cmd: $(CMD_TARGETS) @@ -188,17 +148,6 @@ $(OUTPUT)libtraceevent.so.$(EVENT_PARSE_VERSION): $(TE_IN) $(OUTPUT)libtraceevent.a: $(TE_IN) $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^ -$(OUTPUT)libtraceevent-dynamic-list: $(PLUGINS) - $(QUIET_GEN)$(call do_generate_dynamic_list_file, $(PLUGINS), $@) - -plugins: $(PLUGINS) - -__plugin_obj = $(notdir $@) - plugin_obj = $(__plugin_obj:-in.o=) - -$(PLUGINS_IN): force - $(Q)$(MAKE) $(build)=$(plugin_obj) - $(OUTPUT)%.so: $(OUTPUT)%-in.o $(QUIET_LINK)$(CC) $(CFLAGS) -shared $(LDFLAGS) -nostartfiles -o $@ $^ @@ -258,25 +207,6 @@ define do_install $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2' endef -define do_install_plugins - for plugin in $1; do \ - $(call do_install,$$plugin,$(plugin_dir_SQ)); \ - done -endef - -define do_generate_dynamic_list_file - symbol_type=`$(NM) -u -D $1 | awk 'NF>1 {print $$1}' | \ - xargs echo "U w W" | tr 'w ' 'W\n' | sort -u | xargs echo`;\ - if [ "$$symbol_type" = "U W" ];then \ - (echo '{'; \ - $(NM) -u -D $1 | awk 'NF>1 {print "\t"$$2";"}' | sort -u;\ - echo '};'; \ - ) > $2; \ - else \ - (echo Either missing one of [$1] or bad version of $(NM)) 1>&2;\ - fi -endef - PKG_CONFIG_FILE = libtraceevent.pc define do_install_pkgconfig_file if [ -n "${pkgconfig_dir}" ]; then \ @@ -296,10 +226,6 @@ install_lib: all_cmd install_plugins install_headers install_pkgconfig $(call do_install_mkdir,$(libdir_SQ)); \ cp -fpR $(LIB_INSTALL) $(DESTDIR)$(libdir_SQ) -install_plugins: $(PLUGINS) - $(call QUIET_INSTALL, trace_plugins) \ - $(call do_install_plugins, $(PLUGINS)) - install_pkgconfig: $(call QUIET_INSTALL, $(PKG_CONFIG_FILE)) \ $(call do_install_pkgconfig_file,$(prefix)) @@ -313,7 +239,7 @@ install_headers: install: install_lib -clean: +clean: clean_plugins $(call QUIET_CLEAN, libtraceevent) \ $(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d .*.cmd; \ $(RM) TRACEEVENT-CFLAGS tags TAGS; \ @@ -351,7 +277,19 @@ help: @echo ' doc-install - install the man pages' @echo ' doc-uninstall - uninstall the man pages' @echo'' -PHONY += force plugins + +PHONY += plugins +plugins: + $(call descend,plugins) + +PHONY += install_plugins +install_plugins: + $(call descend,plugins,install) + +PHONY += clean_plugins +clean_plugins: + $(call descend,plugins,clean) + force: # Declare the contents of the .PHONY variable as phony. We keep that diff --git a/tools/lib/traceevent/plugins/Build b/tools/lib/traceevent/plugins/Build new file mode 100644 index 000000000000..210d26910613 --- /dev/null +++ b/tools/lib/traceevent/plugins/Build @@ -0,0 +1,10 @@ +plugin_jbd2-y += plugin_jbd2.o +plugin_hrtimer-y += plugin_hrtimer.o +plugin_kmem-y += plugin_kmem.o +plugin_kvm-y += plugin_kvm.o +plugin_mac80211-y += plugin_mac80211.o +plugin_sched_switch-y += plugin_sched_switch.o +plugin_function-y += plugin_function.o +plugin_xen-y += plugin_xen.o +plugin_scsi-y += plugin_scsi.o +plugin_cfg80211-y += plugin_cfg80211.o diff --git a/tools/lib/traceevent/plugins/Makefile b/tools/lib/traceevent/plugins/Makefile new file mode 100644 index 000000000000..f440989fa55e --- /dev/null +++ b/tools/lib/traceevent/plugins/Makefile @@ -0,0 +1,222 @@ +# SPDX-License-Identifier: GPL-2.0 + +#MAKEFLAGS += --no-print-directory + + +# Makefiles suck: This macro sets a default value of $(2) for the +# variable named by $(1), unless the variable has been set by +# environment or command line. This is necessary for CC and AR +# because make sets default values, so the simpler ?= approach +# won't work as expected. +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix. +$(call allow-override,CC,$(CROSS_COMPILE)gcc) +$(call allow-override,AR,$(CROSS_COMPILE)ar) +$(call allow-override,NM,$(CROSS_COMPILE)nm) +$(call allow-override,PKG_CONFIG,pkg-config) + +EXT = -std=gnu99 +INSTALL = install + +# Use DESTDIR for installing into a different root directory. +# This is useful for building a package. The program will be +# installed in this directory as if it was the root directory. +# Then the build tool can move it later. +DESTDIR ?= +DESTDIR_SQ = '$(subst ','\'',$(DESTDIR))' + +LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1) +ifeq ($(LP64), 1) + libdir_relative = lib64 +else + libdir_relative = lib +endif + +prefix ?= /usr/local +libdir = $(prefix)/$(libdir_relative) + +set_plugin_dir := 1 + +# Set plugin_dir to preffered global plugin location +# If we install under $HOME directory we go under +# $(HOME)/.local/lib/traceevent/plugins +# +# We dont set PLUGIN_DIR in case we install under $HOME +# directory, because by default the code looks under: +# $(HOME)/.local/lib/traceevent/plugins by default. +# +ifeq ($(plugin_dir),) +ifeq ($(prefix),$(HOME)) +override plugin_dir = $(HOME)/.local/lib/traceevent/plugins +set_plugin_dir := 0 +else +override plugin_dir = $(libdir)/traceevent/plugins +endif +endif + +ifeq ($(set_plugin_dir),1) +PLUGIN_DIR = -DPLUGIN_DIR="$(plugin_dir)" +PLUGIN_DIR_SQ = '$(subst ','\'',$(PLUGIN_DIR))' +endif + +include ../../../scripts/Makefile.include + +# copy a bit from Linux kbuild + +ifeq ("$(origin V)", "command line") + VERBOSE = $(V) +endif +ifndef VERBOSE + VERBOSE = 0 +endif + +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +#$(info Determined 'srctree' to be $(srctree)) +endif + +export prefix libdir src obj + +# Shell quotes +plugin_dir_SQ = $(subst ','\'',$(plugin_dir)) + +CONFIG_INCLUDES = +CONFIG_LIBS = +CONFIG_FLAGS = + +OBJ = $@ +N = + +INCLUDES = -I. -I.. -I $(srctree)/tools/include $(CONFIG_INCLUDES) + +# Set compile option CFLAGS +ifdef EXTRA_CFLAGS + CFLAGS := $(EXTRA_CFLAGS) +else + CFLAGS := -g -Wall +endif + +# Append required CFLAGS +override CFLAGS += -fPIC +override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ) +override CFLAGS += $(udis86-flags) -D_GNU_SOURCE + +ifeq ($(VERBOSE),1) + Q = +else + Q = @ +endif + +# Disable command line variables (CFLAGS) override from top +# level Makefile (perf), otherwise build Makefile will get +# the same command line setup. +MAKEOVERRIDES= + +export srctree OUTPUT CC LD CFLAGS V + +build := -f $(srctree)/tools/build/Makefile.build dir=. obj + +DYNAMIC_LIST_FILE := $(OUTPUT)libtraceevent-dynamic-list + +PLUGINS = plugin_jbd2.so +PLUGINS += plugin_hrtimer.so +PLUGINS += plugin_kmem.so +PLUGINS += plugin_kvm.so +PLUGINS += plugin_mac80211.so +PLUGINS += plugin_sched_switch.so +PLUGINS += plugin_function.so +PLUGINS += plugin_xen.so +PLUGINS += plugin_scsi.so +PLUGINS += plugin_cfg80211.so + +PLUGINS := $(addprefix $(OUTPUT),$(PLUGINS)) +PLUGINS_IN := $(PLUGINS:.so=-in.o) + +plugins: $(PLUGINS) $(DYNAMIC_LIST_FILE) + +__plugin_obj = $(notdir $@) + plugin_obj = $(__plugin_obj:-in.o=) + +$(PLUGINS_IN): force + $(Q)$(MAKE) $(build)=$(plugin_obj) + +$(OUTPUT)libtraceevent-dynamic-list: $(PLUGINS) + $(QUIET_GEN)$(call do_generate_dynamic_list_file, $(PLUGINS), $@) + +$(OUTPUT)%.so: $(OUTPUT)%-in.o + $(QUIET_LINK)$(CC) $(CFLAGS) -shared $(LDFLAGS) -nostartfiles -o $@ $^ + +define update_dir + (echo $1 > $@.tmp; \ + if [ -r $@ ] && cmp -s $@ $@.tmp; then \ + rm -f $@.tmp; \ + else \ + echo ' UPDATE $@'; \ + mv -f $@.tmp $@; \ + fi); +endef + +tags: force + $(RM) tags + find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \ + --regex-c++='/_PE\(([^,)]*).*/TEP_ERRNO__\1/' + +TAGS: force + $(RM) TAGS + find . -name '*.[ch]' | xargs etags \ + --regex='/_PE(\([^,)]*\).*/TEP_ERRNO__\1/' + +define do_install_mkdir + if [ ! -d '$(DESTDIR_SQ)$1' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$1'; \ + fi +endef + +define do_install + $(call do_install_mkdir,$2); \ + $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2' +endef + +define do_install_plugins + for plugin in $1; do \ + $(call do_install,$$plugin,$(plugin_dir_SQ)); \ + done +endef + +define do_generate_dynamic_list_file + symbol_type=`$(NM) -u -D $1 | awk 'NF>1 {print $$1}' | \ + xargs echo "U w W" | tr 'w ' 'W\n' | sort -u | xargs echo`;\ + if [ "$$symbol_type" = "U W" ];then \ + (echo '{'; \ + $(NM) -u -D $1 | awk 'NF>1 {print "\t"$$2";"}' | sort -u;\ + echo '};'; \ + ) > $2; \ + else \ + (echo Either missing one of [$1] or bad version of $(NM)) 1>&2;\ + fi +endef + +install: $(PLUGINS) + $(call QUIET_INSTALL, trace_plugins) \ + $(call do_install_plugins, $(PLUGINS)) + +clean: + $(call QUIET_CLEAN, trace_plugins) \ + $(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d .*.cmd; \ + $(RM) $(OUTPUT)libtraceevent-dynamic-list \ + $(RM) TRACEEVENT-CFLAGS tags TAGS; + +PHONY += force plugins +force: + +# Declare the contents of the .PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. +.PHONY: $(PHONY) diff --git a/tools/lib/traceevent/plugin_cfg80211.c b/tools/lib/traceevent/plugins/plugin_cfg80211.c similarity index 100% rename from tools/lib/traceevent/plugin_cfg80211.c rename to tools/lib/traceevent/plugins/plugin_cfg80211.c diff --git a/tools/lib/traceevent/plugin_function.c b/tools/lib/traceevent/plugins/plugin_function.c similarity index 100% rename from tools/lib/traceevent/plugin_function.c rename to tools/lib/traceevent/plugins/plugin_function.c diff --git a/tools/lib/traceevent/plugin_hrtimer.c b/tools/lib/traceevent/plugins/plugin_hrtimer.c similarity index 100% rename from tools/lib/traceevent/plugin_hrtimer.c rename to tools/lib/traceevent/plugins/plugin_hrtimer.c diff --git a/tools/lib/traceevent/plugin_jbd2.c b/tools/lib/traceevent/plugins/plugin_jbd2.c similarity index 100% rename from tools/lib/traceevent/plugin_jbd2.c rename to tools/lib/traceevent/plugins/plugin_jbd2.c diff --git a/tools/lib/traceevent/plugin_kmem.c b/tools/lib/traceevent/plugins/plugin_kmem.c similarity index 100% rename from tools/lib/traceevent/plugin_kmem.c rename to tools/lib/traceevent/plugins/plugin_kmem.c diff --git a/tools/lib/traceevent/plugin_kvm.c b/tools/lib/traceevent/plugins/plugin_kvm.c similarity index 100% rename from tools/lib/traceevent/plugin_kvm.c rename to tools/lib/traceevent/plugins/plugin_kvm.c diff --git a/tools/lib/traceevent/plugin_mac80211.c b/tools/lib/traceevent/plugins/plugin_mac80211.c similarity index 100% rename from tools/lib/traceevent/plugin_mac80211.c rename to tools/lib/traceevent/plugins/plugin_mac80211.c diff --git a/tools/lib/traceevent/plugin_sched_switch.c b/tools/lib/traceevent/plugins/plugin_sched_switch.c similarity index 100% rename from tools/lib/traceevent/plugin_sched_switch.c rename to tools/lib/traceevent/plugins/plugin_sched_switch.c diff --git a/tools/lib/traceevent/plugin_scsi.c b/tools/lib/traceevent/plugins/plugin_scsi.c similarity index 100% rename from tools/lib/traceevent/plugin_scsi.c rename to tools/lib/traceevent/plugins/plugin_scsi.c diff --git a/tools/lib/traceevent/plugin_xen.c b/tools/lib/traceevent/plugins/plugin_xen.c similarity index 100% rename from tools/lib/traceevent/plugin_xen.c rename to tools/lib/traceevent/plugins/plugin_xen.c diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 2ccc12f3730b..902c792f326a 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -292,7 +292,7 @@ endif LIBTRACEEVENT = $(TE_PATH)libtraceevent.a export LIBTRACEEVENT -LIBTRACEEVENT_DYNAMIC_LIST = $(TE_PATH)libtraceevent-dynamic-list +LIBTRACEEVENT_DYNAMIC_LIST = $(TE_PATH)plugins/libtraceevent-dynamic-list # # The static build has no dynsym table, so this does not work for @@ -737,7 +737,7 @@ libtraceevent_plugins: FORCE $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) plugins $(LIBTRACEEVENT_DYNAMIC_LIST): libtraceevent_plugins - $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent-dynamic-list + $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)plugins/libtraceevent-dynamic-list $(LIBTRACEEVENT)-clean: $(call QUIET_CLEAN, libtraceevent) From 33c96400dcd34d572299b8106e1b0460a1b8bae9 Mon Sep 17 00:00:00 2001 From: Tzvetomir Stoyanov Date: Thu, 19 Sep 2019 17:23:40 -0400 Subject: [PATCH 1131/2880] libtraceevent: Man pages for tep plugins APIs Create man pages for libtraceevent APIs: tep_load_plugins(), tep_unload_plugin() Signed-off-by: Tzvetomir Stoyanov Cc: Andrew Morton Cc: Jiri Olsa Cc: Namhyung Kim Cc: linux-trace-devel@vger.kernel.org Link: http://lore.kernel.org/linux-trace-devel/20190903133434.30417-1-tz.stoyanov@gmail.com Link: http://lore.kernel.org/lkml/20190919212542.216189588@goodmis.org Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Arnaldo Carvalho de Melo --- .../Documentation/libtraceevent-plugins.txt | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 tools/lib/traceevent/Documentation/libtraceevent-plugins.txt diff --git a/tools/lib/traceevent/Documentation/libtraceevent-plugins.txt b/tools/lib/traceevent/Documentation/libtraceevent-plugins.txt new file mode 100644 index 000000000000..596032ade31f --- /dev/null +++ b/tools/lib/traceevent/Documentation/libtraceevent-plugins.txt @@ -0,0 +1,99 @@ +libtraceevent(3) +================ + +NAME +---- +tep_load_plugins, tep_unload_plugins - Load / unload traceevent plugins. + +SYNOPSIS +-------- +[verse] +-- +*#include * + +struct tep_plugin_list pass:[*]*tep_load_plugins*(struct tep_handle pass:[*]_tep_); +void *tep_unload_plugins*(struct tep_plugin_list pass:[*]_plugin_list_, struct tep_handle pass:[*]_tep_); +-- + +DESCRIPTION +----------- +The _tep_load_plugins()_ function loads all plugins, located in the plugin +directories. The _tep_ argument is trace event parser context. +The plugin directories are : +[verse] +-- + - System's plugin directory, defined at the library compile time. It + depends on the library installation prefix and usually is + _(install_preffix)/lib/traceevent/plugins_ + - Directory, defined by the environment variable _TRACEEVENT_PLUGIN_DIR_ + - User's plugin directory, located at _~/.local/lib/traceevent/plugins_ +-- +Loading of plugins can be controlled by the _tep_flags_, using the +_tep_set_flag()_ API: +[verse] +-- + _TEP_DISABLE_SYS_PLUGINS_ - do not load plugins, located in + the system's plugin directory. + _TEP_DISABLE_PLUGINS_ - do not load any plugins. +-- +The _tep_set_flag()_ API needs to be called before _tep_load_plugins()_, if +loading of all plugins is not the desired case. + +The _tep_unload_plugins()_ function unloads the plugins, previously loaded by +_tep_load_plugins()_. The _tep_ argument is trace event parser context. The +_plugin_list_ is the list of loaded plugins, returned by +the _tep_load_plugins()_ function. + +RETURN VALUE +------------ +The _tep_load_plugins()_ function returns a list of successfully loaded plugins, +or NULL in case no plugins are loaded. + +EXAMPLE +------- +[source,c] +-- +#include +... +struct tep_handle *tep = tep_alloc(); +... +struct tep_plugin_list *plugins = tep_load_plugins(tep); +if (plugins == NULL) { + /* no plugins are loaded */ +} +... +tep_unload_plugins(plugins, tep); +-- + +FILES +----- +[verse] +-- +*event-parse.h* + Header file to include in order to have access to the library APIs. +*-ltraceevent* + Linker switch to add when building a program that uses the library. +-- + +SEE ALSO +-------- +_libtraceevent(3)_, _trace-cmd(1)_, _tep_set_flag(3)_ + +AUTHOR +------ +[verse] +-- +*Steven Rostedt* , author of *libtraceevent*. +*Tzvetomir Stoyanov* , author of this man page. +-- +REPORTING BUGS +-------------- +Report bugs to + +LICENSE +------- +libtraceevent is Free Software licensed under the GNU LGPL 2.1 + +RESOURCES +--------- +https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git From 058bd857845a9675cfaf4bc2ca831ec7953b58f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Sep 2019 10:57:18 +0200 Subject: [PATCH 1132/2880] tools: Add missing stdio.h include to asm/bug.h header We have a direct fprintf() call in the header, so we need stdio.h include, otherwise it could fail compilation if there's no prior stdio.h include directive. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/n/tip-8hvjgh24olfsa4non0a3ohnq@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/asm/bug.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/asm/bug.h b/tools/include/asm/bug.h index bbd75ac8b202..550223f0a6e6 100644 --- a/tools/include/asm/bug.h +++ b/tools/include/asm/bug.h @@ -3,6 +3,7 @@ #define _TOOLS_ASM_BUG_H #include +#include #define __WARN_printf(arg...) do { fprintf(stderr, arg); } while (0) From a583053299c1e66e6202b494cbc3acd93cedc4cc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Jul 2019 20:30:53 +0200 Subject: [PATCH 1133/2880] perf tools: Rename 'struct perf_mmap' to 'struct mmap' Rename 'struct perf_evlist' to 'struct evlist', so we don't have a name clash when we add 'struct perf_mmap' to libperf. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-record.c | 34 ++++++------ tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/tests/backward-ring-buffer.c | 2 +- tools/perf/tests/bpf.c | 2 +- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/mmap-basic.c | 2 +- tools/perf/tests/openat-syscall-tp-fields.c | 2 +- tools/perf/tests/perf-record.c | 2 +- tools/perf/tests/sw-clock.c | 2 +- tools/perf/tests/switch-tracking.c | 2 +- tools/perf/tests/task-exit.c | 2 +- tools/perf/util/auxtrace.c | 6 +-- tools/perf/util/auxtrace.h | 8 +-- tools/perf/util/evlist.c | 14 ++--- tools/perf/util/evlist.h | 4 +- tools/perf/util/mmap.c | 54 ++++++++++---------- tools/perf/util/mmap.h | 32 ++++++------ tools/perf/util/python.c | 6 +-- 22 files changed, 93 insertions(+), 93 deletions(-) diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 0a4570b340fa..c83abb1ed0ff 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -65,7 +65,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe union perf_event *event; u64 test_tsc, comm1_tsc, comm2_tsc; u64 test_time, comm1_time = 0, comm2_time = 0; - struct perf_mmap *md; + struct mmap *md; threads = thread_map__new(-1, getpid(), UINT_MAX); CHECK_NOT_NULL__(threads); diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index c4b22e1b0a40..7354b77e9137 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -750,7 +750,7 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx, { struct evlist *evlist = kvm->evlist; union perf_event *event; - struct perf_mmap *md; + struct mmap *md; u64 timestamp; s64 n = 0; int err; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1e1f97139f16..1528fb686f96 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -119,7 +119,7 @@ static bool switch_output_time(struct record *rec) trigger_is_ready(&switch_output_trigger); } -static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused, +static int record__write(struct record *rec, struct mmap *map __maybe_unused, void *bf, size_t size) { struct perf_data_file *file = &rec->session->data->file; @@ -168,7 +168,7 @@ static int record__aio_write(struct aiocb *cblock, int trace_fd, return rc; } -static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock) +static int record__aio_complete(struct mmap *md, struct aiocb *cblock) { void *rem_buf; off_t rem_off; @@ -214,7 +214,7 @@ static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock) return rc; } -static int record__aio_sync(struct perf_mmap *md, bool sync_all) +static int record__aio_sync(struct mmap *md, bool sync_all) { struct aiocb **aiocb = md->aio.aiocb; struct aiocb *cblocks = md->aio.cblocks; @@ -255,7 +255,7 @@ struct record_aio { size_t size; }; -static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size) +static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size) { struct record_aio *aio = to; @@ -300,7 +300,7 @@ static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t return size; } -static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off) +static int record__aio_push(struct record *rec, struct mmap *map, off_t *off) { int ret, idx; int trace_fd = rec->session->data->file.fd; @@ -351,13 +351,13 @@ static void record__aio_mmap_read_sync(struct record *rec) { int i; struct evlist *evlist = rec->evlist; - struct perf_mmap *maps = evlist->mmap; + struct mmap *maps = evlist->mmap; if (!record__aio_enabled(rec)) return; for (i = 0; i < evlist->nr_mmaps; i++) { - struct perf_mmap *map = &maps[i]; + struct mmap *map = &maps[i]; if (map->base) record__aio_sync(map, true); @@ -387,7 +387,7 @@ static int record__aio_parse(const struct option *opt, #else /* HAVE_AIO_SUPPORT */ static int nr_cblocks_max = 0; -static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused, +static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused, off_t *off __maybe_unused) { return -1; @@ -482,7 +482,7 @@ static int process_synthesized_event(struct perf_tool *tool, return record__write(rec, NULL, event, event->header.size); } -static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size) +static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size) { struct record *rec = to; @@ -527,7 +527,7 @@ static void record__sig_exit(void) #ifdef HAVE_AUXTRACE_SUPPORT static int record__process_auxtrace(struct perf_tool *tool, - struct perf_mmap *map, + struct mmap *map, union perf_event *event, void *data1, size_t len1, void *data2, size_t len2) { @@ -565,7 +565,7 @@ static int record__process_auxtrace(struct perf_tool *tool, } static int record__auxtrace_mmap_read(struct record *rec, - struct perf_mmap *map) + struct mmap *map) { int ret; @@ -581,7 +581,7 @@ static int record__auxtrace_mmap_read(struct record *rec, } static int record__auxtrace_mmap_read_snapshot(struct record *rec, - struct perf_mmap *map) + struct mmap *map) { int ret; @@ -603,7 +603,7 @@ static int record__auxtrace_read_snapshot_all(struct record *rec) int rc = 0; for (i = 0; i < rec->evlist->nr_mmaps; i++) { - struct perf_mmap *map = &rec->evlist->mmap[i]; + struct mmap *map = &rec->evlist->mmap[i]; if (!map->auxtrace_mmap.base) continue; @@ -668,7 +668,7 @@ static int record__auxtrace_init(struct record *rec) static inline int record__auxtrace_mmap_read(struct record *rec __maybe_unused, - struct perf_mmap *map __maybe_unused) + struct mmap *map __maybe_unused) { return 0; } @@ -901,7 +901,7 @@ static struct perf_event_header finished_round_event = { .type = PERF_RECORD_FINISHED_ROUND, }; -static void record__adjust_affinity(struct record *rec, struct perf_mmap *map) +static void record__adjust_affinity(struct record *rec, struct mmap *map) { if (rec->opts.affinity != PERF_AFFINITY_SYS && !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) { @@ -948,7 +948,7 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist, u64 bytes_written = rec->bytes_written; int i; int rc = 0; - struct perf_mmap *maps; + struct mmap *maps; int trace_fd = rec->data.file.fd; off_t off = 0; @@ -967,7 +967,7 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist, for (i = 0; i < evlist->nr_mmaps; i++) { u64 flush = 0; - struct perf_mmap *map = &maps[i]; + struct mmap *map = &maps[i]; if (map->base) { record__adjust_affinity(rec, map); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 8da3c939e6b0..834a927107c4 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -863,7 +863,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) { struct record_opts *opts = &top->record_opts; struct evlist *evlist = top->evlist; - struct perf_mmap *md; + struct mmap *md; union perf_event *event; md = opts->overwrite ? &evlist->overwrite_mmap[idx] : &evlist->mmap[idx]; diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index a292658b4232..2f853870d68d 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3444,7 +3444,7 @@ again: for (i = 0; i < evlist->nr_mmaps; i++) { union perf_event *event; - struct perf_mmap *md; + struct mmap *md; md = &evlist->mmap[i]; if (perf_mmap__read_init(md) < 0) diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index a637a4a90760..0a046096e6f4 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -33,7 +33,7 @@ static int count_samples(struct evlist *evlist, int *sample_count, int i; for (i = 0; i < evlist->nr_mmaps; i++) { - struct perf_mmap *map = &evlist->overwrite_mmap[i]; + struct mmap *map = &evlist->overwrite_mmap[i]; union perf_event *event; perf_mmap__read_init(map); diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index fc102e4f403e..cf9776aceb82 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -180,7 +180,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void), for (i = 0; i < evlist->nr_mmaps; i++) { union perf_event *event; - struct perf_mmap *md; + struct mmap *md; md = &evlist->mmap[i]; if (perf_mmap__read_init(md) < 0) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index fd02c1f1d976..df96cbb9ffe5 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -419,7 +419,7 @@ static int process_events(struct machine *machine, struct evlist *evlist, struct state *state) { union perf_event *event; - struct perf_mmap *md; + struct mmap *md; int i, ret; for (i = 0; i < evlist->nr_mmaps; i++) { diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index df0fd5a44e04..67a87f507bbd 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -31,7 +31,7 @@ static int find_comm(struct evlist *evlist, const char *comm) { union perf_event *event; - struct perf_mmap *md; + struct mmap *md; int i, found; found = 0; diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 042757629e90..accca3d78fa3 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -42,7 +42,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse expected_nr_events[nsyscalls], i, j; struct evsel *evsels[nsyscalls], *evsel; char sbuf[STRERR_BUFSIZE]; - struct perf_mmap *md; + struct mmap *md; threads = thread_map__new(-1, getpid(), UINT_MAX); if (threads == NULL) { diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index b71167b43dda..e5ef74d4e925 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -88,7 +88,7 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest for (i = 0; i < evlist->nr_mmaps; i++) { union perf_event *event; - struct perf_mmap *md; + struct mmap *md; md = &evlist->mmap[i]; if (perf_mmap__read_init(md) < 0) diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index e1b42292cf7f..49d2d4c5956d 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -166,7 +166,7 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus for (i = 0; i < evlist->nr_mmaps; i++) { union perf_event *event; - struct perf_mmap *md; + struct mmap *md; md = &evlist->mmap[i]; if (perf_mmap__read_init(md) < 0) diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index 97694a040986..7abe0b6aabb7 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -42,7 +42,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) }; struct perf_cpu_map *cpus; struct perf_thread_map *threads; - struct perf_mmap *md; + struct mmap *md; attr.sample_freq = 500; diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 3fb1ff7b8a2f..e7888ddd69a3 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -263,7 +263,7 @@ static int process_events(struct evlist *evlist, unsigned pos, cnt = 0; LIST_HEAD(events); struct event_node *events_array, *node; - struct perf_mmap *md; + struct mmap *md; int i, ret; for (i = 0; i < evlist->nr_mmaps; i++) { diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index 088c7708b03a..cff99707cc9d 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -51,7 +51,7 @@ int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused char sbuf[STRERR_BUFSIZE]; struct perf_cpu_map *cpus; struct perf_thread_map *threads; - struct perf_mmap *md; + struct mmap *md; signal(SIGCHLD, sig_handler); diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 0e8c89cf7cad..1d22ffd972ac 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1228,7 +1228,7 @@ int perf_event__process_auxtrace_error(struct perf_session *session, return 0; } -static int __auxtrace_mmap__read(struct perf_mmap *map, +static int __auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn, bool snapshot, size_t snapshot_size) @@ -1339,13 +1339,13 @@ static int __auxtrace_mmap__read(struct perf_mmap *map, return 1; } -int auxtrace_mmap__read(struct perf_mmap *map, struct auxtrace_record *itr, +int auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn) { return __auxtrace_mmap__read(map, itr, tool, fn, false, 0); } -int auxtrace_mmap__read_snapshot(struct perf_mmap *map, +int auxtrace_mmap__read_snapshot(struct mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size) diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index b110aec1da4d..f201f36bc35f 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -22,7 +22,7 @@ union perf_event; struct perf_session; struct evlist; struct perf_tool; -struct perf_mmap; +struct mmap; struct perf_sample; struct option; struct record_opts; @@ -445,14 +445,14 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, bool per_cpu); typedef int (*process_auxtrace_t)(struct perf_tool *tool, - struct perf_mmap *map, + struct mmap *map, union perf_event *event, void *data1, size_t len1, void *data2, size_t len2); -int auxtrace_mmap__read(struct perf_mmap *map, struct auxtrace_record *itr, +int auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn); -int auxtrace_mmap__read_snapshot(struct perf_mmap *map, +int auxtrace_mmap__read_snapshot(struct mmap *map, struct auxtrace_record *itr, struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ea9517d506d8..ceab9fb7f7f9 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -423,7 +423,7 @@ int perf_evlist__alloc_pollfd(struct evlist *evlist) } static int __perf_evlist__add_pollfd(struct evlist *evlist, int fd, - struct perf_mmap *map, short revent) + struct mmap *map, short revent) { int pos = fdarray__add(&evlist->pollfd, fd, revent | POLLERR | POLLHUP); /* @@ -447,7 +447,7 @@ int perf_evlist__add_pollfd(struct evlist *evlist, int fd) static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd, void *arg __maybe_unused) { - struct perf_mmap *map = fda->priv[fd].ptr; + struct mmap *map = fda->priv[fd].ptr; if (map) perf_mmap__put(map); @@ -693,16 +693,16 @@ void perf_evlist__munmap(struct evlist *evlist) zfree(&evlist->overwrite_mmap); } -static struct perf_mmap *perf_evlist__alloc_mmap(struct evlist *evlist, +static struct mmap *perf_evlist__alloc_mmap(struct evlist *evlist, bool overwrite) { int i; - struct perf_mmap *map; + struct mmap *map; evlist->nr_mmaps = perf_cpu_map__nr(evlist->core.cpus); if (perf_cpu_map__empty(evlist->core.cpus)) evlist->nr_mmaps = perf_thread_map__nr(evlist->core.threads); - map = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap)); + map = zalloc(evlist->nr_mmaps * sizeof(struct mmap)); if (!map) return NULL; @@ -741,7 +741,7 @@ static int perf_evlist__mmap_per_evsel(struct evlist *evlist, int idx, int evlist_cpu = cpu_map__cpu(evlist->core.cpus, cpu_idx); evlist__for_each_entry(evlist, evsel) { - struct perf_mmap *maps = evlist->mmap; + struct mmap *maps = evlist->mmap; int *output = _output; int fd; int cpu; @@ -1847,7 +1847,7 @@ static void *perf_evlist__poll_thread(void *arg) perf_evlist__poll(evlist, 1000); for (i = 0; i < evlist->nr_mmaps; i++) { - struct perf_mmap *map = &evlist->mmap[i]; + struct mmap *map = &evlist->mmap[i]; union perf_event *event; if (perf_mmap__read_init(map)) diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index a55f0f2546e5..129786361572 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -39,8 +39,8 @@ struct evlist { pid_t pid; } workload; struct fdarray pollfd; - struct perf_mmap *mmap; - struct perf_mmap *overwrite_mmap; + struct mmap *mmap; + struct mmap *overwrite_mmap; struct evsel *selected; struct events_stats stats; struct perf_env *env; diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 33c5b5495482..f3b7c8b0fa90 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -22,13 +22,13 @@ #include "../perf.h" #include "util.h" /* page_size */ -size_t perf_mmap__mmap_len(struct perf_mmap *map) +size_t perf_mmap__mmap_len(struct mmap *map) { return map->mask + 1 + page_size; } /* When check_messup is true, 'end' must points to a good entry */ -static union perf_event *perf_mmap__read(struct perf_mmap *map, +static union perf_event *perf_mmap__read(struct mmap *map, u64 *startp, u64 end) { unsigned char *data = map->base + page_size; @@ -82,7 +82,7 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map, * } * perf_mmap__read_done() */ -union perf_event *perf_mmap__read_event(struct perf_mmap *map) +union perf_event *perf_mmap__read_event(struct mmap *map) { union perf_event *event; @@ -104,17 +104,17 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map) return event; } -static bool perf_mmap__empty(struct perf_mmap *map) +static bool perf_mmap__empty(struct mmap *map) { return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base; } -void perf_mmap__get(struct perf_mmap *map) +void perf_mmap__get(struct mmap *map) { refcount_inc(&map->refcnt); } -void perf_mmap__put(struct perf_mmap *map) +void perf_mmap__put(struct mmap *map) { BUG_ON(map->base && refcount_read(&map->refcnt) == 0); @@ -122,7 +122,7 @@ void perf_mmap__put(struct perf_mmap *map) perf_mmap__munmap(map); } -void perf_mmap__consume(struct perf_mmap *map) +void perf_mmap__consume(struct mmap *map) { if (!map->overwrite) { u64 old = map->prev; @@ -161,13 +161,13 @@ void __weak auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp __mayb } #ifdef HAVE_AIO_SUPPORT -static int perf_mmap__aio_enabled(struct perf_mmap *map) +static int perf_mmap__aio_enabled(struct mmap *map) { return map->aio.nr_cblocks > 0; } #ifdef HAVE_LIBNUMA_SUPPORT -static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx) +static int perf_mmap__aio_alloc(struct mmap *map, int idx) { map->aio.data[idx] = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); @@ -179,7 +179,7 @@ static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx) return 0; } -static void perf_mmap__aio_free(struct perf_mmap *map, int idx) +static void perf_mmap__aio_free(struct mmap *map, int idx) { if (map->aio.data[idx]) { munmap(map->aio.data[idx], perf_mmap__mmap_len(map)); @@ -187,7 +187,7 @@ static void perf_mmap__aio_free(struct perf_mmap *map, int idx) } } -static int perf_mmap__aio_bind(struct perf_mmap *map, int idx, int cpu, int affinity) +static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity) { void *data; size_t mmap_len; @@ -207,7 +207,7 @@ static int perf_mmap__aio_bind(struct perf_mmap *map, int idx, int cpu, int affi return 0; } #else /* !HAVE_LIBNUMA_SUPPORT */ -static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx) +static int perf_mmap__aio_alloc(struct mmap *map, int idx) { map->aio.data[idx] = malloc(perf_mmap__mmap_len(map)); if (map->aio.data[idx] == NULL) @@ -216,19 +216,19 @@ static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx) return 0; } -static void perf_mmap__aio_free(struct perf_mmap *map, int idx) +static void perf_mmap__aio_free(struct mmap *map, int idx) { zfree(&(map->aio.data[idx])); } -static int perf_mmap__aio_bind(struct perf_mmap *map __maybe_unused, int idx __maybe_unused, +static int perf_mmap__aio_bind(struct mmap *map __maybe_unused, int idx __maybe_unused, int cpu __maybe_unused, int affinity __maybe_unused) { return 0; } #endif -static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp) +static int perf_mmap__aio_mmap(struct mmap *map, struct mmap_params *mp) { int delta_max, i, prio, ret; @@ -282,7 +282,7 @@ static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp) return 0; } -static void perf_mmap__aio_munmap(struct perf_mmap *map) +static void perf_mmap__aio_munmap(struct mmap *map) { int i; @@ -294,23 +294,23 @@ static void perf_mmap__aio_munmap(struct perf_mmap *map) zfree(&map->aio.aiocb); } #else /* !HAVE_AIO_SUPPORT */ -static int perf_mmap__aio_enabled(struct perf_mmap *map __maybe_unused) +static int perf_mmap__aio_enabled(struct mmap *map __maybe_unused) { return 0; } -static int perf_mmap__aio_mmap(struct perf_mmap *map __maybe_unused, +static int perf_mmap__aio_mmap(struct mmap *map __maybe_unused, struct mmap_params *mp __maybe_unused) { return 0; } -static void perf_mmap__aio_munmap(struct perf_mmap *map __maybe_unused) +static void perf_mmap__aio_munmap(struct mmap *map __maybe_unused) { } #endif -void perf_mmap__munmap(struct perf_mmap *map) +void perf_mmap__munmap(struct mmap *map) { perf_mmap__aio_munmap(map); if (map->data != NULL) { @@ -343,7 +343,7 @@ static void build_node_mask(int node, cpu_set_t *mask) } } -static void perf_mmap__setup_affinity_mask(struct perf_mmap *map, struct mmap_params *mp) +static void perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params *mp) { CPU_ZERO(&map->affinity_mask); if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1) @@ -352,7 +352,7 @@ static void perf_mmap__setup_affinity_mask(struct perf_mmap *map, struct mmap_pa CPU_SET(map->cpu, &map->affinity_mask); } -int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu) +int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) { /* * The last one will be done at perf_mmap__consume(), so that we @@ -440,7 +440,7 @@ static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end) /* * Report the start and end of the available data in ringbuffer */ -static int __perf_mmap__read_init(struct perf_mmap *md) +static int __perf_mmap__read_init(struct mmap *md) { u64 head = perf_mmap__read_head(md); u64 old = md->prev; @@ -474,7 +474,7 @@ static int __perf_mmap__read_init(struct perf_mmap *md) return 0; } -int perf_mmap__read_init(struct perf_mmap *map) +int perf_mmap__read_init(struct mmap *map) { /* * Check if event was unmapped due to a POLLHUP/POLLERR. @@ -485,8 +485,8 @@ int perf_mmap__read_init(struct perf_mmap *map) return __perf_mmap__read_init(map); } -int perf_mmap__push(struct perf_mmap *md, void *to, - int push(struct perf_mmap *map, void *to, void *buf, size_t size)) +int perf_mmap__push(struct mmap *md, void *to, + int push(struct mmap *map, void *to, void *buf, size_t size)) { u64 head = perf_mmap__read_head(md); unsigned char *data = md->base + page_size; @@ -532,7 +532,7 @@ out: * The last perf_mmap__read() will set tail to map->prev. * Need to correct the map->prev to head which is the end of next read. */ -void perf_mmap__read_done(struct perf_mmap *map) +void perf_mmap__read_done(struct mmap *map) { /* * Check if event was unmapped due to a POLLHUP/POLLERR. diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 3857a49e8f96..01524608a984 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -15,11 +15,11 @@ struct aiocb; /** - * struct perf_mmap - perf's ring buffer mmap details + * struct mmap - perf's ring buffer mmap details * * @refcnt - e.g. code using PERF_EVENT_IOC_SET_OUTPUT to share this */ -struct perf_mmap { +struct mmap { void *base; int mask; int fd; @@ -78,33 +78,33 @@ struct mmap_params { struct auxtrace_mmap_params auxtrace_mp; }; -int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu); -void perf_mmap__munmap(struct perf_mmap *map); +int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu); +void perf_mmap__munmap(struct mmap *map); -void perf_mmap__get(struct perf_mmap *map); -void perf_mmap__put(struct perf_mmap *map); +void perf_mmap__get(struct mmap *map); +void perf_mmap__put(struct mmap *map); -void perf_mmap__consume(struct perf_mmap *map); +void perf_mmap__consume(struct mmap *map); -static inline u64 perf_mmap__read_head(struct perf_mmap *mm) +static inline u64 perf_mmap__read_head(struct mmap *mm) { return ring_buffer_read_head(mm->base); } -static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail) +static inline void perf_mmap__write_tail(struct mmap *md, u64 tail) { ring_buffer_write_tail(md->base, tail); } -union perf_event *perf_mmap__read_forward(struct perf_mmap *map); +union perf_event *perf_mmap__read_forward(struct mmap *map); -union perf_event *perf_mmap__read_event(struct perf_mmap *map); +union perf_event *perf_mmap__read_event(struct mmap *map); -int perf_mmap__push(struct perf_mmap *md, void *to, - int push(struct perf_mmap *map, void *to, void *buf, size_t size)); +int perf_mmap__push(struct mmap *md, void *to, + int push(struct mmap *map, void *to, void *buf, size_t size)); -size_t perf_mmap__mmap_len(struct perf_mmap *map); +size_t perf_mmap__mmap_len(struct mmap *map); -int perf_mmap__read_init(struct perf_mmap *md); -void perf_mmap__read_done(struct perf_mmap *map); +int perf_mmap__read_init(struct mmap *md); +void perf_mmap__read_done(struct mmap *map); #endif /*__PERF_MMAP_H */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 0ba19dd75510..1e1247cffea8 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -984,12 +984,12 @@ static PyObject *pyrf_evlist__add(struct pyrf_evlist *pevlist, return Py_BuildValue("i", evlist->core.nr_entries); } -static struct perf_mmap *get_md(struct evlist *evlist, int cpu) +static struct mmap *get_md(struct evlist *evlist, int cpu) { int i; for (i = 0; i < evlist->nr_mmaps; i++) { - struct perf_mmap *md = &evlist->mmap[i]; + struct mmap *md = &evlist->mmap[i]; if (md->cpu == cpu) return md; @@ -1005,7 +1005,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, union perf_event *event; int sample_id_all = 1, cpu; static char *kwlist[] = { "cpu", "sample_id_all", NULL }; - struct perf_mmap *md; + struct mmap *md; int err; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", kwlist, From 9521b5f2d9d3e3fd6092fb9d7b00c914e7fa7d33 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 28 Jul 2019 12:45:35 +0200 Subject: [PATCH 1134/2880] perf tools: Rename perf_evlist__mmap() to evlist__mmap() Rename perf_evlist__mmap() to evlist__mmap(), so we don't have a name clash when we add perf_evlist__mmap() in libperf. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-5-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-record.c | 6 ++-- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/tests/backward-ring-buffer.c | 4 +-- tools/perf/tests/bpf.c | 4 +-- tools/perf/tests/code-reading.c | 4 +-- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/mmap-basic.c | 2 +- tools/perf/tests/openat-syscall-tp-fields.c | 4 +-- tools/perf/tests/perf-record.c | 4 +-- tools/perf/tests/sw-clock.c | 2 +- tools/perf/tests/switch-tracking.c | 4 +-- tools/perf/tests/task-exit.c | 2 +- tools/perf/util/evlist.c | 30 ++++++++++---------- tools/perf/util/evlist.h | 8 +++--- tools/perf/util/python.c | 2 +- 18 files changed, 43 insertions(+), 43 deletions(-) diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index c83abb1ed0ff..42ae47525a76 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -90,7 +90,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe CHECK__(evlist__open(evlist)); - CHECK__(perf_evlist__mmap(evlist, UINT_MAX)); + CHECK__(evlist__mmap(evlist, UINT_MAX)); pc = evlist->mmap[0].base; ret = perf_read_tsc_conversion(pc, &tc); diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 7354b77e9137..72debb7bd20d 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1060,7 +1060,7 @@ static int kvm_live_open_events(struct perf_kvm_stat *kvm) goto out; } - if (perf_evlist__mmap(evlist, kvm->opts.mmap_pages) < 0) { + if (evlist__mmap(evlist, kvm->opts.mmap_pages) < 0) { ui__error("Failed to mmap the events: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); evlist__close(evlist); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1528fb686f96..093d9ab5633e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -439,7 +439,7 @@ static int record__mmap_flush_parse(const struct option *opt, if (!opts->mmap_flush) opts->mmap_flush = MMAP_FLUSH_DEFAULT; - flush_max = perf_evlist__mmap_size(opts->mmap_pages); + flush_max = evlist__mmap_size(opts->mmap_pages); flush_max /= 4; if (opts->mmap_flush > flush_max) opts->mmap_flush = flush_max; @@ -707,7 +707,7 @@ static int record__mmap_evlist(struct record *rec, if (opts->affinity != PERF_AFFINITY_SYS) cpu__setup_cpunode_map(); - if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, + if (evlist__mmap_ex(evlist, opts->mmap_pages, opts->auxtrace_mmap_pages, opts->auxtrace_snapshot_mode, opts->nr_cblocks, opts->affinity, @@ -1980,7 +1980,7 @@ out_free: static void switch_output_size_warn(struct record *rec) { - u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages); + u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages); struct switch_output *s = &rec->switch_output; wakeup_size /= 2; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 834a927107c4..771b3ff47dc3 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1042,7 +1042,7 @@ try_again: } } - if (perf_evlist__mmap(evlist, opts->mmap_pages) < 0) { + if (evlist__mmap(evlist, opts->mmap_pages) < 0) { ui__error("Failed to mmap with %d (%s)\n", errno, str_error_r(errno, msg, sizeof(msg))); goto out_err; diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 2f853870d68d..ff989351567d 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3409,7 +3409,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (trace->dump.map) bpf_map__fprintf(trace->dump.map, trace->output); - err = perf_evlist__mmap(evlist, trace->opts.mmap_pages); + err = evlist__mmap(evlist, trace->opts.mmap_pages); if (err < 0) goto out_error_mmap; diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index 0a046096e6f4..f1eb7e9c1d7d 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -63,9 +63,9 @@ static int do_test(struct evlist *evlist, int mmap_pages, int err; char sbuf[STRERR_BUFSIZE]; - err = perf_evlist__mmap(evlist, mmap_pages); + err = evlist__mmap(evlist, mmap_pages); if (err < 0) { - pr_debug("perf_evlist__mmap: %s\n", + pr_debug("evlist__mmap: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); return TEST_FAIL; } diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index cf9776aceb82..964731915498 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -167,9 +167,9 @@ static int do_test(struct bpf_object *obj, int (*func)(void), goto out_delete_evlist; } - err = perf_evlist__mmap(evlist, opts.mmap_pages); + err = evlist__mmap(evlist, opts.mmap_pages); if (err < 0) { - pr_debug("perf_evlist__mmap: %s\n", + pr_debug("evlist__mmap: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); goto out_delete_evlist; } diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index df96cbb9ffe5..413783b69006 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -685,9 +685,9 @@ static int do_test_code_reading(bool try_kcore) break; } - ret = perf_evlist__mmap(evlist, UINT_MAX); + ret = evlist__mmap(evlist, UINT_MAX); if (ret < 0) { - pr_debug("perf_evlist__mmap failed\n"); + pr_debug("evlist__mmap failed\n"); goto out_put; } diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index 67a87f507bbd..8e2ec5f72042 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -104,7 +104,7 @@ int test__keep_tracking(struct test *test __maybe_unused, int subtest __maybe_un goto out_err; } - CHECK__(perf_evlist__mmap(evlist, UINT_MAX)); + CHECK__(evlist__mmap(evlist, UINT_MAX)); /* * First, test that a 'comm' event can be found when the event is diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index accca3d78fa3..4d42e455feb7 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -99,7 +99,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse expected_nr_events[i] = 1 + rand() % 127; } - if (perf_evlist__mmap(evlist, 128) < 0) { + if (evlist__mmap(evlist, 128) < 0) { pr_debug("failed to mmap events: %d (%s)\n", errno, str_error_r(errno, sbuf, sizeof(sbuf))); goto out_delete_evlist; diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index e5ef74d4e925..5c2576174ae9 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -69,9 +69,9 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest goto out_delete_evlist; } - err = perf_evlist__mmap(evlist, UINT_MAX); + err = evlist__mmap(evlist, UINT_MAX); if (err < 0) { - pr_debug("perf_evlist__mmap: %s\n", + pr_debug("evlist__mmap: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); goto out_delete_evlist; } diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 49d2d4c5956d..669fd88e7f30 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -143,9 +143,9 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus * fds in the same CPU to be injected in the same mmap ring buffer * (using ioctl(PERF_EVENT_IOC_SET_OUTPUT)). */ - err = perf_evlist__mmap(evlist, opts.mmap_pages); + err = evlist__mmap(evlist, opts.mmap_pages); if (err < 0) { - pr_debug("perf_evlist__mmap: %s\n", + pr_debug("evlist__mmap: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); goto out_delete_evlist; } diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index 7abe0b6aabb7..fbff60815be8 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -82,7 +82,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) goto out_delete_evlist; } - err = perf_evlist__mmap(evlist, 128); + err = evlist__mmap(evlist, 128); if (err < 0) { pr_debug("failed to mmap event: %d (%s)\n", errno, str_error_r(errno, sbuf, sizeof(sbuf))); diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index e7888ddd69a3..c92e287e0f53 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -460,9 +460,9 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ goto out; } - err = perf_evlist__mmap(evlist, UINT_MAX); + err = evlist__mmap(evlist, UINT_MAX); if (err) { - pr_debug("perf_evlist__mmap failed!\n"); + pr_debug("evlist__mmap failed!\n"); goto out_err; } diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index cff99707cc9d..718838b5e724 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -106,7 +106,7 @@ int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused goto out_delete_evlist; } - if (perf_evlist__mmap(evlist, 128) < 0) { + if (evlist__mmap(evlist, 128) < 0) { pr_debug("failed to mmap events: %d (%s)\n", errno, str_error_r(errno, sbuf, sizeof(sbuf))); goto out_delete_evlist; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ceab9fb7f7f9..e8afe4fdc1b2 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -732,7 +732,7 @@ perf_evlist__should_poll(struct evlist *evlist __maybe_unused, return true; } -static int perf_evlist__mmap_per_evsel(struct evlist *evlist, int idx, +static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, struct mmap_params *mp, int cpu_idx, int thread, int *_output, int *_output_overwrite) { @@ -810,7 +810,7 @@ static int perf_evlist__mmap_per_evsel(struct evlist *evlist, int idx, return 0; } -static int perf_evlist__mmap_per_cpu(struct evlist *evlist, +static int evlist__mmap_per_cpu(struct evlist *evlist, struct mmap_params *mp) { int cpu, thread; @@ -826,7 +826,7 @@ static int perf_evlist__mmap_per_cpu(struct evlist *evlist, true); for (thread = 0; thread < nr_threads; thread++) { - if (perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu, + if (evlist__mmap_per_evsel(evlist, cpu, mp, cpu, thread, &output, &output_overwrite)) goto out_unmap; } @@ -839,7 +839,7 @@ out_unmap: return -1; } -static int perf_evlist__mmap_per_thread(struct evlist *evlist, +static int evlist__mmap_per_thread(struct evlist *evlist, struct mmap_params *mp) { int thread; @@ -853,7 +853,7 @@ static int perf_evlist__mmap_per_thread(struct evlist *evlist, auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, thread, false); - if (perf_evlist__mmap_per_evsel(evlist, thread, mp, 0, thread, + if (evlist__mmap_per_evsel(evlist, thread, mp, 0, thread, &output, &output_overwrite)) goto out_unmap; } @@ -888,7 +888,7 @@ unsigned long perf_event_mlock_kb_in_pages(void) return pages; } -size_t perf_evlist__mmap_size(unsigned long pages) +size_t evlist__mmap_size(unsigned long pages) { if (pages == UINT_MAX) pages = perf_event_mlock_kb_in_pages(); @@ -971,7 +971,7 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str, } /** - * perf_evlist__mmap_ex - Create mmaps to receive events. + * evlist__mmap_ex - Create mmaps to receive events. * @evlist: list of events * @pages: map length in pages * @overwrite: overwrite older events? @@ -979,7 +979,7 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str, * @auxtrace_overwrite - overwrite older auxtrace data? * * If @overwrite is %false the user needs to signal event consumption using - * perf_mmap__write_tail(). Using perf_evlist__mmap_read() does this + * perf_mmap__write_tail(). Using evlist__mmap_read() does this * automatically. * * Similarly, if @auxtrace_overwrite is %false the user needs to signal data @@ -987,7 +987,7 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str, * * Return: %0 on success, negative error code otherwise. */ -int perf_evlist__mmap_ex(struct evlist *evlist, unsigned int pages, +int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, unsigned int auxtrace_pages, bool auxtrace_overwrite, int nr_cblocks, int affinity, int flush, int comp_level) @@ -1011,7 +1011,7 @@ int perf_evlist__mmap_ex(struct evlist *evlist, unsigned int pages, if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0) return -ENOMEM; - evlist->mmap_len = perf_evlist__mmap_size(pages); + evlist->mmap_len = evlist__mmap_size(pages); pr_debug("mmap size %zuB\n", evlist->mmap_len); mp.mask = evlist->mmap_len - page_size - 1; @@ -1026,14 +1026,14 @@ int perf_evlist__mmap_ex(struct evlist *evlist, unsigned int pages, } if (perf_cpu_map__empty(cpus)) - return perf_evlist__mmap_per_thread(evlist, &mp); + return evlist__mmap_per_thread(evlist, &mp); - return perf_evlist__mmap_per_cpu(evlist, &mp); + return evlist__mmap_per_cpu(evlist, &mp); } -int perf_evlist__mmap(struct evlist *evlist, unsigned int pages) +int evlist__mmap(struct evlist *evlist, unsigned int pages) { - return perf_evlist__mmap_ex(evlist, pages, 0, false, 0, PERF_AFFINITY_SYS, 1, 0); + return evlist__mmap_ex(evlist, pages, 0, false, 0, PERF_AFFINITY_SYS, 1, 0); } int perf_evlist__create_maps(struct evlist *evlist, struct target *target) @@ -1889,7 +1889,7 @@ int perf_evlist__start_sb_thread(struct evlist *evlist, goto out_delete_evlist; } - if (perf_evlist__mmap(evlist, UINT_MAX)) + if (evlist__mmap(evlist, UINT_MAX)) goto out_delete_evlist; evlist__for_each_entry(evlist, counter) { diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 129786361572..aaf06182c1b8 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -139,7 +139,7 @@ struct perf_sample_id *perf_evlist__id2sid(struct evlist *evlist, u64 id); void perf_evlist__toggle_bkw_mmap(struct evlist *evlist, enum bkw_mmap_state state); -void perf_evlist__mmap_consume(struct evlist *evlist, int idx); +void evlist__mmap_consume(struct evlist *evlist, int idx); int evlist__open(struct evlist *evlist); void evlist__close(struct evlist *evlist); @@ -170,14 +170,14 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, unsigned long perf_event_mlock_kb_in_pages(void); -int perf_evlist__mmap_ex(struct evlist *evlist, unsigned int pages, +int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, unsigned int auxtrace_pages, bool auxtrace_overwrite, int nr_cblocks, int affinity, int flush, int comp_level); -int perf_evlist__mmap(struct evlist *evlist, unsigned int pages); +int evlist__mmap(struct evlist *evlist, unsigned int pages); void perf_evlist__munmap(struct evlist *evlist); -size_t perf_evlist__mmap_size(unsigned long pages); +size_t evlist__mmap_size(unsigned long pages); void evlist__disable(struct evlist *evlist); void evlist__enable(struct evlist *evlist); diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 1e1247cffea8..b102d174b868 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -899,7 +899,7 @@ static PyObject *pyrf_evlist__mmap(struct pyrf_evlist *pevlist, &pages, &overwrite)) return NULL; - if (perf_evlist__mmap(evlist, pages) < 0) { + if (evlist__mmap(evlist, pages) < 0) { PyErr_SetFromErrno(PyExc_OSError); return NULL; } From db6b7b138506ad537edd12d98f674722a643c150 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 16 Aug 2019 16:19:55 +0200 Subject: [PATCH 1135/2880] perf tools: Rename perf_evlist__munmap() to evlist__munmap() Rename perf_evlist__munmap() to evlist__munmap(), so we don't have a name clash when we add perf_evlist__munmap() in libperf. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-6-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/backward-ring-buffer.c | 2 +- tools/perf/util/evlist.c | 12 ++++++------ tools/perf/util/evlist.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index f1eb7e9c1d7d..3073a68d17b9 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -75,7 +75,7 @@ static int do_test(struct evlist *evlist, int mmap_pages, evlist__disable(evlist); err = count_samples(evlist, sample_count, comm_count); - perf_evlist__munmap(evlist); + evlist__munmap(evlist); return err; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index e8afe4fdc1b2..888472c7bf9c 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -149,7 +149,7 @@ void evlist__delete(struct evlist *evlist) if (evlist == NULL) return; - perf_evlist__munmap(evlist); + evlist__munmap(evlist); evlist__close(evlist); perf_cpu_map__put(evlist->core.cpus); perf_thread_map__put(evlist->core.threads); @@ -673,7 +673,7 @@ static int perf_evlist__resume(struct evlist *evlist) return perf_evlist__set_paused(evlist, false); } -static void perf_evlist__munmap_nofree(struct evlist *evlist) +static void evlist__munmap_nofree(struct evlist *evlist) { int i; @@ -686,9 +686,9 @@ static void perf_evlist__munmap_nofree(struct evlist *evlist) perf_mmap__munmap(&evlist->overwrite_mmap[i]); } -void perf_evlist__munmap(struct evlist *evlist) +void evlist__munmap(struct evlist *evlist) { - perf_evlist__munmap_nofree(evlist); + evlist__munmap_nofree(evlist); zfree(&evlist->mmap); zfree(&evlist->overwrite_mmap); } @@ -835,7 +835,7 @@ static int evlist__mmap_per_cpu(struct evlist *evlist, return 0; out_unmap: - perf_evlist__munmap_nofree(evlist); + evlist__munmap_nofree(evlist); return -1; } @@ -861,7 +861,7 @@ static int evlist__mmap_per_thread(struct evlist *evlist, return 0; out_unmap: - perf_evlist__munmap_nofree(evlist); + evlist__munmap_nofree(evlist); return -1; } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index aaf06182c1b8..f07501602353 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -175,7 +175,7 @@ int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, bool auxtrace_overwrite, int nr_cblocks, int affinity, int flush, int comp_level); int evlist__mmap(struct evlist *evlist, unsigned int pages); -void perf_evlist__munmap(struct evlist *evlist); +void evlist__munmap(struct evlist *evlist); size_t evlist__mmap_size(unsigned long pages); From d50cf36115a08e668b4a0919a6807c3f9a9e8db8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 16 Aug 2019 16:21:46 +0200 Subject: [PATCH 1136/2880] perf tools: Rename perf_evlist__alloc_mmap() to evlist__alloc_mmap() Rename perf_evlist__alloc_mmap() to evlist__alloc_mmap(), so we don't have a name clash when we add perf_evlist__alloc_mmap() to libperf. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-7-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 888472c7bf9c..5bde2e5bf0e3 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -693,8 +693,8 @@ void evlist__munmap(struct evlist *evlist) zfree(&evlist->overwrite_mmap); } -static struct mmap *perf_evlist__alloc_mmap(struct evlist *evlist, - bool overwrite) +static struct mmap *evlist__alloc_mmap(struct evlist *evlist, + bool overwrite) { int i; struct mmap *map; @@ -752,7 +752,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, maps = evlist->overwrite_mmap; if (!maps) { - maps = perf_evlist__alloc_mmap(evlist, true); + maps = evlist__alloc_mmap(evlist, true); if (!maps) return -1; evlist->overwrite_mmap = maps; @@ -1004,7 +1004,7 @@ int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, .comp_level = comp_level }; if (!evlist->mmap) - evlist->mmap = perf_evlist__alloc_mmap(evlist, false); + evlist->mmap = evlist__alloc_mmap(evlist, false); if (!evlist->mmap) return -ENOMEM; From 470579b0211d370fd3bd7c2f2b2b098f1ca1ae65 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Sep 2019 14:34:52 +0200 Subject: [PATCH 1137/2880] perf tools: Rename perf_evlist__exit() to evlist__exit() Rename perf_evlist__exit() to evlist__exit(), so we don't have a name clash when we add perf_evlist__exit() to libperf. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-8-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 4 ++-- tools/perf/util/evlist.h | 2 +- tools/perf/util/python.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 5bde2e5bf0e3..4c5b7040c02b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -137,7 +137,7 @@ static void perf_evlist__purge(struct evlist *evlist) evlist->core.nr_entries = 0; } -void perf_evlist__exit(struct evlist *evlist) +void evlist__exit(struct evlist *evlist) { zfree(&evlist->mmap); zfree(&evlist->overwrite_mmap); @@ -156,7 +156,7 @@ void evlist__delete(struct evlist *evlist) evlist->core.cpus = NULL; evlist->core.threads = NULL; perf_evlist__purge(evlist); - perf_evlist__exit(evlist); + evlist__exit(evlist); free(evlist); } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index f07501602353..b33c5d67410a 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -65,7 +65,7 @@ struct evlist *perf_evlist__new_default(void); struct evlist *perf_evlist__new_dummy(void); void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus, struct perf_thread_map *threads); -void perf_evlist__exit(struct evlist *evlist); +void evlist__exit(struct evlist *evlist); void evlist__delete(struct evlist *evlist); void evlist__add(struct evlist *evlist, struct evsel *entry); diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index b102d174b868..60a6b6e8e176 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -884,7 +884,7 @@ static int pyrf_evlist__init(struct pyrf_evlist *pevlist, static void pyrf_evlist__delete(struct pyrf_evlist *pevlist) { - perf_evlist__exit(&pevlist->evlist); + evlist__exit(&pevlist->evlist); Py_TYPE(pevlist)->tp_free((PyObject*)pevlist); } From e6b1878d4eea85ab453e4b198cbb26a34614fdc0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Sep 2019 10:11:37 +0200 Subject: [PATCH 1138/2880] perf tools: Rename perf_evlist__purge() to evlist__purge() Rename (perf_evlist__purge) to evlist__purge(), so we don't have a name clash when we add (perf_evlist__purge) in libperf. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-9-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4c5b7040c02b..c96c743cc1d2 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -124,7 +124,7 @@ static void perf_evlist__update_id_pos(struct evlist *evlist) perf_evlist__set_id_pos(evlist); } -static void perf_evlist__purge(struct evlist *evlist) +static void evlist__purge(struct evlist *evlist) { struct evsel *pos, *n; @@ -155,7 +155,7 @@ void evlist__delete(struct evlist *evlist) perf_thread_map__put(evlist->core.threads); evlist->core.cpus = NULL; evlist->core.threads = NULL; - perf_evlist__purge(evlist); + evlist__purge(evlist); evlist__exit(evlist); free(evlist); } From d80a5540bccb4a9e7bd49d249056ce4c7d385ff3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 18 Aug 2019 23:02:58 +0200 Subject: [PATCH 1139/2880] libperf: Link libapi.a in libperf.so Linking libapi.a in libperf.so, because we are about to use some of the API functions in it. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-10-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/Makefile | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/tools/perf/lib/Makefile b/tools/perf/lib/Makefile index e325c0503dc6..54466cc84544 100644 --- a/tools/perf/lib/Makefile +++ b/tools/perf/lib/Makefile @@ -59,7 +59,13 @@ else CFLAGS := -g -Wall endif -INCLUDES = -I$(srctree)/tools/perf/lib/include -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(SRCARCH)/include/ -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi -I$(srctree)/tools/include/uapi +INCLUDES = \ +-I$(srctree)/tools/perf/lib/include \ +-I$(srctree)/tools/lib/ \ +-I$(srctree)/tools/include \ +-I$(srctree)/tools/arch/$(SRCARCH)/include/ \ +-I$(srctree)/tools/arch/$(SRCARCH)/include/uapi \ +-I$(srctree)/tools/include/uapi # Append required CFLAGS override CFLAGS += $(EXTRA_WARNINGS) @@ -88,13 +94,34 @@ LIBPERF_PC := $(OUTPUT)libperf.pc LIBPERF_ALL := $(LIBPERF_A) $(OUTPUT)libperf.so* +LIB_DIR := $(srctree)/tools/lib/api/ + +ifneq ($(OUTPUT),) +ifneq ($(subdir),) + API_PATH=$(OUTPUT)/../lib/api/ +else + API_PATH=$(OUTPUT) +endif +else + API_PATH=$(LIB_DIR) +endif + +LIBAPI = $(API_PATH)libapi.a + +$(LIBAPI): FORCE + $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) $(OUTPUT)libapi.a + +$(LIBAPI)-clean: + $(call QUIET_CLEAN, libapi) + $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null + $(LIBPERF_IN): FORCE $(Q)$(MAKE) $(build)=libperf $(LIBPERF_A): $(LIBPERF_IN) $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIBPERF_IN) -$(LIBPERF_SO): $(LIBPERF_IN) +$(LIBPERF_SO): $(LIBPERF_IN) $(LIBAPI) $(QUIET_LINK)$(CC) --shared -Wl,-soname,libperf.so \ -Wl,--version-script=$(VERSION_SCRIPT) $^ -o $@ @ln -sf $(@F) $(OUTPUT)libperf.so @@ -106,7 +133,7 @@ libs: $(LIBPERF_A) $(LIBPERF_SO) $(LIBPERF_PC) all: fixdep $(Q)$(MAKE) libs -clean: +clean: $(LIBAPI)-clean $(call QUIET_CLEAN, libperf) $(RM) $(LIBPERF_A) \ *.o *~ *.a *.so *.so.$(VERSION) *.so.$(LIBPERF_VERSION) .*.d .*.cmd LIBPERF-CFLAGS $(LIBPERF_PC) $(Q)$(MAKE) -C tests clean From e0fcfb086fbbb6233de1062d4b2f05e9afedab3b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 23 Sep 2019 12:20:38 -0300 Subject: [PATCH 1140/2880] perf evlist: Adopt backwards ring buffer state enum As this isn't used at all in mmap.h but in evlist.h, so to cut down the header dependency tree, move it to where it is used. Also add mmap.h to the places using it but previously getting it indirectly via evlist.h. Add missing pthread.h to evlist.h, as it has a pthread_t struct member and was getting the header via mmap.h. Noticed while processing a Jiri's libperf batch touching mmap.h, where almost everything gets rebuilt because evlist.h is so popular, so cut down't this rebuild the world party. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Song Liu Link: https://lkml.kernel.org/n/tip-he0uljeftl0xfveh3d6vtode@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/util/auxtrace.c | 1 + tools/perf/arch/x86/tests/perf-time-to-tsc.c | 1 + tools/perf/arch/x86/util/intel-bts.c | 1 + tools/perf/arch/x86/util/intel-pt.c | 1 + tools/perf/builtin-kvm.c | 1 + tools/perf/builtin-record.c | 1 + tools/perf/builtin-top.c | 1 + tools/perf/builtin-trace.c | 1 + tools/perf/tests/backward-ring-buffer.c | 1 + tools/perf/tests/bpf.c | 1 + tools/perf/tests/code-reading.c | 1 + tools/perf/tests/hists_link.c | 1 + tools/perf/tests/keep-tracking.c | 1 + tools/perf/tests/mmap-basic.c | 1 + tools/perf/tests/openat-syscall-tp-fields.c | 1 + tools/perf/tests/perf-record.c | 1 + tools/perf/tests/sw-clock.c | 1 + tools/perf/tests/switch-tracking.c | 1 + tools/perf/tests/task-exit.c | 1 + tools/perf/ui/gtk/hists.c | 1 + tools/perf/util/annotate.c | 1 + tools/perf/util/auxtrace.c | 1 + tools/perf/util/evlist.c | 1 + tools/perf/util/evlist.h | 30 +++++++++++++++++++- tools/perf/util/mmap.h | 28 ------------------ tools/perf/util/parse-events.c | 1 + 26 files changed, 53 insertions(+), 29 deletions(-) diff --git a/tools/perf/arch/s390/util/auxtrace.c b/tools/perf/arch/s390/util/auxtrace.c index b0fb70e38960..0db5c58c98e8 100644 --- a/tools/perf/arch/s390/util/auxtrace.c +++ b/tools/perf/arch/s390/util/auxtrace.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 42ae47525a76..e1dd5f8894ba 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -17,6 +17,7 @@ #include "thread_map.h" #include "record.h" #include "tsc.h" +#include "util/mmap.h" #include "tests/tests.h" #include "arch-tests.h" diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 090d90e093df..64b409dad6e2 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -15,6 +15,7 @@ #include "../../util/event.h" #include "../../util/evsel.h" #include "../../util/evlist.h" +#include "../../util/mmap.h" #include "../../util/session.h" #include "../../util/pmu.h" #include "../../util/debug.h" diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 3d041b89f018..6c139611d231 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -18,6 +18,7 @@ #include "../../util/evlist.h" #include "../../util/evsel.h" #include "../../util/cpumap.h" +#include "../../util/mmap.h" #include #include "../../util/parse-events.h" #include "../../util/pmu.h" diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 72debb7bd20d..30852848ed9c 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -5,6 +5,7 @@ #include "util/build-id.h" #include "util/evsel.h" #include "util/evlist.h" +#include "util/mmap.h" #include "util/term.h" #include "util/symbol.h" #include "util/thread.h" diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 093d9ab5633e..1bb3d91c6599 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -20,6 +20,7 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/debug.h" +#include "util/mmap.h" #include "util/target.h" #include "util/session.h" #include "util/tool.h" diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 771b3ff47dc3..e637a08655db 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -30,6 +30,7 @@ #include "util/event.h" #include "util/machine.h" #include "util/map.h" +#include "util/mmap.h" #include "util/session.h" #include "util/symbol.h" #include "util/synthetic-events.h" diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index ff989351567d..c44280358e58 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -31,6 +31,7 @@ #include "util/synthetic-events.h" #include "util/evlist.h" #include "util/evswitch.h" +#include "util/mmap.h" #include #include #include "util/machine.h" diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index 3073a68d17b9..c59d3752d48d 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -10,6 +10,7 @@ #include "tests.h" #include "debug.h" #include "parse-events.h" +#include "util/mmap.h" #include #include diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 964731915498..3c8533fdbce5 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -19,6 +19,7 @@ #include "llvm.h" #include "debug.h" #include "parse-events.h" +#include "util/mmap.h" #define NR_ITERS 111 #define PERF_TEST_BPF_PATH "/sys/fs/bpf/perf_test" diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 413783b69006..bc6db3e7a1c5 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -24,6 +24,7 @@ #include "symbol.h" #include "event.h" #include "record.h" +#include "util/mmap.h" #include "util/synthetic-events.h" #include "thread.h" diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index 8be4d0b61e3a..1a3bdc0a2d14 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -8,6 +8,7 @@ #include "machine.h" #include "parse-events.h" #include "hists_common.h" +#include "util/mmap.h" #include #include diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index 8e2ec5f72042..c6030fdf7d4c 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -13,6 +13,7 @@ #include "record.h" #include "thread_map.h" #include "tests.h" +#include "util/mmap.h" #define CHECK__(x) { \ while ((x) < 0) { \ diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 4d42e455feb7..3a22dce991ba 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -11,6 +11,7 @@ #include "evsel.h" #include "thread_map.h" #include "tests.h" +#include "util/mmap.h" #include #include #include diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index 5c2576174ae9..e20eaadb1a35 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -11,6 +11,7 @@ #include "record.h" #include "tests.h" #include "debug.h" +#include "util/mmap.h" #include #ifndef O_DIRECTORY diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 669fd88e7f30..ea8bcaa13ea5 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -11,6 +11,7 @@ #include "debug.h" #include "record.h" #include "tests.h" +#include "util/mmap.h" static int sched__get_first_possible_cpu(pid_t pid, cpu_set_t *maskp) { diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index fbff60815be8..84519df87f30 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -12,6 +12,7 @@ #include "util/evsel.h" #include "util/evlist.h" #include "util/cpumap.h" +#include "util/mmap.h" #include "util/thread_map.h" #include diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index c92e287e0f53..e5c3f2ee223a 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -16,6 +16,7 @@ #include "thread_map.h" #include "record.h" #include "tests.h" +#include "util/mmap.h" static int spin_sleep(void) { diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index 718838b5e724..7fc39af48a76 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -5,6 +5,7 @@ #include "target.h" #include "thread_map.h" #include "tests.h" +#include "util/mmap.h" #include #include diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 6c2efc10bf5c..ed1a97b2c4b0 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -8,6 +8,7 @@ #include "../string2.h" #include "gtk.h" #include +#include #include #define MAX_COLUMNS 32 diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index d441cca6a517..69d0a1991b29 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -34,6 +34,7 @@ #include "bpf-event.h" #include "block-range.h" #include "string2.h" +#include "util/mmap.h" #include "arch/common.h" #include #include diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 1d22ffd972ac..4bd92f5bfb5f 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -51,6 +51,7 @@ #include "arm-spe.h" #include "s390-cpumsf.h" #include "util.h" // page_size +#include "util/mmap.h" #include #include diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index c96c743cc1d2..f700dbe043b7 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -10,6 +10,7 @@ #include #include #include "cpumap.h" +#include "util/mmap.h" #include "thread_map.h" #include "target.h" #include "evlist.h" diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index b33c5d67410a..8b9c35efea67 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -11,7 +11,7 @@ #include #include "events_stats.h" #include "evsel.h" -#include "mmap.h" +#include #include #include @@ -20,6 +20,34 @@ struct thread_map; struct perf_cpu_map; struct record_opts; +/* + * State machine of bkw_mmap_state: + * + * .________________(forbid)_____________. + * | V + * NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY + * ^ ^ | ^ | + * | |__(forbid)____/ |___(forbid)___/| + * | | + * \_________________(3)_______________/ + * + * NOTREADY : Backward ring buffers are not ready + * RUNNING : Backward ring buffers are recording + * DATA_PENDING : We are required to collect data from backward ring buffers + * EMPTY : We have collected data from backward ring buffers. + * + * (0): Setup backward ring buffer + * (1): Pause ring buffers for reading + * (2): Read from ring buffers + * (3): Resume ring buffers for recording + */ +enum bkw_mmap_state { + BKW_MMAP_NOTREADY, + BKW_MMAP_RUNNING, + BKW_MMAP_DATA_PENDING, + BKW_MMAP_EMPTY, +}; + #define PERF_EVLIST__HLIST_BITS 8 #define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 01524608a984..ab086ede044a 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -45,34 +45,6 @@ struct mmap { int comp_level; }; -/* - * State machine of bkw_mmap_state: - * - * .________________(forbid)_____________. - * | V - * NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY - * ^ ^ | ^ | - * | |__(forbid)____/ |___(forbid)___/| - * | | - * \_________________(3)_______________/ - * - * NOTREADY : Backward ring buffers are not ready - * RUNNING : Backward ring buffers are recording - * DATA_PENDING : We are required to collect data from backward ring buffers - * EMPTY : We have collected data from backward ring buffers. - * - * (0): Setup backward ring buffer - * (1): Pause ring buffers for reading - * (2): Read from ring buffers - * (3): Resume ring buffers for recording - */ -enum bkw_mmap_state { - BKW_MMAP_NOTREADY, - BKW_MMAP_RUNNING, - BKW_MMAP_DATA_PENDING, - BKW_MMAP_EMPTY, -}; - struct mmap_params { int prot, mask, nr_cblocks, affinity, flush, comp_level; struct auxtrace_mmap_params auxtrace_mp; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index a33a1d5059a2..9cf1371c90a3 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -34,6 +34,7 @@ #include "asm/bug.h" #include "util/parse-branch-options.h" #include "metricgroup.h" +#include "util/mmap.h" #define MAX_NAME_LEN 100 From 547740f7b357cd91cca1fab5d7bf3a37469f7587 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Jul 2019 22:07:44 +0200 Subject: [PATCH 1141/2880] libperf: Add perf_mmap struct Add the perf_mmap struct to libperf. The definition is added into: include/internal/mmap.h which is not to be included by users, but shared within perf and libperf. Committer notes: Remove unnecessary includes from tools/perf/lib/include/internal/mmap.h, those will be readded as they become necessary, later in the series. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-11-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/arch/x86/util/intel-bts.c | 2 +- tools/perf/arch/x86/util/intel-pt.c | 2 +- tools/perf/builtin-record.c | 14 ++++++------- tools/perf/lib/include/internal/mmap.h | 14 +++++++++++++ tools/perf/util/mmap.c | 22 ++++++++++---------- tools/perf/util/mmap.h | 7 ++++--- 7 files changed, 39 insertions(+), 24 deletions(-) create mode 100644 tools/perf/lib/include/internal/mmap.h diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index e1dd5f8894ba..bd404db94b3a 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -93,7 +93,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe CHECK__(evlist__mmap(evlist, UINT_MAX)); - pc = evlist->mmap[0].base; + pc = evlist->mmap[0].core.base; ret = perf_read_tsc_conversion(pc, &tc); if (ret) { if (ret == -EOPNOTSUPP) { diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 64b409dad6e2..130925141369 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -78,7 +78,7 @@ static int intel_bts_info_fill(struct auxtrace_record *itr, if (!session->evlist->nr_mmaps) return -EINVAL; - pc = session->evlist->mmap[0].base; + pc = session->evlist->mmap[0].core.base; if (pc) { err = perf_read_tsc_conversion(pc, &tc); if (err) { diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 6c139611d231..34d7118bf390 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -355,7 +355,7 @@ static int intel_pt_info_fill(struct auxtrace_record *itr, if (!session->evlist->nr_mmaps) return -EINVAL; - pc = session->evlist->mmap[0].base; + pc = session->evlist->mmap[0].core.base; if (pc) { err = perf_read_tsc_conversion(pc, &tc); if (err) { diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1bb3d91c6599..2520c0212275 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -261,7 +261,7 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size struct record_aio *aio = to; /* - * map->base data pointed by buf is copied into free map->aio.data[] buffer + * map->core.base data pointed by buf is copied into free map->aio.data[] buffer * to release space in the kernel buffer as fast as possible, calling * perf_mmap__consume() from perf_mmap__push() function. * @@ -360,7 +360,7 @@ static void record__aio_mmap_read_sync(struct record *rec) for (i = 0; i < evlist->nr_mmaps; i++) { struct mmap *map = &maps[i]; - if (map->base) + if (map->core.base) record__aio_sync(map, true); } } @@ -970,7 +970,7 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist, u64 flush = 0; struct mmap *map = &maps[i]; - if (map->base) { + if (map->core.base) { record__adjust_affinity(rec, map); if (synch) { flush = map->flush; @@ -1198,10 +1198,10 @@ static const struct perf_event_mmap_page * perf_evlist__pick_pc(struct evlist *evlist) { if (evlist) { - if (evlist->mmap && evlist->mmap[0].base) - return evlist->mmap[0].base; - if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base) - return evlist->overwrite_mmap[0].base; + if (evlist->mmap && evlist->mmap[0].core.base) + return evlist->mmap[0].core.base; + if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base) + return evlist->overwrite_mmap[0].core.base; } return NULL; } diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h new file mode 100644 index 000000000000..2ef051901f48 --- /dev/null +++ b/tools/perf/lib/include/internal/mmap.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LIBPERF_INTERNAL_MMAP_H +#define __LIBPERF_INTERNAL_MMAP_H + +/** + * struct perf_mmap - perf's ring buffer mmap details + * + * @refcnt - e.g. code using PERF_EVENT_IOC_SET_OUTPUT to share this + */ +struct perf_mmap { + void *base; +}; + +#endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index f3b7c8b0fa90..76190b2edd78 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -31,7 +31,7 @@ size_t perf_mmap__mmap_len(struct mmap *map) static union perf_event *perf_mmap__read(struct mmap *map, u64 *startp, u64 end) { - unsigned char *data = map->base + page_size; + unsigned char *data = map->core.base + page_size; union perf_event *event = NULL; int diff = end - *startp; @@ -116,7 +116,7 @@ void perf_mmap__get(struct mmap *map) void perf_mmap__put(struct mmap *map) { - BUG_ON(map->base && refcount_read(&map->refcnt) == 0); + BUG_ON(map->core.base && refcount_read(&map->refcnt) == 0); if (refcount_dec_and_test(&map->refcnt)) perf_mmap__munmap(map); @@ -317,9 +317,9 @@ void perf_mmap__munmap(struct mmap *map) munmap(map->data, perf_mmap__mmap_len(map)); map->data = NULL; } - if (map->base != NULL) { - munmap(map->base, perf_mmap__mmap_len(map)); - map->base = NULL; + if (map->core.base != NULL) { + munmap(map->core.base, perf_mmap__mmap_len(map)); + map->core.base = NULL; map->fd = -1; refcount_set(&map->refcnt, 0); } @@ -370,12 +370,12 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) refcount_set(&map->refcnt, 2); map->prev = 0; map->mask = mp->mask; - map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, + map->core.base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, MAP_SHARED, fd, 0); - if (map->base == MAP_FAILED) { + if (map->core.base == MAP_FAILED) { pr_debug2("failed to mmap perf event ring buffer, error %d\n", errno); - map->base = NULL; + map->core.base = NULL; return -1; } map->fd = fd; @@ -399,7 +399,7 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) } if (auxtrace_mmap__mmap(&map->auxtrace_mmap, - &mp->auxtrace_mp, map->base, fd)) + &mp->auxtrace_mp, map->core.base, fd)) return -1; return perf_mmap__aio_mmap(map, mp); @@ -444,7 +444,7 @@ static int __perf_mmap__read_init(struct mmap *md) { u64 head = perf_mmap__read_head(md); u64 old = md->prev; - unsigned char *data = md->base + page_size; + unsigned char *data = md->core.base + page_size; unsigned long size; md->start = md->overwrite ? head : old; @@ -489,7 +489,7 @@ int perf_mmap__push(struct mmap *md, void *to, int push(struct mmap *map, void *to, void *buf, size_t size)) { u64 head = perf_mmap__read_head(md); - unsigned char *data = md->base + page_size; + unsigned char *data = md->core.base + page_size; unsigned long size; void *buf; int rc = 0; diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index ab086ede044a..d2f0ce581e2c 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -1,6 +1,7 @@ #ifndef __PERF_MMAP_H #define __PERF_MMAP_H 1 +#include #include #include #include @@ -20,7 +21,7 @@ struct aiocb; * @refcnt - e.g. code using PERF_EVENT_IOC_SET_OUTPUT to share this */ struct mmap { - void *base; + struct perf_mmap core; int mask; int fd; int cpu; @@ -60,12 +61,12 @@ void perf_mmap__consume(struct mmap *map); static inline u64 perf_mmap__read_head(struct mmap *mm) { - return ring_buffer_read_head(mm->base); + return ring_buffer_read_head(mm->core.base); } static inline void perf_mmap__write_tail(struct mmap *md, u64 tail) { - ring_buffer_write_tail(md->base, tail); + ring_buffer_write_tail(md->core.base, tail); } union perf_event *perf_mmap__read_forward(struct mmap *map); From 4fd0cef2c7b6469abfeef1f9bd056265ce369b13 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Jul 2019 22:27:55 +0200 Subject: [PATCH 1142/2880] libperf: Add 'mask' to struct perf_mmap Move 'mask' from tools/perf's mmap to libperf's perf_mmap struct. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-12-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/mmap.h | 1 + tools/perf/util/mmap.c | 24 ++++++++++++------------ tools/perf/util/mmap.h | 1 - 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index 2ef051901f48..1caa1e8ee5c6 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -9,6 +9,7 @@ */ struct perf_mmap { void *base; + int mask; }; #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 76190b2edd78..702e8e0b90ea 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -24,7 +24,7 @@ size_t perf_mmap__mmap_len(struct mmap *map) { - return map->mask + 1 + page_size; + return map->core.mask + 1 + page_size; } /* When check_messup is true, 'end' must points to a good entry */ @@ -38,7 +38,7 @@ static union perf_event *perf_mmap__read(struct mmap *map, if (diff >= (int)sizeof(event->header)) { size_t size; - event = (union perf_event *)&data[*startp & map->mask]; + event = (union perf_event *)&data[*startp & map->core.mask]; size = event->header.size; if (size < sizeof(event->header) || diff < (int)size) @@ -48,14 +48,14 @@ static union perf_event *perf_mmap__read(struct mmap *map, * Event straddles the mmap boundary -- header should always * be inside due to u64 alignment of output. */ - if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) { + if ((*startp & map->core.mask) + size != ((*startp + size) & map->core.mask)) { unsigned int offset = *startp; unsigned int len = min(sizeof(*event), size), cpy; void *dst = map->event_copy; do { - cpy = min(map->mask + 1 - (offset & map->mask), len); - memcpy(dst, &data[offset & map->mask], cpy); + cpy = min(map->core.mask + 1 - (offset & map->core.mask), len); + memcpy(dst, &data[offset & map->core.mask], cpy); offset += cpy; dst += cpy; len -= cpy; @@ -369,7 +369,7 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) */ refcount_set(&map->refcnt, 2); map->prev = 0; - map->mask = mp->mask; + map->core.mask = mp->mask; map->core.base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, MAP_SHARED, fd, 0); if (map->core.base == MAP_FAILED) { @@ -454,7 +454,7 @@ static int __perf_mmap__read_init(struct mmap *md) return -EAGAIN; size = md->end - md->start; - if (size > (unsigned long)(md->mask) + 1) { + if (size > (unsigned long)(md->core.mask) + 1) { if (!md->overwrite) { WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); @@ -467,7 +467,7 @@ static int __perf_mmap__read_init(struct mmap *md) * Backward ring buffer is full. We still have a chance to read * most of data from it. */ - if (overwrite_rb_find_range(data, md->mask, &md->start, &md->end)) + if (overwrite_rb_find_range(data, md->core.mask, &md->start, &md->end)) return -EINVAL; } @@ -500,9 +500,9 @@ int perf_mmap__push(struct mmap *md, void *to, size = md->end - md->start; - if ((md->start & md->mask) + size != (md->end & md->mask)) { - buf = &data[md->start & md->mask]; - size = md->mask + 1 - (md->start & md->mask); + if ((md->start & md->core.mask) + size != (md->end & md->core.mask)) { + buf = &data[md->start & md->core.mask]; + size = md->core.mask + 1 - (md->start & md->core.mask); md->start += size; if (push(md, to, buf, size) < 0) { @@ -511,7 +511,7 @@ int perf_mmap__push(struct mmap *md, void *to, } } - buf = &data[md->start & md->mask]; + buf = &data[md->start & md->core.mask]; size = md->end - md->start; md->start += size; diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index d2f0ce581e2c..370138e395fc 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -22,7 +22,6 @@ struct aiocb; */ struct mmap { struct perf_mmap core; - int mask; int fd; int cpu; refcount_t refcnt; From 2cf07b294a604aecd6b583e60724eaa1607f0fbc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Jul 2019 22:31:17 +0200 Subject: [PATCH 1143/2880] libperf: Add 'fd' to struct perf_mmap Move 'fd' from tools/perf's mmap to libperf's perf_mmap struct. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-13-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/mmap.h | 1 + tools/perf/util/evlist.c | 4 ++-- tools/perf/util/mmap.c | 4 ++-- tools/perf/util/mmap.h | 1 - 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index 1caa1e8ee5c6..892cbd401d8d 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -10,6 +10,7 @@ struct perf_mmap { void *base; int mask; + int fd; }; #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index f700dbe043b7..a4e1c19c969f 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -652,7 +652,7 @@ static int perf_evlist__set_paused(struct evlist *evlist, bool value) return 0; for (i = 0; i < evlist->nr_mmaps; i++) { - int fd = evlist->overwrite_mmap[i].fd; + int fd = evlist->overwrite_mmap[i].core.fd; int err; if (fd < 0) @@ -708,7 +708,7 @@ static struct mmap *evlist__alloc_mmap(struct evlist *evlist, return NULL; for (i = 0; i < evlist->nr_mmaps; i++) { - map[i].fd = -1; + map[i].core.fd = -1; map[i].overwrite = overwrite; /* * When the perf_mmap() call is made we grab one refcount, plus diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 702e8e0b90ea..40bf124cb658 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -320,7 +320,7 @@ void perf_mmap__munmap(struct mmap *map) if (map->core.base != NULL) { munmap(map->core.base, perf_mmap__mmap_len(map)); map->core.base = NULL; - map->fd = -1; + map->core.fd = -1; refcount_set(&map->refcnt, 0); } auxtrace_mmap__munmap(&map->auxtrace_mmap); @@ -378,7 +378,7 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) map->core.base = NULL; return -1; } - map->fd = fd; + map->core.fd = fd; map->cpu = cpu; perf_mmap__setup_affinity_mask(map, mp); diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 370138e395fc..de991194af8d 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -22,7 +22,6 @@ struct aiocb; */ struct mmap { struct perf_mmap core; - int fd; int cpu; refcount_t refcnt; u64 prev; From 56a94706cd7233b158ab13e5ac93f5a97ca88941 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Jul 2019 22:33:20 +0200 Subject: [PATCH 1144/2880] libperf: Add 'cpu' to struct perf_mmap Move 'cpu' from tools/perf's mmap to libperf's perf_mmap struct. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-14-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/mmap.h | 1 + tools/perf/util/mmap.c | 8 ++++---- tools/perf/util/mmap.h | 1 - tools/perf/util/python.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index 892cbd401d8d..f7d1809fa34c 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -11,6 +11,7 @@ struct perf_mmap { void *base; int mask; int fd; + int cpu; }; #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 40bf124cb658..dc8320891344 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -256,7 +256,7 @@ static int perf_mmap__aio_mmap(struct mmap *map, struct mmap_params *mp) pr_debug2("failed to allocate data buffer area, error %m"); return -1; } - ret = perf_mmap__aio_bind(map, i, map->cpu, mp->affinity); + ret = perf_mmap__aio_bind(map, i, map->core.cpu, mp->affinity); if (ret == -1) return -1; /* @@ -347,9 +347,9 @@ static void perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params { CPU_ZERO(&map->affinity_mask); if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1) - build_node_mask(cpu__get_node(map->cpu), &map->affinity_mask); + build_node_mask(cpu__get_node(map->core.cpu), &map->affinity_mask); else if (mp->affinity == PERF_AFFINITY_CPU) - CPU_SET(map->cpu, &map->affinity_mask); + CPU_SET(map->core.cpu, &map->affinity_mask); } int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) @@ -379,7 +379,7 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) return -1; } map->core.fd = fd; - map->cpu = cpu; + map->core.cpu = cpu; perf_mmap__setup_affinity_mask(map, mp); diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index de991194af8d..8ab779c98f4d 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -22,7 +22,6 @@ struct aiocb; */ struct mmap { struct perf_mmap core; - int cpu; refcount_t refcnt; u64 prev; u64 start; diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 60a6b6e8e176..ba4085d7ae9f 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -991,7 +991,7 @@ static struct mmap *get_md(struct evlist *evlist, int cpu) for (i = 0; i < evlist->nr_mmaps; i++) { struct mmap *md = &evlist->mmap[i]; - if (md->cpu == cpu) + if (md->core.cpu == cpu) return md; } From e03edfeac0330eaa2b19b82fc942611c1abf2120 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Jul 2019 22:35:35 +0200 Subject: [PATCH 1145/2880] libperf: Add 'refcnt' to struct perf_mmap Move 'refcnt' from tools/perf's mmap to libperf's perf_mmap struct. Committer notes: Add the refcount.h include directive here, now it is needed. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-15-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/mmap.h | 3 +++ tools/perf/util/evlist.c | 2 +- tools/perf/util/mmap.c | 18 +++++++++--------- tools/perf/util/mmap.h | 1 - 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index f7d1809fa34c..e6fb850ba47a 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -2,6 +2,8 @@ #ifndef __LIBPERF_INTERNAL_MMAP_H #define __LIBPERF_INTERNAL_MMAP_H +#include + /** * struct perf_mmap - perf's ring buffer mmap details * @@ -12,6 +14,7 @@ struct perf_mmap { int mask; int fd; int cpu; + refcount_t refcnt; }; #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index a4e1c19c969f..d987e0e5d62b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -719,7 +719,7 @@ static struct mmap *evlist__alloc_mmap(struct evlist *evlist, * Each PERF_EVENT_IOC_SET_OUTPUT points to this mmap and * thus does perf_mmap__get() on it. */ - refcount_set(&map[i].refcnt, 0); + refcount_set(&map[i].core.refcnt, 0); } return map; } diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index dc8320891344..d6406d216cfe 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -89,7 +89,7 @@ union perf_event *perf_mmap__read_event(struct mmap *map) /* * Check if event was unmapped due to a POLLHUP/POLLERR. */ - if (!refcount_read(&map->refcnt)) + if (!refcount_read(&map->core.refcnt)) return NULL; /* non-overwirte doesn't pause the ringbuffer */ @@ -111,14 +111,14 @@ static bool perf_mmap__empty(struct mmap *map) void perf_mmap__get(struct mmap *map) { - refcount_inc(&map->refcnt); + refcount_inc(&map->core.refcnt); } void perf_mmap__put(struct mmap *map) { - BUG_ON(map->core.base && refcount_read(&map->refcnt) == 0); + BUG_ON(map->core.base && refcount_read(&map->core.refcnt) == 0); - if (refcount_dec_and_test(&map->refcnt)) + if (refcount_dec_and_test(&map->core.refcnt)) perf_mmap__munmap(map); } @@ -130,7 +130,7 @@ void perf_mmap__consume(struct mmap *map) perf_mmap__write_tail(map, old); } - if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map)) + if (refcount_read(&map->core.refcnt) == 1 && perf_mmap__empty(map)) perf_mmap__put(map); } @@ -321,7 +321,7 @@ void perf_mmap__munmap(struct mmap *map) munmap(map->core.base, perf_mmap__mmap_len(map)); map->core.base = NULL; map->core.fd = -1; - refcount_set(&map->refcnt, 0); + refcount_set(&map->core.refcnt, 0); } auxtrace_mmap__munmap(&map->auxtrace_mmap); } @@ -367,7 +367,7 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) * evlist layer can't just drop it when filtering events in * perf_evlist__filter_pollfd(). */ - refcount_set(&map->refcnt, 2); + refcount_set(&map->core.refcnt, 2); map->prev = 0; map->core.mask = mp->mask; map->core.base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, @@ -479,7 +479,7 @@ int perf_mmap__read_init(struct mmap *map) /* * Check if event was unmapped due to a POLLHUP/POLLERR. */ - if (!refcount_read(&map->refcnt)) + if (!refcount_read(&map->core.refcnt)) return -ENOENT; return __perf_mmap__read_init(map); @@ -537,7 +537,7 @@ void perf_mmap__read_done(struct mmap *map) /* * Check if event was unmapped due to a POLLHUP/POLLERR. */ - if (!refcount_read(&map->refcnt)) + if (!refcount_read(&map->core.refcnt)) return; map->prev = perf_mmap__read_head(map); diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 8ab779c98f4d..5febd22fbe2e 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -22,7 +22,6 @@ struct aiocb; */ struct mmap { struct perf_mmap core; - refcount_t refcnt; u64 prev; u64 start; u64 end; From ebe4d72bba86a499ea0935c58ba1c2aea5aafb43 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Jul 2019 22:39:53 +0200 Subject: [PATCH 1146/2880] libperf: Add prev/start/end to struct perf_mmap Move prev/start/end from tools/perf's mmap to libperf's perf_mmap struct. Committer notes: Add linux/types.h as we use u64. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-16-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/mmap.h | 4 +++ tools/perf/util/mmap.c | 50 +++++++++++++------------- tools/perf/util/mmap.h | 3 -- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index e6fb850ba47a..ebf5b93754fd 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -3,6 +3,7 @@ #define __LIBPERF_INTERNAL_MMAP_H #include +#include /** * struct perf_mmap - perf's ring buffer mmap details @@ -15,6 +16,9 @@ struct perf_mmap { int fd; int cpu; refcount_t refcnt; + u64 prev; + u64 start; + u64 end; }; #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index d6406d216cfe..6ce70ff005cb 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -94,19 +94,19 @@ union perf_event *perf_mmap__read_event(struct mmap *map) /* non-overwirte doesn't pause the ringbuffer */ if (!map->overwrite) - map->end = perf_mmap__read_head(map); + map->core.end = perf_mmap__read_head(map); - event = perf_mmap__read(map, &map->start, map->end); + event = perf_mmap__read(map, &map->core.start, map->core.end); if (!map->overwrite) - map->prev = map->start; + map->core.prev = map->core.start; return event; } static bool perf_mmap__empty(struct mmap *map) { - return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base; + return perf_mmap__read_head(map) == map->core.prev && !map->auxtrace_mmap.base; } void perf_mmap__get(struct mmap *map) @@ -125,7 +125,7 @@ void perf_mmap__put(struct mmap *map) void perf_mmap__consume(struct mmap *map) { if (!map->overwrite) { - u64 old = map->prev; + u64 old = map->core.prev; perf_mmap__write_tail(map, old); } @@ -368,7 +368,7 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) * perf_evlist__filter_pollfd(). */ refcount_set(&map->core.refcnt, 2); - map->prev = 0; + map->core.prev = 0; map->core.mask = mp->mask; map->core.base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, MAP_SHARED, fd, 0); @@ -443,22 +443,22 @@ static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end) static int __perf_mmap__read_init(struct mmap *md) { u64 head = perf_mmap__read_head(md); - u64 old = md->prev; + u64 old = md->core.prev; unsigned char *data = md->core.base + page_size; unsigned long size; - md->start = md->overwrite ? head : old; - md->end = md->overwrite ? old : head; + md->core.start = md->overwrite ? head : old; + md->core.end = md->overwrite ? old : head; - if ((md->end - md->start) < md->flush) + if ((md->core.end - md->core.start) < md->flush) return -EAGAIN; - size = md->end - md->start; + size = md->core.end - md->core.start; if (size > (unsigned long)(md->core.mask) + 1) { if (!md->overwrite) { WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); - md->prev = head; + md->core.prev = head; perf_mmap__consume(md); return -EAGAIN; } @@ -467,7 +467,7 @@ static int __perf_mmap__read_init(struct mmap *md) * Backward ring buffer is full. We still have a chance to read * most of data from it. */ - if (overwrite_rb_find_range(data, md->core.mask, &md->start, &md->end)) + if (overwrite_rb_find_range(data, md->core.mask, &md->core.start, &md->core.end)) return -EINVAL; } @@ -498,12 +498,12 @@ int perf_mmap__push(struct mmap *md, void *to, if (rc < 0) return (rc == -EAGAIN) ? 1 : -1; - size = md->end - md->start; + size = md->core.end - md->core.start; - if ((md->start & md->core.mask) + size != (md->end & md->core.mask)) { - buf = &data[md->start & md->core.mask]; - size = md->core.mask + 1 - (md->start & md->core.mask); - md->start += size; + if ((md->core.start & md->core.mask) + size != (md->core.end & md->core.mask)) { + buf = &data[md->core.start & md->core.mask]; + size = md->core.mask + 1 - (md->core.start & md->core.mask); + md->core.start += size; if (push(md, to, buf, size) < 0) { rc = -1; @@ -511,16 +511,16 @@ int perf_mmap__push(struct mmap *md, void *to, } } - buf = &data[md->start & md->core.mask]; - size = md->end - md->start; - md->start += size; + buf = &data[md->core.start & md->core.mask]; + size = md->core.end - md->core.start; + md->core.start += size; if (push(md, to, buf, size) < 0) { rc = -1; goto out; } - md->prev = head; + md->core.prev = head; perf_mmap__consume(md); out: return rc; @@ -529,8 +529,8 @@ out: /* * Mandatory for overwrite mode * The direction of overwrite mode is backward. - * The last perf_mmap__read() will set tail to map->prev. - * Need to correct the map->prev to head which is the end of next read. + * The last perf_mmap__read() will set tail to map->core.prev. + * Need to correct the map->core.prev to head which is the end of next read. */ void perf_mmap__read_done(struct mmap *map) { @@ -540,5 +540,5 @@ void perf_mmap__read_done(struct mmap *map) if (!refcount_read(&map->core.refcnt)) return; - map->prev = perf_mmap__read_head(map); + map->core.prev = perf_mmap__read_head(map); } diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 5febd22fbe2e..a3dd53f2bfb8 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -22,9 +22,6 @@ struct aiocb; */ struct mmap { struct perf_mmap core; - u64 prev; - u64 start; - u64 end; bool overwrite; struct auxtrace_mmap auxtrace_mmap; char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); From 8df7a869818ec278969d34e4792985f12b24f23d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Jul 2019 22:42:56 +0200 Subject: [PATCH 1147/2880] libperf: Add 'overwrite' to 'struct perf_mmap' Move 'overwrite' from tools/perf's mmap to libperf's perf_mmap struct. Committer notes: Add stdbool.h as we start using 'bool'. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-17-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/mmap.h | 2 ++ tools/perf/util/evlist.c | 2 +- tools/perf/util/mmap.c | 12 ++++++------ tools/perf/util/mmap.h | 1 - 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index ebf5b93754fd..47c09f375fb6 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -4,6 +4,7 @@ #include #include +#include /** * struct perf_mmap - perf's ring buffer mmap details @@ -19,6 +20,7 @@ struct perf_mmap { u64 prev; u64 start; u64 end; + bool overwrite; }; #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d987e0e5d62b..16d47a420bc2 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -709,7 +709,7 @@ static struct mmap *evlist__alloc_mmap(struct evlist *evlist, for (i = 0; i < evlist->nr_mmaps; i++) { map[i].core.fd = -1; - map[i].overwrite = overwrite; + map[i].core.overwrite = overwrite; /* * When the perf_mmap() call is made we grab one refcount, plus * one extra to let perf_mmap__consume() get the last diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 6ce70ff005cb..a8850ce2c2ff 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -93,12 +93,12 @@ union perf_event *perf_mmap__read_event(struct mmap *map) return NULL; /* non-overwirte doesn't pause the ringbuffer */ - if (!map->overwrite) + if (!map->core.overwrite) map->core.end = perf_mmap__read_head(map); event = perf_mmap__read(map, &map->core.start, map->core.end); - if (!map->overwrite) + if (!map->core.overwrite) map->core.prev = map->core.start; return event; @@ -124,7 +124,7 @@ void perf_mmap__put(struct mmap *map) void perf_mmap__consume(struct mmap *map) { - if (!map->overwrite) { + if (!map->core.overwrite) { u64 old = map->core.prev; perf_mmap__write_tail(map, old); @@ -447,15 +447,15 @@ static int __perf_mmap__read_init(struct mmap *md) unsigned char *data = md->core.base + page_size; unsigned long size; - md->core.start = md->overwrite ? head : old; - md->core.end = md->overwrite ? old : head; + md->core.start = md->core.overwrite ? head : old; + md->core.end = md->core.overwrite ? old : head; if ((md->core.end - md->core.start) < md->flush) return -EAGAIN; size = md->core.end - md->core.start; if (size > (unsigned long)(md->core.mask) + 1) { - if (!md->overwrite) { + if (!md->core.overwrite) { WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); md->core.prev = head; diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index a3dd53f2bfb8..d3e74c8da51a 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -22,7 +22,6 @@ struct aiocb; */ struct mmap { struct perf_mmap core; - bool overwrite; struct auxtrace_mmap auxtrace_mmap; char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); #ifdef HAVE_AIO_SUPPORT From 4443e6d7704ee85412e5cb0a0181d7ceee7e984f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Jul 2019 22:47:58 +0200 Subject: [PATCH 1148/2880] libperf: Add 'event_copy' to 'struct perf_mmap' Move 'event_copy' from tools/perf's mmap to libperf's perf_mmap struct. Committer notes: Add linux/compiler.h as we need it for '__aligned'. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-18-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/mmap.h | 5 +++++ tools/perf/util/mmap.c | 4 ++-- tools/perf/util/mmap.h | 1 - 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index 47c09f375fb6..10653b6e864e 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -2,10 +2,14 @@ #ifndef __LIBPERF_INTERNAL_MMAP_H #define __LIBPERF_INTERNAL_MMAP_H +#include #include #include #include +/* perf sample has 16 bits size limit */ +#define PERF_SAMPLE_MAX_SIZE (1 << 16) + /** * struct perf_mmap - perf's ring buffer mmap details * @@ -21,6 +25,7 @@ struct perf_mmap { u64 start; u64 end; bool overwrite; + char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); }; #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index a8850ce2c2ff..4b8ec8dd79c5 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -51,7 +51,7 @@ static union perf_event *perf_mmap__read(struct mmap *map, if ((*startp & map->core.mask) + size != ((*startp + size) & map->core.mask)) { unsigned int offset = *startp; unsigned int len = min(sizeof(*event), size), cpy; - void *dst = map->event_copy; + void *dst = map->core.event_copy; do { cpy = min(map->core.mask + 1 - (offset & map->core.mask), len); @@ -61,7 +61,7 @@ static union perf_event *perf_mmap__read(struct mmap *map, len -= cpy; } while (len); - event = (union perf_event *)map->event_copy; + event = (union perf_event *)map->core.event_copy; } *startp += size; diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index d3e74c8da51a..75c77fa57121 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -23,7 +23,6 @@ struct aiocb; struct mmap { struct perf_mmap core; struct auxtrace_mmap auxtrace_mmap; - char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); #ifdef HAVE_AIO_SUPPORT struct { void **data; From 65aa2e6bae3658cbc84c2e628a5c0ca163686204 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 27 Aug 2019 16:05:18 +0200 Subject: [PATCH 1149/2880] libperf: Add 'flush' to 'struct perf_mmap' Move 'flush' from tools/perf's mmap to libperf's perf_mmap struct. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-19-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 10 +++++----- tools/perf/lib/include/internal/mmap.h | 1 + tools/perf/util/mmap.c | 4 ++-- tools/perf/util/mmap.h | 1 - 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 2520c0212275..06738efd9820 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -973,13 +973,13 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist, if (map->core.base) { record__adjust_affinity(rec, map); if (synch) { - flush = map->flush; - map->flush = 1; + flush = map->core.flush; + map->core.flush = 1; } if (!record__aio_enabled(rec)) { if (perf_mmap__push(map, rec, record__pushfn) < 0) { if (synch) - map->flush = flush; + map->core.flush = flush; rc = -1; goto out; } @@ -987,13 +987,13 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist, if (record__aio_push(rec, map, &off) < 0) { record__aio_set_pos(trace_fd, off); if (synch) - map->flush = flush; + map->core.flush = flush; rc = -1; goto out; } } if (synch) - map->flush = flush; + map->core.flush = flush; } if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode && diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index 10653b6e864e..ba1e519c15b9 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -25,6 +25,7 @@ struct perf_mmap { u64 start; u64 end; bool overwrite; + u64 flush; char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); }; diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 4b8ec8dd79c5..4cc3b54b2f73 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -383,7 +383,7 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) perf_mmap__setup_affinity_mask(map, mp); - map->flush = mp->flush; + map->core.flush = mp->flush; map->comp_level = mp->comp_level; @@ -450,7 +450,7 @@ static int __perf_mmap__read_init(struct mmap *md) md->core.start = md->core.overwrite ? head : old; md->core.end = md->core.overwrite ? old : head; - if ((md->core.end - md->core.start) < md->flush) + if ((md->core.end - md->core.start) < md->core.flush) return -EAGAIN; size = md->core.end - md->core.start; diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 75c77fa57121..e567c1c875bd 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -32,7 +32,6 @@ struct mmap { } aio; #endif cpu_set_t affinity_mask; - u64 flush; void *data; int comp_level; }; From 648b5af3f3ae7f4fad7395c8dc84cb79eafe2ba9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 6 Aug 2019 11:35:19 +0200 Subject: [PATCH 1150/2880] libperf: Move 'system_wide' from 'struct evsel' to 'struct perf_evsel' Move the 'system_wide 'member from perf's evsel to libperf's perf_evsel. Committer notes: Added stdbool.h as we now use bool here. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-20-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/intel-pt.c | 4 ++-- tools/perf/builtin-script.c | 2 +- tools/perf/builtin-stat.c | 4 ++-- tools/perf/lib/include/internal/evsel.h | 2 ++ tools/perf/tests/switch-tracking.c | 6 +++--- tools/perf/util/evlist.c | 10 +++++----- tools/perf/util/evsel.c | 8 ++++---- tools/perf/util/evsel.h | 1 - tools/perf/util/parse-events.c | 2 +- tools/perf/util/stat.c | 2 +- 10 files changed, 21 insertions(+), 20 deletions(-) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 34d7118bf390..1aa86a88884a 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -422,7 +422,7 @@ static int intel_pt_track_switches(struct evlist *evlist) perf_evsel__set_sample_bit(evsel, CPU); perf_evsel__set_sample_bit(evsel, TIME); - evsel->system_wide = true; + evsel->core.system_wide = true; evsel->no_aux_samples = true; evsel->immediate = true; @@ -723,7 +723,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, switch_evsel->core.attr.sample_period = 1; switch_evsel->core.attr.context_switch = 1; - switch_evsel->system_wide = true; + switch_evsel->core.system_wide = true; switch_evsel->no_aux_samples = true; switch_evsel->immediate = true; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index a17a9306bdf6..e7a49e2d7575 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1916,7 +1916,7 @@ static void __process_stat(struct evsel *counter, u64 tstamp) int cpu, thread; static int header_printed; - if (counter->system_wide) + if (counter->core.system_wide) nthreads = 1; if (!header_printed) { diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f7d13326b830..0d55eb6bd6e2 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -278,7 +278,7 @@ static int read_counter(struct evsel *counter, struct timespec *rs) if (!counter->supported) return -ENOENT; - if (counter->system_wide) + if (counter->core.system_wide) nthreads = 1; for (thread = 0; thread < nthreads; thread++) { @@ -1671,7 +1671,7 @@ static void setup_system_wide(int forks) struct evsel *counter; evlist__for_each_entry(evsel_list, counter) { - if (!counter->system_wide) + if (!counter->core.system_wide) return; } diff --git a/tools/perf/lib/include/internal/evsel.h b/tools/perf/lib/include/internal/evsel.h index 8b854d1c9b45..1bff789b0923 100644 --- a/tools/perf/lib/include/internal/evsel.h +++ b/tools/perf/lib/include/internal/evsel.h @@ -4,6 +4,7 @@ #include #include +#include struct perf_cpu_map; struct perf_thread_map; @@ -18,6 +19,7 @@ struct perf_evsel { /* parse modifier helper */ int nr_members; + bool system_wide; }; int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index e5c3f2ee223a..de700aad1fed 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -144,7 +144,7 @@ static int process_sample_event(struct evlist *evlist, return err; /* * Check for no missing sched_switch events i.e. that the - * evsel->system_wide flag has worked. + * evsel->core.system_wide flag has worked. */ if (switch_tracking->tids[cpu] != -1 && switch_tracking->tids[cpu] != prev_tid) { @@ -316,7 +316,7 @@ out_free_nodes: * * This function implements a test that checks that sched_switch events and * tracking events can be recorded for a workload (current process) using the - * evsel->system_wide and evsel->tracking flags (respectively) with other events + * evsel->core.system_wide and evsel->tracking flags (respectively) with other events * sometimes enabled or disabled. */ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_unused) @@ -396,7 +396,7 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ perf_evsel__set_sample_bit(switch_evsel, CPU); perf_evsel__set_sample_bit(switch_evsel, TIME); - switch_evsel->system_wide = true; + switch_evsel->core.system_wide = true; switch_evsel->no_aux_samples = true; switch_evsel->immediate = true; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 16d47a420bc2..16866533745c 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -319,7 +319,7 @@ int perf_evlist__add_newtp(struct evlist *evlist, static int perf_evlist__nr_threads(struct evlist *evlist, struct evsel *evsel) { - if (evsel->system_wide) + if (evsel->core.system_wide) return 1; else return perf_thread_map__nr(evlist->core.threads); @@ -410,7 +410,7 @@ int perf_evlist__alloc_pollfd(struct evlist *evlist) struct evsel *evsel; evlist__for_each_entry(evlist, evsel) { - if (evsel->system_wide) + if (evsel->core.system_wide) nfds += nr_cpus; else nfds += nr_cpus * nr_threads; @@ -536,7 +536,7 @@ static void perf_evlist__set_sid_idx(struct evlist *evlist, sid->cpu = evlist->core.cpus->map[cpu]; else sid->cpu = -1; - if (!evsel->system_wide && evlist->core.threads && thread >= 0) + if (!evsel->core.system_wide && evlist->core.threads && thread >= 0) sid->tid = perf_thread_map__pid(evlist->core.threads, thread); else sid->tid = -1; @@ -763,7 +763,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, mp->prot &= ~PROT_WRITE; } - if (evsel->system_wide && thread) + if (evsel->core.system_wide && thread) continue; cpu = perf_cpu_map__idx(evsel->core.cpus, evlist_cpu); @@ -793,7 +793,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, * other events, so it should not need to be polled anyway. * Therefore don't add it for polling. */ - if (!evsel->system_wide && + if (!evsel->core.system_wide && __perf_evlist__add_pollfd(evlist, fd, &maps[idx], revent) < 0) { perf_mmap__put(&maps[idx]); return -1; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 502bc3d50e0d..566c9413246c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1232,7 +1232,7 @@ int perf_evsel__alloc_id(struct evsel *evsel, int ncpus, int nthreads) if (ncpus == 0 || nthreads == 0) return 0; - if (evsel->system_wide) + if (evsel->core.system_wide) nthreads = 1; evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id)); @@ -1663,7 +1663,7 @@ static bool ignore_missing_thread(struct evsel *evsel, return false; /* The system wide setup does not work with threads. */ - if (evsel->system_wide) + if (evsel->core.system_wide) return false; /* The -ESRCH is perf event syscall errno for pid's not found. */ @@ -1772,7 +1772,7 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, threads = empty_thread_map; } - if (evsel->system_wide) + if (evsel->core.system_wide) nthreads = 1; else nthreads = threads->nr; @@ -1819,7 +1819,7 @@ retry_sample_id: for (thread = 0; thread < nthreads; thread++) { int fd, group_fd; - if (!evsel->cgrp && !evsel->system_wide) + if (!evsel->cgrp && !evsel->core.system_wide) pid = perf_thread_map__pid(threads, thread); group_fd = get_group_fd(evsel, cpu, thread); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 74df298acb31..f757ff449a17 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -146,7 +146,6 @@ struct evsel { bool disabled; bool no_aux_samples; bool immediate; - bool system_wide; bool tracking; bool per_pkg; bool precise_max; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9cf1371c90a3..d7aebe9b005d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -335,7 +335,7 @@ __add_event(struct list_head *list, int *idx, (*idx)++; evsel->core.cpus = perf_cpu_map__get(cpus); evsel->core.own_cpus = perf_cpu_map__get(cpus); - evsel->system_wide = pmu ? pmu->is_uncore : false; + evsel->core.system_wide = pmu ? pmu->is_uncore : false; evsel->auto_merge_stats = auto_merge_stats; if (name) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index fcd54342c04c..ebdd130557fb 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -336,7 +336,7 @@ static int process_counter_maps(struct perf_stat_config *config, int ncpus = perf_evsel__nr_cpus(counter); int cpu, thread; - if (counter->system_wide) + if (counter->core.system_wide) nthreads = 1; for (thread = 0; thread < nthreads; thread++) { From c976ee11a0e1b3ba5e63e734dbf4b19154e39fab Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Jul 2019 13:04:59 +0200 Subject: [PATCH 1151/2880] libperf: Move 'nr_mmaps' from 'struct evlist' to 'struct perf_evlist' Moving 'nr_mmaps' from 'struct evlist' to 'struct perf_evlist', it will be used in following patches. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-21-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 2 +- tools/perf/arch/arm64/util/arm-spe.c | 2 +- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/arch/x86/util/intel-bts.c | 2 +- tools/perf/arch/x86/util/intel-pt.c | 2 +- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-record.c | 6 +++--- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/lib/include/internal/evlist.h | 1 + tools/perf/tests/backward-ring-buffer.c | 2 +- tools/perf/tests/bpf.c | 2 +- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/openat-syscall-tp-fields.c | 2 +- tools/perf/tests/perf-record.c | 2 +- tools/perf/tests/switch-tracking.c | 2 +- tools/perf/util/evlist.c | 16 ++++++++-------- tools/perf/util/evlist.h | 1 - tools/perf/util/python.c | 2 +- 20 files changed, 28 insertions(+), 28 deletions(-) diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 5d120b1e35ed..051e9066fb38 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -648,7 +648,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, if (priv_size != cs_etm_info_priv_size(itr, session->evlist)) return -EINVAL; - if (!session->evlist->nr_mmaps) + if (!session->evlist->core.nr_mmaps) return -EINVAL; /* If the cpu_map is empty all online CPUs are involved */ diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index eebbf31f995c..9302c6566f53 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -51,7 +51,7 @@ static int arm_spe_info_fill(struct auxtrace_record *itr, if (priv_size != ARM_SPE_AUXTRACE_PRIV_SIZE) return -EINVAL; - if (!session->evlist->nr_mmaps) + if (!session->evlist->core.nr_mmaps) return -EINVAL; auxtrace_info->type = PERF_AUXTRACE_ARM_SPE; diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index bd404db94b3a..10b7acebc0eb 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -115,7 +115,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe evlist__disable(evlist); - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { md = &evlist->mmap[i]; if (perf_mmap__read_init(md) < 0) continue; diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 130925141369..e81535c8e9c5 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -75,7 +75,7 @@ static int intel_bts_info_fill(struct auxtrace_record *itr, if (priv_size != INTEL_BTS_AUXTRACE_PRIV_SIZE) return -EINVAL; - if (!session->evlist->nr_mmaps) + if (!session->evlist->core.nr_mmaps) return -EINVAL; pc = session->evlist->mmap[0].core.base; diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 1aa86a88884a..886b3ac60f23 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -352,7 +352,7 @@ static int intel_pt_info_fill(struct auxtrace_record *itr, filter = intel_pt_find_filter(session->evlist, ptr->intel_pt_pmu); filter_str_len = filter ? strlen(filter) : 0; - if (!session->evlist->nr_mmaps) + if (!session->evlist->core.nr_mmaps) return -EINVAL; pc = session->evlist->mmap[0].core.base; diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 30852848ed9c..e2b42efc86fa 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -802,7 +802,7 @@ static int perf_kvm__mmap_read(struct perf_kvm_stat *kvm) s64 n, ntotal = 0; u64 flush_time = ULLONG_MAX, mmap_time; - for (i = 0; i < kvm->evlist->nr_mmaps; i++) { + for (i = 0; i < kvm->evlist->core.nr_mmaps; i++) { n = perf_kvm__mmap_read_idx(kvm, i, &mmap_time); if (n < 0) return -1; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 06738efd9820..8577bf33a556 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -357,7 +357,7 @@ static void record__aio_mmap_read_sync(struct record *rec) if (!record__aio_enabled(rec)) return; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { struct mmap *map = &maps[i]; if (map->core.base) @@ -603,7 +603,7 @@ static int record__auxtrace_read_snapshot_all(struct record *rec) int i; int rc = 0; - for (i = 0; i < rec->evlist->nr_mmaps; i++) { + for (i = 0; i < rec->evlist->core.nr_mmaps; i++) { struct mmap *map = &rec->evlist->mmap[i]; if (!map->auxtrace_mmap.base) @@ -966,7 +966,7 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist, if (record__aio_enabled(rec)) off = record__aio_get_pos(trace_fd); - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { u64 flush = 0; struct mmap *map = &maps[i]; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index e637a08655db..474b9860cfd4 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -904,7 +904,7 @@ static void perf_top__mmap_read(struct perf_top *top) if (overwrite) perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING); - for (i = 0; i < top->evlist->nr_mmaps; i++) + for (i = 0; i < top->evlist->core.nr_mmaps; i++) perf_top__mmap_read_idx(top, i); if (overwrite) { diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index c44280358e58..91c73c7472ba 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3443,7 +3443,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) again: before = trace->nr_events; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { union perf_event *event; struct mmap *md; diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 448891f06e3e..035c1e1cc324 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -13,6 +13,7 @@ struct perf_evlist { bool has_user_cpus; struct perf_cpu_map *cpus; struct perf_thread_map *threads; + int nr_mmaps; }; /** diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index c59d3752d48d..338cd9faa835 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -33,7 +33,7 @@ static int count_samples(struct evlist *evlist, int *sample_count, { int i; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { struct mmap *map = &evlist->overwrite_mmap[i]; union perf_event *event; diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 3c8533fdbce5..1eb0bffaed6c 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -179,7 +179,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void), (*func)(); evlist__disable(evlist); - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { union perf_event *event; struct mmap *md; diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index bc6db3e7a1c5..7dac69a375f9 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -423,7 +423,7 @@ static int process_events(struct machine *machine, struct evlist *evlist, struct mmap *md; int i, ret; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { md = &evlist->mmap[i]; if (perf_mmap__read_init(md) < 0) continue; diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index c6030fdf7d4c..bd4ae8e5cd5d 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -36,7 +36,7 @@ static int find_comm(struct evlist *evlist, const char *comm) int i, found; found = 0; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { md = &evlist->mmap[i]; if (perf_mmap__read_init(md) < 0) continue; diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index e20eaadb1a35..4629fa33c8ad 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -87,7 +87,7 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest while (1) { int before = nr_events; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { union perf_event *event; struct mmap *md; diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index ea8bcaa13ea5..199a66444e60 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -165,7 +165,7 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus while (1) { int before = total_events; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { union perf_event *event; struct mmap *md; diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index de700aad1fed..30a70db6473d 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -267,7 +267,7 @@ static int process_events(struct evlist *evlist, struct mmap *md; int i, ret; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { md = &evlist->mmap[i]; if (perf_mmap__read_init(md) < 0) continue; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 16866533745c..d147834fbe60 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -651,7 +651,7 @@ static int perf_evlist__set_paused(struct evlist *evlist, bool value) if (!evlist->overwrite_mmap) return 0; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { int fd = evlist->overwrite_mmap[i].core.fd; int err; @@ -679,11 +679,11 @@ static void evlist__munmap_nofree(struct evlist *evlist) int i; if (evlist->mmap) - for (i = 0; i < evlist->nr_mmaps; i++) + for (i = 0; i < evlist->core.nr_mmaps; i++) perf_mmap__munmap(&evlist->mmap[i]); if (evlist->overwrite_mmap) - for (i = 0; i < evlist->nr_mmaps; i++) + for (i = 0; i < evlist->core.nr_mmaps; i++) perf_mmap__munmap(&evlist->overwrite_mmap[i]); } @@ -700,14 +700,14 @@ static struct mmap *evlist__alloc_mmap(struct evlist *evlist, int i; struct mmap *map; - evlist->nr_mmaps = perf_cpu_map__nr(evlist->core.cpus); + evlist->core.nr_mmaps = perf_cpu_map__nr(evlist->core.cpus); if (perf_cpu_map__empty(evlist->core.cpus)) - evlist->nr_mmaps = perf_thread_map__nr(evlist->core.threads); - map = zalloc(evlist->nr_mmaps * sizeof(struct mmap)); + evlist->core.nr_mmaps = perf_thread_map__nr(evlist->core.threads); + map = zalloc(evlist->core.nr_mmaps * sizeof(struct mmap)); if (!map) return NULL; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { map[i].core.fd = -1; map[i].core.overwrite = overwrite; /* @@ -1847,7 +1847,7 @@ static void *perf_evlist__poll_thread(void *arg) if (!draining) perf_evlist__poll(evlist, 1000); - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { struct mmap *map = &evlist->mmap[i]; union perf_event *event; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 8b9c35efea67..816b72a2b1e5 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -55,7 +55,6 @@ struct evlist { struct perf_evlist core; struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; int nr_groups; - int nr_mmaps; bool enabled; size_t mmap_len; int id_pos; diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index ba4085d7ae9f..62144b97e17b 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -988,7 +988,7 @@ static struct mmap *get_md(struct evlist *evlist, int cpu) { int i; - for (i = 0; i < evlist->nr_mmaps; i++) { + for (i = 0; i < evlist->core.nr_mmaps; i++) { struct mmap *md = &evlist->mmap[i]; if (md->core.cpu == cpu) From f6fa437577937265dacd4637608f4cf0a1a928dc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 6 Aug 2019 15:14:05 +0200 Subject: [PATCH 1152/2880] libperf: Move 'mmap_len' from 'struct evlist' to 'struct perf_evlist' Moving 'mmap_len' from 'struct evlist' to 'struct perf_evlist' it will be used in following patches. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-22-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- tools/perf/lib/include/internal/evlist.h | 1 + tools/perf/util/evlist.c | 10 +++++----- tools/perf/util/evlist.h | 1 - 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 8577bf33a556..94997144547d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1412,7 +1412,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) err = -1; goto out_child; } - session->header.env.comp_mmap_len = session->evlist->mmap_len; + session->header.env.comp_mmap_len = session->evlist->core.mmap_len; err = bpf__apply_obj_config(); if (err) { diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 035c1e1cc324..01b813616440 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -14,6 +14,7 @@ struct perf_evlist { struct perf_cpu_map *cpus; struct perf_thread_map *threads; int nr_mmaps; + size_t mmap_len; }; /** diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d147834fbe60..4d8cde099e10 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1012,11 +1012,11 @@ int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0) return -ENOMEM; - evlist->mmap_len = evlist__mmap_size(pages); - pr_debug("mmap size %zuB\n", evlist->mmap_len); - mp.mask = evlist->mmap_len - page_size - 1; + evlist->core.mmap_len = evlist__mmap_size(pages); + pr_debug("mmap size %zuB\n", evlist->core.mmap_len); + mp.mask = evlist->core.mmap_len - page_size - 1; - auxtrace_mmap_params__init(&mp.auxtrace_mp, evlist->mmap_len, + auxtrace_mmap_params__init(&mp.auxtrace_mp, evlist->core.mmap_len, auxtrace_pages, auxtrace_overwrite); evlist__for_each_entry(evlist, evsel) { @@ -1600,7 +1600,7 @@ out_default: int perf_evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size) { char sbuf[STRERR_BUFSIZE], *emsg = str_error_r(err, sbuf, sizeof(sbuf)); - int pages_attempted = evlist->mmap_len / 1024, pages_max_per_user, printed = 0; + int pages_attempted = evlist->core.mmap_len / 1024, pages_max_per_user, printed = 0; switch (err) { case EPERM: diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 816b72a2b1e5..765cee8bced1 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -56,7 +56,6 @@ struct evlist { struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; int nr_groups; bool enabled; - size_t mmap_len; int id_pos; int is_pos; u64 combined_sample_type; From 40cb2d5141bdd52b3c00bb78799118c4606947ac Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 6 Aug 2019 11:28:02 +0200 Subject: [PATCH 1153/2880] libperf: Move 'pollfd' from 'struct evlist' to 'struct perf_evlist' Moving 'pollfd' from 'struct evlist' to 'struct perf_evlist' it will be used in following patches. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-23-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kvm.c | 2 +- tools/perf/lib/include/internal/evlist.h | 2 ++ tools/perf/util/evlist.c | 18 +++++++++--------- tools/perf/util/evlist.h | 1 - tools/perf/util/python.c | 6 +++--- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index e2b42efc86fa..710a2898ed6c 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -981,7 +981,7 @@ static int kvm_events_live_report(struct perf_kvm_stat *kvm) evlist__enable(kvm->evlist); while (!done) { - struct fdarray *fda = &kvm->evlist->pollfd; + struct fdarray *fda = &kvm->evlist->core.pollfd; int rc; rc = perf_kvm__mmap_read(kvm); diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 01b813616440..8a4eb66fbf3a 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -3,6 +3,7 @@ #define __LIBPERF_INTERNAL_EVLIST_H #include +#include struct perf_cpu_map; struct perf_thread_map; @@ -15,6 +16,7 @@ struct perf_evlist { struct perf_thread_map *threads; int nr_mmaps; size_t mmap_len; + struct fdarray pollfd; }; /** diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4d8cde099e10..13595e8e6b4b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -61,7 +61,7 @@ void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus, INIT_HLIST_HEAD(&evlist->heads[i]); perf_evlist__init(&evlist->core); perf_evlist__set_maps(&evlist->core, cpus, threads); - fdarray__init(&evlist->pollfd, 64); + fdarray__init(&evlist->core.pollfd, 64); evlist->workload.pid = -1; evlist->bkw_mmap_state = BKW_MMAP_NOTREADY; } @@ -142,7 +142,7 @@ void evlist__exit(struct evlist *evlist) { zfree(&evlist->mmap); zfree(&evlist->overwrite_mmap); - fdarray__exit(&evlist->pollfd); + fdarray__exit(&evlist->core.pollfd); } void evlist__delete(struct evlist *evlist) @@ -416,8 +416,8 @@ int perf_evlist__alloc_pollfd(struct evlist *evlist) nfds += nr_cpus * nr_threads; } - if (fdarray__available_entries(&evlist->pollfd) < nfds && - fdarray__grow(&evlist->pollfd, nfds) < 0) + if (fdarray__available_entries(&evlist->core.pollfd) < nfds && + fdarray__grow(&evlist->core.pollfd, nfds) < 0) return -ENOMEM; return 0; @@ -426,13 +426,13 @@ int perf_evlist__alloc_pollfd(struct evlist *evlist) static int __perf_evlist__add_pollfd(struct evlist *evlist, int fd, struct mmap *map, short revent) { - int pos = fdarray__add(&evlist->pollfd, fd, revent | POLLERR | POLLHUP); + int pos = fdarray__add(&evlist->core.pollfd, fd, revent | POLLERR | POLLHUP); /* * Save the idx so that when we filter out fds POLLHUP'ed we can * close the associated evlist->mmap[] entry. */ if (pos >= 0) { - evlist->pollfd.priv[pos].ptr = map; + evlist->core.pollfd.priv[pos].ptr = map; fcntl(fd, F_SETFL, O_NONBLOCK); } @@ -456,13 +456,13 @@ static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd, int perf_evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask) { - return fdarray__filter(&evlist->pollfd, revents_and_mask, + return fdarray__filter(&evlist->core.pollfd, revents_and_mask, perf_evlist__munmap_filtered, NULL); } int perf_evlist__poll(struct evlist *evlist, int timeout) { - return fdarray__poll(&evlist->pollfd, timeout); + return fdarray__poll(&evlist->core.pollfd, timeout); } static void perf_evlist__id_hash(struct evlist *evlist, @@ -1009,7 +1009,7 @@ int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, if (!evlist->mmap) return -ENOMEM; - if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0) + if (evlist->core.pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0) return -ENOMEM; evlist->core.mmap_len = evlist__mmap_size(pages); diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 765cee8bced1..4fcdf93eb19c 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -64,7 +64,6 @@ struct evlist { int cork_fd; pid_t pid; } workload; - struct fdarray pollfd; struct mmap *mmap; struct mmap *overwrite_mmap; struct evsel *selected; diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 62144b97e17b..fcbafb1e8d9d 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -935,17 +935,17 @@ static PyObject *pyrf_evlist__get_pollfd(struct pyrf_evlist *pevlist, PyObject *list = PyList_New(0); int i; - for (i = 0; i < evlist->pollfd.nr; ++i) { + for (i = 0; i < evlist->core.pollfd.nr; ++i) { PyObject *file; #if PY_MAJOR_VERSION < 3 - FILE *fp = fdopen(evlist->pollfd.entries[i].fd, "r"); + FILE *fp = fdopen(evlist->core.pollfd.entries[i].fd, "r"); if (fp == NULL) goto free_list; file = PyFile_FromFile(fp, "perf", "r", NULL); #else - file = PyFile_FromFd(evlist->pollfd.entries[i].fd, "perf", "r", -1, + file = PyFile_FromFd(evlist->core.pollfd.entries[i].fd, "perf", "r", -1, NULL, NULL, NULL, 0); #endif if (file == NULL) From fee92b4442f135495d4fc59a0b9418490d4ac0ba Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 23 Sep 2019 15:10:35 -0300 Subject: [PATCH 1154/2880] libperf: Add missing 'struct xyarray' forward declaration We were getting it by luck, from files included before internal/evsel.h where it is being included. Fixes: 9dfcb7599084 ("libperf: Move fd array from perf's evsel to lobperf's perf_evsel class") Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lkml.kernel.org/n/tip-r8ukhxprpkflbd2k9vcc42v1@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/evsel.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/lib/include/internal/evsel.h b/tools/perf/lib/include/internal/evsel.h index 1bff789b0923..60daee6db914 100644 --- a/tools/perf/lib/include/internal/evsel.h +++ b/tools/perf/lib/include/internal/evsel.h @@ -8,6 +8,7 @@ struct perf_cpu_map; struct perf_thread_map; +struct xyarray; struct perf_evsel { struct list_head node; From 8cd36f3ef4926165bc5e5af6f7d7b45f0e14a1f4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Sep 2019 22:04:12 +0200 Subject: [PATCH 1155/2880] libperf: Move 'sample_id' from 'struct evsel' to 'struct perf_evsel' Move 'sample_id' array from 'struct evsel' to libperf's 'struct perf_evsel'. Committer notes: Removed the 'struct xyarray' from util/evsel.h, not needed anymore there. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-24-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 +- tools/perf/lib/include/internal/evsel.h | 1 + tools/perf/util/evlist.c | 4 ++-- tools/perf/util/evsel.c | 12 ++++++------ tools/perf/util/evsel.h | 2 -- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0d55eb6bd6e2..468fc49420ce 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -235,7 +235,7 @@ static int write_stat_round_event(u64 tm, u64 type) #define WRITE_STAT_ROUND_EVENT(time, interval) \ write_stat_round_event(time, PERF_STAT_ROUND_TYPE__ ## interval) -#define SID(e, x, y) xyarray__entry(e->sample_id, x, y) +#define SID(e, x, y) xyarray__entry(e->core.sample_id, x, y) static int perf_evsel__write_stat_event(struct evsel *counter, u32 cpu, u32 thread, diff --git a/tools/perf/lib/include/internal/evsel.h b/tools/perf/lib/include/internal/evsel.h index 60daee6db914..1ddb969f7807 100644 --- a/tools/perf/lib/include/internal/evsel.h +++ b/tools/perf/lib/include/internal/evsel.h @@ -17,6 +17,7 @@ struct perf_evsel { struct perf_cpu_map *own_cpus; struct perf_thread_map *threads; struct xyarray *fd; + struct xyarray *sample_id; /* parse modifier helper */ int nr_members; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 13595e8e6b4b..84b409802298 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -50,7 +50,7 @@ int sigqueue(pid_t pid, int sig, const union sigval value); #endif #define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y)) -#define SID(e, x, y) xyarray__entry(e->sample_id, x, y) +#define SID(e, x, y) xyarray__entry(e->core.sample_id, x, y) void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus, struct perf_thread_map *threads) @@ -1021,7 +1021,7 @@ int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, evlist__for_each_entry(evlist, evsel) { if ((evsel->core.attr.read_format & PERF_FORMAT_ID) && - evsel->sample_id == NULL && + evsel->core.sample_id == NULL && perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0) return -ENOMEM; } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 566c9413246c..cb1ada8cf4a4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1235,14 +1235,14 @@ int perf_evsel__alloc_id(struct evsel *evsel, int ncpus, int nthreads) if (evsel->core.system_wide) nthreads = 1; - evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id)); - if (evsel->sample_id == NULL) + evsel->core.sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id)); + if (evsel->core.sample_id == NULL) return -ENOMEM; evsel->id = zalloc(ncpus * nthreads * sizeof(u64)); if (evsel->id == NULL) { - xyarray__delete(evsel->sample_id); - evsel->sample_id = NULL; + xyarray__delete(evsel->core.sample_id); + evsel->core.sample_id = NULL; return -ENOMEM; } @@ -1251,8 +1251,8 @@ int perf_evsel__alloc_id(struct evsel *evsel, int ncpus, int nthreads) static void perf_evsel__free_id(struct evsel *evsel) { - xyarray__delete(evsel->sample_id); - evsel->sample_id = NULL; + xyarray__delete(evsel->core.sample_id); + evsel->core.sample_id = NULL; zfree(&evsel->id); evsel->ids = 0; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index f757ff449a17..2d2c6cad81f8 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -96,7 +96,6 @@ enum perf_tool_event { struct bpf_object; struct perf_counts; -struct xyarray; /** struct evsel - event selector * @@ -117,7 +116,6 @@ struct evsel { struct perf_evsel core; struct evlist *evlist; char *filter; - struct xyarray *sample_id; u64 *id; struct perf_counts *counts; struct perf_counts *prev_raw_counts; From deaf321913a7b1d440c5cd5c7766d47381c9b21b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Sep 2019 22:12:26 +0200 Subject: [PATCH 1156/2880] libperf: Move 'id' from 'struct evsel' to 'struct perf_evsel' Move the 'id' array from 'struct evsel' to libperf's 'struct perf_evsel'. Committer note: Fix the tools/perf/util/cs-etm.c build, i.e. aarch64's CoreSight. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-25-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/evsel.h | 1 + tools/perf/util/cs-etm.c | 2 +- tools/perf/util/evlist.c | 2 +- tools/perf/util/evsel.c | 6 +++--- tools/perf/util/evsel.h | 1 - tools/perf/util/header.c | 10 +++++----- tools/perf/util/intel-bts.c | 2 +- tools/perf/util/intel-pt.c | 8 ++++---- tools/perf/util/synthetic-events.c | 12 ++++++------ 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tools/perf/lib/include/internal/evsel.h b/tools/perf/lib/include/internal/evsel.h index 1ddb969f7807..e7171948c347 100644 --- a/tools/perf/lib/include/internal/evsel.h +++ b/tools/perf/lib/include/internal/evsel.h @@ -18,6 +18,7 @@ struct perf_evsel { struct perf_thread_map *threads; struct xyarray *fd; struct xyarray *sample_id; + u64 *id; /* parse modifier helper */ int nr_members; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 6021974577d5..4ba0f871f086 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1298,7 +1298,7 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, attr.read_format = evsel->core.attr.read_format; /* create new id val to be a fixed offset from evsel id */ - id = evsel->id[0] + 1000000000; + id = evsel->core.id[0] + 1000000000; if (!id) id = 1; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 84b409802298..24f03f245525 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -482,7 +482,7 @@ void perf_evlist__id_add(struct evlist *evlist, struct evsel *evsel, int cpu, int thread, u64 id) { perf_evlist__id_hash(evlist, evsel, cpu, thread, id); - evsel->id[evsel->ids++] = id; + evsel->core.id[evsel->ids++] = id; } int perf_evlist__id_add_fd(struct evlist *evlist, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index cb1ada8cf4a4..9c1b4f4a5fa3 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1239,8 +1239,8 @@ int perf_evsel__alloc_id(struct evsel *evsel, int ncpus, int nthreads) if (evsel->core.sample_id == NULL) return -ENOMEM; - evsel->id = zalloc(ncpus * nthreads * sizeof(u64)); - if (evsel->id == NULL) { + evsel->core.id = zalloc(ncpus * nthreads * sizeof(u64)); + if (evsel->core.id == NULL) { xyarray__delete(evsel->core.sample_id); evsel->core.sample_id = NULL; return -ENOMEM; @@ -1253,7 +1253,7 @@ static void perf_evsel__free_id(struct evsel *evsel) { xyarray__delete(evsel->core.sample_id); evsel->core.sample_id = NULL; - zfree(&evsel->id); + zfree(&evsel->core.id); evsel->ids = 0; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 2d2c6cad81f8..0d2aa933ceb3 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -116,7 +116,6 @@ struct evsel { struct perf_evsel core; struct evlist *evlist; char *filter; - u64 *id; struct perf_counts *counts; struct perf_counts *prev_raw_counts; int idx; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 0167f9697172..2b0681ab08aa 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -530,7 +530,7 @@ static int write_event_desc(struct feat_fd *ff, /* * write unique ids for this event */ - ret = do_write(ff, evsel->id, evsel->ids * sizeof(u64)); + ret = do_write(ff, evsel->core.id, evsel->ids * sizeof(u64)); if (ret < 0) return ret; } @@ -1590,7 +1590,7 @@ static void free_event_desc(struct evsel *events) for (evsel = events; evsel->core.attr.size; evsel++) { zfree(&evsel->name); - zfree(&evsel->id); + zfree(&evsel->core.id); } free(events); @@ -1657,7 +1657,7 @@ static struct evsel *read_event_desc(struct feat_fd *ff) if (!id) goto error; evsel->ids = nr; - evsel->id = id; + evsel->core.id = id; for (j = 0 ; j < nr; j++) { if (do_read_u64(ff, id)) @@ -1701,7 +1701,7 @@ static void print_event_desc(struct feat_fd *ff, FILE *fp) if (evsel->ids) { fprintf(fp, ", id = {"); - for (j = 0, id = evsel->id; j < evsel->ids; j++, id++) { + for (j = 0, id = evsel->core.id; j < evsel->ids; j++, id++) { if (j) fputc(',', fp); fprintf(fp, " %"PRIu64, *id); @@ -3068,7 +3068,7 @@ int perf_session__write_header(struct perf_session *session, evlist__for_each_entry(session->evlist, evsel) { evsel->id_offset = lseek(fd, 0, SEEK_CUR); - err = do_write(&ff, evsel->id, evsel->ids * sizeof(u64)); + err = do_write(&ff, evsel->core.id, evsel->ids * sizeof(u64)); if (err < 0) { pr_debug("failed to write perf header\n"); return err; diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 3888d4cd3ed1..c94360cd9c00 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -795,7 +795,7 @@ static int intel_bts_synth_events(struct intel_bts *bts, attr.sample_id_all = evsel->core.attr.sample_id_all; attr.read_format = evsel->core.attr.read_format; - id = evsel->id[0] + 1000000000; + id = evsel->core.id[0] + 1000000000; if (!id) id = 1; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index bcdc0359f7cf..24ca5d5908ca 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1705,7 +1705,7 @@ static int intel_pt_synth_pebs_sample(struct intel_pt_queue *ptq) struct intel_pt *pt = ptq->pt; struct evsel *evsel = pt->pebs_evsel; u64 sample_type = evsel->core.attr.sample_type; - u64 id = evsel->id[0]; + u64 id = evsel->core.id[0]; u8 cpumode; if (intel_pt_skip_event(pt)) @@ -2720,7 +2720,7 @@ static void intel_pt_set_event_name(struct evlist *evlist, u64 id, struct evsel *evsel; evlist__for_each_entry(evlist, evsel) { - if (evsel->id && evsel->id[0] == id) { + if (evsel->core.id && evsel->core.id[0] == id) { if (evsel->name) zfree(&evsel->name); evsel->name = strdup(name); @@ -2776,7 +2776,7 @@ static int intel_pt_synth_events(struct intel_pt *pt, attr.sample_id_all = evsel->core.attr.sample_id_all; attr.read_format = evsel->core.attr.read_format; - id = evsel->id[0] + 1000000000; + id = evsel->core.id[0] + 1000000000; if (!id) id = 1; @@ -2903,7 +2903,7 @@ static void intel_pt_setup_pebs_events(struct intel_pt *pt) return; evlist__for_each_entry(pt->session->evlist, evsel) { - if (evsel->core.attr.aux_output && evsel->id) { + if (evsel->core.attr.aux_output && evsel->core.id) { pt->sample_pebs = true; pt->pebs_evsel = evsel; return; diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 8322028a9a97..907ac3971959 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1442,7 +1442,7 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_ e = &ev->id_index.entries[i++]; - e->id = evsel->id[j]; + e->id = evsel->core.id[j]; sid = perf_evlist__id2sid(evlist, e->id); if (!sid) { @@ -1515,7 +1515,7 @@ int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evse struct perf_record_event_update *ev; int err; - ev = event_update_event__new(size + 1, PERF_EVENT_UPDATE__UNIT, evsel->id[0]); + ev = event_update_event__new(size + 1, PERF_EVENT_UPDATE__UNIT, evsel->core.id[0]); if (ev == NULL) return -ENOMEM; @@ -1532,7 +1532,7 @@ int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evs struct perf_record_event_update_scale *ev_data; int err; - ev = event_update_event__new(sizeof(*ev_data), PERF_EVENT_UPDATE__SCALE, evsel->id[0]); + ev = event_update_event__new(sizeof(*ev_data), PERF_EVENT_UPDATE__SCALE, evsel->core.id[0]); if (ev == NULL) return -ENOMEM; @@ -1550,7 +1550,7 @@ int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evse size_t len = strlen(evsel->name); int err; - ev = event_update_event__new(len + 1, PERF_EVENT_UPDATE__NAME, evsel->id[0]); + ev = event_update_event__new(len + 1, PERF_EVENT_UPDATE__NAME, evsel->core.id[0]); if (ev == NULL) return -ENOMEM; @@ -1578,7 +1578,7 @@ int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evse ev->header.type = PERF_RECORD_EVENT_UPDATE; ev->header.size = (u16)size; ev->type = PERF_EVENT_UPDATE__CPUS; - ev->id = evsel->id[0]; + ev->id = evsel->core.id[0]; cpu_map_data__synthesize((struct perf_record_cpu_map_data *)ev->data, evsel->core.own_cpus, type, max); @@ -1596,7 +1596,7 @@ int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, evlist__for_each_entry(evlist, evsel) { err = perf_event__synthesize_attr(tool, &evsel->core.attr, evsel->ids, - evsel->id, process); + evsel->core.id, process); if (err) { pr_debug("failed to create perf header attribute\n"); return err; From e7eb9002d4513ac4a26c756b72e6c25bf063baf2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Sep 2019 22:15:47 +0200 Subject: [PATCH 1157/2880] libperf: Move 'ids' from 'struct evsel' to 'struct perf_evsel' Move 'ids' from 'struct evsel' to libperf's 'struct perf_evsel'. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-26-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/evsel.h | 1 + tools/perf/util/evlist.c | 2 +- tools/perf/util/evsel.c | 2 +- tools/perf/util/evsel.h | 1 - tools/perf/util/header.c | 14 +++++++------- tools/perf/util/intel-bts.c | 2 +- tools/perf/util/intel-pt.c | 2 +- tools/perf/util/synthetic-events.c | 6 +++--- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tools/perf/lib/include/internal/evsel.h b/tools/perf/lib/include/internal/evsel.h index e7171948c347..385766f06591 100644 --- a/tools/perf/lib/include/internal/evsel.h +++ b/tools/perf/lib/include/internal/evsel.h @@ -19,6 +19,7 @@ struct perf_evsel { struct xyarray *fd; struct xyarray *sample_id; u64 *id; + u32 ids; /* parse modifier helper */ int nr_members; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 24f03f245525..5e43fb9da359 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -482,7 +482,7 @@ void perf_evlist__id_add(struct evlist *evlist, struct evsel *evsel, int cpu, int thread, u64 id) { perf_evlist__id_hash(evlist, evsel, cpu, thread, id); - evsel->core.id[evsel->ids++] = id; + evsel->core.id[evsel->core.ids++] = id; } int perf_evlist__id_add_fd(struct evlist *evlist, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9c1b4f4a5fa3..55638eb9299c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1254,7 +1254,7 @@ static void perf_evsel__free_id(struct evsel *evsel) xyarray__delete(evsel->core.sample_id); evsel->core.sample_id = NULL; zfree(&evsel->core.id); - evsel->ids = 0; + evsel->core.ids = 0; } static void perf_evsel__free_config_terms(struct evsel *evsel) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 0d2aa933ceb3..ed64395ec340 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -119,7 +119,6 @@ struct evsel { struct perf_counts *counts; struct perf_counts *prev_raw_counts; int idx; - u32 ids; unsigned long max_events; unsigned long nr_events_printed; char *name; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 2b0681ab08aa..498f6a825656 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -516,7 +516,7 @@ static int write_event_desc(struct feat_fd *ff, * copy into an nri to be independent of the * type of ids, */ - nri = evsel->ids; + nri = evsel->core.ids; ret = do_write(ff, &nri, sizeof(nri)); if (ret < 0) return ret; @@ -530,7 +530,7 @@ static int write_event_desc(struct feat_fd *ff, /* * write unique ids for this event */ - ret = do_write(ff, evsel->core.id, evsel->ids * sizeof(u64)); + ret = do_write(ff, evsel->core.id, evsel->core.ids * sizeof(u64)); if (ret < 0) return ret; } @@ -1656,7 +1656,7 @@ static struct evsel *read_event_desc(struct feat_fd *ff) id = calloc(nr, sizeof(*id)); if (!id) goto error; - evsel->ids = nr; + evsel->core.ids = nr; evsel->core.id = id; for (j = 0 ; j < nr; j++) { @@ -1699,9 +1699,9 @@ static void print_event_desc(struct feat_fd *ff, FILE *fp) for (evsel = events; evsel->core.attr.size; evsel++) { fprintf(fp, "# event : name = %s, ", evsel->name); - if (evsel->ids) { + if (evsel->core.ids) { fprintf(fp, ", id = {"); - for (j = 0, id = evsel->core.id; j < evsel->ids; j++, id++) { + for (j = 0, id = evsel->core.id; j < evsel->core.ids; j++, id++) { if (j) fputc(',', fp); fprintf(fp, " %"PRIu64, *id); @@ -3068,7 +3068,7 @@ int perf_session__write_header(struct perf_session *session, evlist__for_each_entry(session->evlist, evsel) { evsel->id_offset = lseek(fd, 0, SEEK_CUR); - err = do_write(&ff, evsel->core.id, evsel->ids * sizeof(u64)); + err = do_write(&ff, evsel->core.id, evsel->core.ids * sizeof(u64)); if (err < 0) { pr_debug("failed to write perf header\n"); return err; @@ -3082,7 +3082,7 @@ int perf_session__write_header(struct perf_session *session, .attr = evsel->core.attr, .ids = { .offset = evsel->id_offset, - .size = evsel->ids * sizeof(u64), + .size = evsel->core.ids * sizeof(u64), } }; err = do_write(&ff, &f_attr, sizeof(f_attr)); diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index c94360cd9c00..34cb380d19a3 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -768,7 +768,7 @@ static int intel_bts_synth_events(struct intel_bts *bts, int err; evlist__for_each_entry(evlist, evsel) { - if (evsel->core.attr.type == bts->pmu_type && evsel->ids) { + if (evsel->core.attr.type == bts->pmu_type && evsel->core.ids) { found = true; break; } diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 24ca5d5908ca..a1c9eb6d4f40 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -2735,7 +2735,7 @@ static struct evsel *intel_pt_evsel(struct intel_pt *pt, struct evsel *evsel; evlist__for_each_entry(evlist, evsel) { - if (evsel->core.attr.type == pt->pmu_type && evsel->ids) + if (evsel->core.attr.type == pt->pmu_type && evsel->core.ids) return evsel; } diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 907ac3971959..96ed008c2775 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1413,7 +1413,7 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_ sizeof(struct id_index_entry); evlist__for_each_entry(evlist, evsel) - nr += evsel->ids; + nr += evsel->core.ids; n = nr > max_nr ? max_nr : nr; sz = sizeof(struct perf_record_id_index) + n * sizeof(struct id_index_entry); @@ -1428,7 +1428,7 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_ evlist__for_each_entry(evlist, evsel) { u32 j; - for (j = 0; j < evsel->ids; j++) { + for (j = 0; j < evsel->core.ids; j++) { struct id_index_entry *e; struct perf_sample_id *sid; @@ -1595,7 +1595,7 @@ int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, int err = 0; evlist__for_each_entry(evlist, evsel) { - err = perf_event__synthesize_attr(tool, &evsel->core.attr, evsel->ids, + err = perf_event__synthesize_attr(tool, &evsel->core.attr, evsel->core.ids, evsel->core.id, process); if (err) { pr_debug("failed to create perf header attribute\n"); From 1d5af02d7a92acaa877ab0fbec0756114852720a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Sep 2019 22:20:12 +0200 Subject: [PATCH 1158/2880] libperf: Move 'heads' from 'struct evlist' to 'struct perf_evlist' Move 'heads' hash table from 'struct evlist' to 'struct perf_evlist'. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-27-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/evlist.c | 4 ++++ tools/perf/lib/include/internal/evlist.h | 4 ++++ tools/perf/util/evlist.c | 10 +++------- tools/perf/util/evlist.h | 4 ---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/lib/evlist.c b/tools/perf/lib/evlist.c index f4dc9a208332..2bdd9d2c79d9 100644 --- a/tools/perf/lib/evlist.c +++ b/tools/perf/lib/evlist.c @@ -11,6 +11,10 @@ void perf_evlist__init(struct perf_evlist *evlist) { + int i; + + for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i) + INIT_HLIST_HEAD(&evlist->heads[i]); INIT_LIST_HEAD(&evlist->entries); evlist->nr_entries = 0; } diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 8a4eb66fbf3a..c5a06890fd6a 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -5,6 +5,9 @@ #include #include +#define PERF_EVLIST__HLIST_BITS 8 +#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) + struct perf_cpu_map; struct perf_thread_map; @@ -17,6 +20,7 @@ struct perf_evlist { int nr_mmaps; size_t mmap_len; struct fdarray pollfd; + struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; }; /** diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 5e43fb9da359..c6af7c622612 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -55,10 +55,6 @@ int sigqueue(pid_t pid, int sig, const union sigval value); void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus, struct perf_thread_map *threads) { - int i; - - for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i) - INIT_HLIST_HEAD(&evlist->heads[i]); perf_evlist__init(&evlist->core); perf_evlist__set_maps(&evlist->core, cpus, threads); fdarray__init(&evlist->core.pollfd, 64); @@ -475,7 +471,7 @@ static void perf_evlist__id_hash(struct evlist *evlist, sid->id = id; sid->evsel = evsel; hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS); - hlist_add_head(&sid->node, &evlist->heads[hash]); + hlist_add_head(&sid->node, &evlist->core.heads[hash]); } void perf_evlist__id_add(struct evlist *evlist, struct evsel *evsel, @@ -549,7 +545,7 @@ struct perf_sample_id *perf_evlist__id2sid(struct evlist *evlist, u64 id) int hash; hash = hash_64(id, PERF_EVLIST__HLIST_BITS); - head = &evlist->heads[hash]; + head = &evlist->core.heads[hash]; hlist_for_each_entry(sid, head, node) if (sid->id == id) @@ -635,7 +631,7 @@ struct evsel *perf_evlist__event2evsel(struct evlist *evlist, return first; hash = hash_64(id, PERF_EVLIST__HLIST_BITS); - head = &evlist->heads[hash]; + head = &evlist->core.heads[hash]; hlist_for_each_entry(sid, head, node) { if (sid->id == id) diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 4fcdf93eb19c..ad9c0ba57a91 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -48,12 +48,8 @@ enum bkw_mmap_state { BKW_MMAP_EMPTY, }; -#define PERF_EVLIST__HLIST_BITS 8 -#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) - struct evlist { struct perf_evlist core; - struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; int nr_groups; bool enabled; int id_pos; From 70c20369ee95ef8b6887944194cfb74a5a8d1fe3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 3 Sep 2019 10:34:29 +0200 Subject: [PATCH 1159/2880] libperf: Add perf_evsel__alloc_id/perf_evsel__free_id functions Add perf_evsel__alloc_id()/perf_evsel__free_id() functions to libperf as internal functions. Move 'struct perf_sample_id' to internal/evsel.h header and change 'struct perf_sample_id::evsel' to 'struct perf_evsel' and the related code that touches it. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-28-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/evsel.c | 30 +++++++++++++++++++++ tools/perf/lib/include/internal/evsel.h | 27 +++++++++++++++++++ tools/perf/tests/event_update.c | 2 +- tools/perf/util/evlist.c | 10 +++---- tools/perf/util/evsel.c | 36 +++---------------------- tools/perf/util/evsel.h | 25 ----------------- tools/perf/util/header.c | 4 +-- tools/perf/util/session.c | 4 ++- 8 files changed, 71 insertions(+), 67 deletions(-) diff --git a/tools/perf/lib/evsel.c b/tools/perf/lib/evsel.c index 24abc80dd767..a8cb582e2721 100644 --- a/tools/perf/lib/evsel.c +++ b/tools/perf/lib/evsel.c @@ -230,3 +230,33 @@ struct perf_event_attr *perf_evsel__attr(struct perf_evsel *evsel) { return &evsel->attr; } + +int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads) +{ + if (ncpus == 0 || nthreads == 0) + return 0; + + if (evsel->system_wide) + nthreads = 1; + + evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id)); + if (evsel->sample_id == NULL) + return -ENOMEM; + + evsel->id = zalloc(ncpus * nthreads * sizeof(u64)); + if (evsel->id == NULL) { + xyarray__delete(evsel->sample_id); + evsel->sample_id = NULL; + return -ENOMEM; + } + + return 0; +} + +void perf_evsel__free_id(struct perf_evsel *evsel) +{ + xyarray__delete(evsel->sample_id); + evsel->sample_id = NULL; + zfree(&evsel->id); + evsel->ids = 0; +} diff --git a/tools/perf/lib/include/internal/evsel.h b/tools/perf/lib/include/internal/evsel.h index 385766f06591..a69b8299c36f 100644 --- a/tools/perf/lib/include/internal/evsel.h +++ b/tools/perf/lib/include/internal/evsel.h @@ -5,11 +5,35 @@ #include #include #include +#include struct perf_cpu_map; struct perf_thread_map; struct xyarray; +/* + * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there are + * more than one entry in the evlist. + */ +struct perf_sample_id { + struct hlist_node node; + u64 id; + struct perf_evsel *evsel; + /* + * 'idx' will be used for AUX area sampling. A sample will have AUX area + * data that will be queued for decoding, where there are separate + * queues for each CPU (per-cpu tracing) or task (per-thread tracing). + * The sample ID can be used to lookup 'idx' which is effectively the + * queue number. + */ + int idx; + int cpu; + pid_t tid; + + /* Holds total ID period value for PERF_SAMPLE_READ processing. */ + u64 period; +}; + struct perf_evsel { struct list_head node; struct perf_event_attr attr; @@ -32,4 +56,7 @@ void perf_evsel__free_fd(struct perf_evsel *evsel); int perf_evsel__read_size(struct perf_evsel *evsel); int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter); +int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads); +void perf_evsel__free_id(struct perf_evsel *evsel); + #endif /* __LIBPERF_INTERNAL_EVSEL_H */ diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 0497d900ced2..cf4f90170f90 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -95,7 +95,7 @@ int test__event_update(struct test *test __maybe_unused, int subtest __maybe_unu evsel = perf_evlist__first(evlist); TEST_ASSERT_VAL("failed to allocate ids", - !perf_evsel__alloc_id(evsel, 1, 1)); + !perf_evsel__alloc_id(&evsel->core, 1, 1)); perf_evlist__id_add(evlist, evsel, 0, 0, 123); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index c6af7c622612..559db38594a8 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -469,7 +469,7 @@ static void perf_evlist__id_hash(struct evlist *evlist, struct perf_sample_id *sid = SID(evsel, cpu, thread); sid->id = id; - sid->evsel = evsel; + sid->evsel = &evsel->core; hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS); hlist_add_head(&sid->node, &evlist->core.heads[hash]); } @@ -563,7 +563,7 @@ struct evsel *perf_evlist__id2evsel(struct evlist *evlist, u64 id) sid = perf_evlist__id2sid(evlist, id); if (sid) - return sid->evsel; + return container_of(sid->evsel, struct evsel, core); if (!perf_evlist__sample_id_all(evlist)) return perf_evlist__first(evlist); @@ -581,7 +581,7 @@ struct evsel *perf_evlist__id2evsel_strict(struct evlist *evlist, sid = perf_evlist__id2sid(evlist, id); if (sid) - return sid->evsel; + return container_of(sid->evsel, struct evsel, core); return NULL; } @@ -635,7 +635,7 @@ struct evsel *perf_evlist__event2evsel(struct evlist *evlist, hlist_for_each_entry(sid, head, node) { if (sid->id == id) - return sid->evsel; + return container_of(sid->evsel, struct evsel, core); } return NULL; } @@ -1018,7 +1018,7 @@ int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, evlist__for_each_entry(evlist, evsel) { if ((evsel->core.attr.read_format & PERF_FORMAT_ID) && evsel->core.sample_id == NULL && - perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0) + perf_evsel__alloc_id(&evsel->core, perf_cpu_map__nr(cpus), threads->nr) < 0) return -ENOMEM; } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 55638eb9299c..a4a492f11849 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1227,36 +1227,6 @@ int evsel__disable(struct evsel *evsel) return err; } -int perf_evsel__alloc_id(struct evsel *evsel, int ncpus, int nthreads) -{ - if (ncpus == 0 || nthreads == 0) - return 0; - - if (evsel->core.system_wide) - nthreads = 1; - - evsel->core.sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id)); - if (evsel->core.sample_id == NULL) - return -ENOMEM; - - evsel->core.id = zalloc(ncpus * nthreads * sizeof(u64)); - if (evsel->core.id == NULL) { - xyarray__delete(evsel->core.sample_id); - evsel->core.sample_id = NULL; - return -ENOMEM; - } - - return 0; -} - -static void perf_evsel__free_id(struct evsel *evsel) -{ - xyarray__delete(evsel->core.sample_id); - evsel->core.sample_id = NULL; - zfree(&evsel->core.id); - evsel->core.ids = 0; -} - static void perf_evsel__free_config_terms(struct evsel *evsel) { struct perf_evsel_config_term *term, *h; @@ -1273,7 +1243,7 @@ void perf_evsel__exit(struct evsel *evsel) assert(evsel->evlist == NULL); perf_evsel__free_counts(evsel); perf_evsel__free_fd(&evsel->core); - perf_evsel__free_id(evsel); + perf_evsel__free_id(&evsel->core); perf_evsel__free_config_terms(evsel); cgroup__put(evsel->cgrp); perf_cpu_map__put(evsel->core.cpus); @@ -1992,7 +1962,7 @@ out_close: void evsel__close(struct evsel *evsel) { perf_evsel__close(&evsel->core); - perf_evsel__free_id(evsel); + perf_evsel__free_id(&evsel->core); } int perf_evsel__open_per_cpu(struct evsel *evsel, @@ -2706,7 +2676,7 @@ int perf_evsel__store_ids(struct evsel *evsel, struct evlist *evlist) struct perf_cpu_map *cpus = evsel->core.cpus; struct perf_thread_map *threads = evsel->core.threads; - if (perf_evsel__alloc_id(evsel, cpus->nr, threads->nr)) + if (perf_evsel__alloc_id(&evsel->core, cpus->nr, threads->nr)) return -ENOMEM; return store_evsel_ids(evsel, evlist); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index ed64395ec340..a5b29ac10da0 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -17,29 +17,6 @@ struct addr_location; struct evsel; union perf_event; -/* - * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there are - * more than one entry in the evlist. - */ -struct perf_sample_id { - struct hlist_node node; - u64 id; - struct evsel *evsel; - /* - * 'idx' will be used for AUX area sampling. A sample will have AUX area - * data that will be queued for decoding, where there are separate - * queues for each CPU (per-cpu tracing) or task (per-thread tracing). - * The sample ID can be used to lookup 'idx' which is effectively the - * queue number. - */ - int idx; - int cpu; - pid_t tid; - - /* Holds total ID period value for PERF_SAMPLE_READ processing. */ - u64 period; -}; - struct cgroup; /* @@ -272,8 +249,6 @@ const char *perf_evsel__name(struct evsel *evsel); const char *perf_evsel__group_name(struct evsel *evsel); int perf_evsel__group_desc(struct evsel *evsel, char *buf, size_t size); -int perf_evsel__alloc_id(struct evsel *evsel, int ncpus, int nthreads); - void __perf_evsel__set_sample_bit(struct evsel *evsel, enum perf_event_sample_format bit); void __perf_evsel__reset_sample_bit(struct evsel *evsel, diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 498f6a825656..fc5eac41e102 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3609,7 +3609,7 @@ int perf_session__read_header(struct perf_session *session) * for allocating the perf_sample_id table we fake 1 cpu and * hattr->ids threads. */ - if (perf_evsel__alloc_id(evsel, 1, nr_ids)) + if (perf_evsel__alloc_id(&evsel->core, 1, nr_ids)) goto out_delete_evlist; lseek(fd, f_attr.ids.offset, SEEK_SET); @@ -3750,7 +3750,7 @@ int perf_event__process_attr(struct perf_tool *tool __maybe_unused, * for allocating the perf_sample_id table we fake 1 cpu and * hattr->ids threads. */ - if (perf_evsel__alloc_id(evsel, 1, n_ids)) + if (perf_evsel__alloc_id(&evsel->core, 1, n_ids)) return -ENOMEM; for (i = 0; i < n_ids; i++) { diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a621c73bad42..84a30ff3968a 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1324,6 +1324,7 @@ static int deliver_sample_value(struct evlist *evlist, struct machine *machine) { struct perf_sample_id *sid = perf_evlist__id2sid(evlist, v->id); + struct evsel *evsel; if (sid) { sample->id = v->id; @@ -1343,7 +1344,8 @@ static int deliver_sample_value(struct evlist *evlist, if (!sample->period) return 0; - return tool->sample(tool, event, sample, sid->evsel, machine); + evsel = container_of(sid->evsel, struct evsel, core); + return tool->sample(tool, event, sample, evsel, machine); } static int deliver_sample_group(struct evlist *evlist, From 515dbe48f6202147fb7c88aac48c43f49db1c793 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 3 Sep 2019 10:39:52 +0200 Subject: [PATCH 1160/2880] libperf: Add perf_evlist__first()/last() functions Add perf_evlist__first()/last() functions to libperf, as internal functions and rename perf's origins to evlist__first/last. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-29-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 2 +- tools/perf/arch/arm64/util/arm-spe.c | 2 +- tools/perf/arch/x86/tests/intel-cqm.c | 4 +- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/arch/x86/util/intel-bts.c | 2 +- tools/perf/arch/x86/util/intel-pt.c | 6 +- tools/perf/builtin-record.c | 4 +- tools/perf/builtin-script.c | 2 +- tools/perf/builtin-top.c | 10 +- tools/perf/builtin-trace.c | 2 +- tools/perf/lib/include/internal/evlist.h | 11 ++ tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/event-times.c | 14 +-- tools/perf/tests/event_update.c | 2 +- tools/perf/tests/evsel-roundtrip-name.c | 2 +- tools/perf/tests/hists_cumulate.c | 2 +- tools/perf/tests/hists_link.c | 4 +- tools/perf/tests/hists_output.c | 2 +- tools/perf/tests/keep-tracking.c | 4 +- tools/perf/tests/parse-events.c | 116 +++++++++---------- tools/perf/tests/perf-record.c | 2 +- tools/perf/tests/switch-tracking.c | 14 +-- tools/perf/tests/task-exit.c | 2 +- tools/perf/ui/browsers/hists.c | 6 +- tools/perf/util/bpf-loader.c | 2 +- tools/perf/util/evlist.c | 24 ++-- tools/perf/util/evlist.h | 13 ++- tools/perf/util/jitdump.c | 2 +- tools/perf/util/parse-events.c | 4 +- tools/perf/util/record.c | 6 +- tools/perf/util/sort.c | 2 +- tools/perf/util/top.c | 2 +- 32 files changed, 145 insertions(+), 129 deletions(-) diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 051e9066fb38..6654bcfc1224 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -416,7 +416,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, if (err) goto out; - tracking_evsel = perf_evlist__last(evlist); + tracking_evsel = evlist__last(evlist); perf_evlist__set_tracking_event(evlist, tracking_evsel); tracking_evsel->core.attr.freq = 0; diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 9302c6566f53..745f2d50ee82 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -129,7 +129,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, if (err) return err; - tracking_evsel = perf_evlist__last(evlist); + tracking_evsel = evlist__last(evlist); perf_evlist__set_tracking_event(evlist, tracking_evsel); tracking_evsel->core.attr.freq = 0; diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c index 111c0ab2e7b5..0329b9168fae 100644 --- a/tools/perf/arch/x86/tests/intel-cqm.c +++ b/tools/perf/arch/x86/tests/intel-cqm.c @@ -62,9 +62,9 @@ int test__intel_cqm_count_nmi_context(struct test *test __maybe_unused, int subt goto out; } - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); if (!evsel) { - pr_debug("perf_evlist__first failed\n"); + pr_debug("evlist__first failed\n"); goto out; } diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 10b7acebc0eb..fa947952c16a 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -83,7 +83,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe perf_evlist__config(evlist, &opts, NULL); - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); evsel->core.attr.comm = 1; evsel->core.attr.disabled = 1; diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index e81535c8e9c5..e2c7095327db 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -231,7 +231,7 @@ static int intel_bts_recording_options(struct auxtrace_record *itr, if (err) return err; - tracking_evsel = perf_evlist__last(evlist); + tracking_evsel = evlist__last(evlist); perf_evlist__set_tracking_event(evlist, tracking_evsel); diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 886b3ac60f23..84a65524c418 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -417,7 +417,7 @@ static int intel_pt_track_switches(struct evlist *evlist) return err; } - evsel = perf_evlist__last(evlist); + evsel = evlist__last(evlist); perf_evsel__set_sample_bit(evsel, CPU); perf_evsel__set_sample_bit(evsel, TIME); @@ -717,7 +717,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, if (err) return err; - switch_evsel = perf_evlist__last(evlist); + switch_evsel = evlist__last(evlist); switch_evsel->core.attr.freq = 0; switch_evsel->core.attr.sample_period = 1; @@ -775,7 +775,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, if (err) return err; - tracking_evsel = perf_evlist__last(evlist); + tracking_evsel = evlist__last(evlist); perf_evlist__set_tracking_event(evlist, tracking_evsel); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 94997144547d..48600c90cc7e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -756,9 +756,9 @@ static int record__open(struct record *rec) if (perf_evlist__add_dummy(evlist)) return -ENOMEM; - pos = perf_evlist__first(evlist); + pos = evlist__first(evlist); pos->tracking = 0; - pos = perf_evlist__last(evlist); + pos = evlist__last(evlist); pos->tracking = 1; pos->core.attr.enable_on_exec = 1; } diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index e7a49e2d7575..22c1d114014c 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2043,7 +2043,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, return err; evlist = *pevlist; - evsel = perf_evlist__last(*pevlist); + evsel = evlist__last(*pevlist); if (!evsel->priv) { if (scr->per_event_dump) { diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 474b9860cfd4..73bf79053ae3 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -531,7 +531,7 @@ static bool perf_top__handle_keypress(struct perf_top *top, int c) prompt_integer(&counter, "Enter details event counter"); if (counter >= top->evlist->core.nr_entries) { - top->sym_evsel = perf_evlist__first(top->evlist); + top->sym_evsel = evlist__first(top->evlist); fprintf(stderr, "Sorry, no such event, using %s.\n", perf_evsel__name(top->sym_evsel)); sleep(1); break; @@ -540,7 +540,7 @@ static bool perf_top__handle_keypress(struct perf_top *top, int c) if (top->sym_evsel->idx == counter) break; } else - top->sym_evsel = perf_evlist__first(top->evlist); + top->sym_evsel = evlist__first(top->evlist); break; case 'f': prompt_integer(&top->count_filter, "Enter display event count filter"); @@ -962,7 +962,7 @@ static int perf_top__overwrite_check(struct perf_top *top) /* has term for current event */ if ((overwrite < 0) && (set >= 0)) { /* if it's first event, set overwrite */ - if (evsel == perf_evlist__first(evlist)) + if (evsel == evlist__first(evlist)) overwrite = set; else return -1; @@ -986,7 +986,7 @@ static int perf_top_overwrite_fallback(struct perf_top *top, return 0; /* only fall back when first event fails */ - if (evsel != perf_evlist__first(evlist)) + if (evsel != evlist__first(evlist)) return 0; evlist__for_each_entry(evlist, counter) @@ -1644,7 +1644,7 @@ int cmd_top(int argc, const char **argv) goto out_delete_evlist; } - top.sym_evsel = perf_evlist__first(top.evlist); + top.sym_evsel = evlist__first(top.evlist); if (!callchain_param.enabled) { symbol_conf.cumulate_callchain = false; diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 91c73c7472ba..97667287f573 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3427,7 +3427,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 || evlist->core.threads->nr > 1 || - perf_evlist__first(evlist)->core.attr.inherit; + evlist__first(evlist)->core.attr.inherit; /* * Now that we already used evsel->core.attr to ask the kernel to setup the diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index c5a06890fd6a..16ae6d6cfb39 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -4,6 +4,7 @@ #include #include +#include #define PERF_EVLIST__HLIST_BITS 8 #define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) @@ -55,4 +56,14 @@ struct perf_evlist { #define perf_evlist__for_each_entry_reverse(evlist, evsel) \ __perf_evlist__for_each_entry_reverse(&(evlist)->entries, evsel) +static inline struct perf_evsel *perf_evlist__first(struct perf_evlist *evlist) +{ + return list_entry(evlist->entries.next, struct perf_evsel, node); +} + +static inline struct perf_evsel *perf_evlist__last(struct perf_evlist *evlist) +{ + return list_entry(evlist->entries.prev, struct perf_evsel, node); +} + #endif /* __LIBPERF_INTERNAL_EVLIST_H */ diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 7dac69a375f9..f5764a3890b9 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -652,7 +652,7 @@ static int do_test_code_reading(bool try_kcore) perf_evlist__config(evlist, &opts, NULL); - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); evsel->core.attr.comm = 1; evsel->core.attr.disabled = 1; diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c index 0228ba435a2a..1ee8704e2284 100644 --- a/tools/perf/tests/event-times.c +++ b/tools/perf/tests/event-times.c @@ -16,7 +16,7 @@ static int attach__enable_on_exec(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__last(evlist); + struct evsel *evsel = evlist__last(evlist); struct target target = { .uid = UINT_MAX, }; @@ -58,7 +58,7 @@ static int detach__enable_on_exec(struct evlist *evlist) static int attach__current_disabled(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__last(evlist); + struct evsel *evsel = evlist__last(evlist); struct perf_thread_map *threads; int err; @@ -84,7 +84,7 @@ static int attach__current_disabled(struct evlist *evlist) static int attach__current_enabled(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__last(evlist); + struct evsel *evsel = evlist__last(evlist); struct perf_thread_map *threads; int err; @@ -104,14 +104,14 @@ static int attach__current_enabled(struct evlist *evlist) static int detach__disable(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__last(evlist); + struct evsel *evsel = evlist__last(evlist); return evsel__enable(evsel); } static int attach__cpu_disabled(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__last(evlist); + struct evsel *evsel = evlist__last(evlist); struct perf_cpu_map *cpus; int err; @@ -140,7 +140,7 @@ static int attach__cpu_disabled(struct evlist *evlist) static int attach__cpu_enabled(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__last(evlist); + struct evsel *evsel = evlist__last(evlist); struct perf_cpu_map *cpus; int err; @@ -180,7 +180,7 @@ static int test_times(int (attach)(struct evlist *), goto out_err; } - evsel = perf_evlist__last(evlist); + evsel = evlist__last(evlist); evsel->core.attr.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index cf4f90170f90..cd6cae8e5137 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -92,7 +92,7 @@ int test__event_update(struct test *test __maybe_unused, int subtest __maybe_unu evlist = perf_evlist__new_default(); TEST_ASSERT_VAL("failed to get evlist", evlist); - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); TEST_ASSERT_VAL("failed to allocate ids", !perf_evsel__alloc_id(&evsel->core, 1, 1)); diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index 5330f106a6ee..956205bf9326 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -34,7 +34,7 @@ static int perf_evsel__roundtrip_cache_name_test(void) } idx = 0; - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index fa55b7bad3af..6367c8f6ca22 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -721,7 +721,7 @@ int test__hists_cumulate(struct test *test __maybe_unused, int subtest __maybe_u if (verbose > 1) machine__fprintf(machine, stderr); - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); for (i = 0; i < ARRAY_SIZE(testcases); i++) { err = testcases[i](evsel, machine); diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index 1a3bdc0a2d14..a024d3f3a412 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -311,8 +311,8 @@ int test__hists_link(struct test *test __maybe_unused, int subtest __maybe_unuse print_hists_in(hists); } - first = perf_evlist__first(evlist); - evsel = perf_evlist__last(evlist); + first = evlist__first(evlist); + evsel = evlist__last(evlist); first_hists = evsel__hists(first); hists = evsel__hists(evsel); diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index 3f6dfa212260..38f804ff6452 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -608,7 +608,7 @@ int test__hists_output(struct test *test __maybe_unused, int subtest __maybe_unu if (verbose > 1) machine__fprintf(machine, stderr); - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); for (i = 0; i < ARRAY_SIZE(testcases); i++) { err = testcases[i](evsel, machine); diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index bd4ae8e5cd5d..92c7d591bcac 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -93,7 +93,7 @@ int test__keep_tracking(struct test *test __maybe_unused, int subtest __maybe_un perf_evlist__config(evlist, &opts, NULL); - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); evsel->core.attr.comm = 1; evsel->core.attr.disabled = 1; @@ -132,7 +132,7 @@ int test__keep_tracking(struct test *test __maybe_unused, int subtest __maybe_un evlist__enable(evlist); - evsel = perf_evlist__last(evlist); + evsel = evlist__last(evlist); CHECK__(evsel__disable(evsel)); diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index c25c8e7b41e5..25e0ed2eedfc 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -46,7 +46,7 @@ static bool kvm_s390_create_vm_valid(void) static int test__checkevent_tracepoint(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong number of groups", 0 == evlist->nr_groups); @@ -77,7 +77,7 @@ static int test__checkevent_tracepoint_multi(struct evlist *evlist) static int test__checkevent_raw(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); @@ -87,7 +87,7 @@ static int test__checkevent_raw(struct evlist *evlist) static int test__checkevent_numeric(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", 1 == evsel->core.attr.type); @@ -97,7 +97,7 @@ static int test__checkevent_numeric(struct evlist *evlist) static int test__checkevent_symbolic_name(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); @@ -108,7 +108,7 @@ static int test__checkevent_symbolic_name(struct evlist *evlist) static int test__checkevent_symbolic_name_config(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); @@ -129,7 +129,7 @@ static int test__checkevent_symbolic_name_config(struct evlist *evlist) static int test__checkevent_symbolic_alias(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); @@ -140,7 +140,7 @@ static int test__checkevent_symbolic_alias(struct evlist *evlist) static int test__checkevent_genhw(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type); @@ -150,7 +150,7 @@ static int test__checkevent_genhw(struct evlist *evlist) static int test__checkevent_breakpoint(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); @@ -164,7 +164,7 @@ static int test__checkevent_breakpoint(struct evlist *evlist) static int test__checkevent_breakpoint_x(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); @@ -177,7 +177,7 @@ static int test__checkevent_breakpoint_x(struct evlist *evlist) static int test__checkevent_breakpoint_r(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", @@ -192,7 +192,7 @@ static int test__checkevent_breakpoint_r(struct evlist *evlist) static int test__checkevent_breakpoint_w(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", @@ -207,7 +207,7 @@ static int test__checkevent_breakpoint_w(struct evlist *evlist) static int test__checkevent_breakpoint_rw(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", @@ -222,7 +222,7 @@ static int test__checkevent_breakpoint_rw(struct evlist *evlist) static int test__checkevent_tracepoint_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); @@ -253,7 +253,7 @@ test__checkevent_tracepoint_multi_modifier(struct evlist *evlist) static int test__checkevent_raw_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); @@ -265,7 +265,7 @@ static int test__checkevent_raw_modifier(struct evlist *evlist) static int test__checkevent_numeric_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -277,7 +277,7 @@ static int test__checkevent_numeric_modifier(struct evlist *evlist) static int test__checkevent_symbolic_name_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -289,7 +289,7 @@ static int test__checkevent_symbolic_name_modifier(struct evlist *evlist) static int test__checkevent_exclude_host_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); @@ -299,7 +299,7 @@ static int test__checkevent_exclude_host_modifier(struct evlist *evlist) static int test__checkevent_exclude_guest_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); @@ -309,7 +309,7 @@ static int test__checkevent_exclude_guest_modifier(struct evlist *evlist) static int test__checkevent_symbolic_alias_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -321,7 +321,7 @@ static int test__checkevent_symbolic_alias_modifier(struct evlist *evlist) static int test__checkevent_genhw_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); @@ -333,7 +333,7 @@ static int test__checkevent_genhw_modifier(struct evlist *evlist) static int test__checkevent_exclude_idle_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude idle", evsel->core.attr.exclude_idle); TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); @@ -348,7 +348,7 @@ static int test__checkevent_exclude_idle_modifier(struct evlist *evlist) static int test__checkevent_exclude_idle_modifier_1(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude idle", evsel->core.attr.exclude_idle); TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); @@ -363,7 +363,7 @@ static int test__checkevent_exclude_idle_modifier_1(struct evlist *evlist) static int test__checkevent_breakpoint_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); @@ -378,7 +378,7 @@ static int test__checkevent_breakpoint_modifier(struct evlist *evlist) static int test__checkevent_breakpoint_x_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); @@ -392,7 +392,7 @@ static int test__checkevent_breakpoint_x_modifier(struct evlist *evlist) static int test__checkevent_breakpoint_r_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -406,7 +406,7 @@ static int test__checkevent_breakpoint_r_modifier(struct evlist *evlist) static int test__checkevent_breakpoint_w_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -420,7 +420,7 @@ static int test__checkevent_breakpoint_w_modifier(struct evlist *evlist) static int test__checkevent_breakpoint_rw_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); @@ -435,7 +435,7 @@ static int test__checkevent_breakpoint_rw_modifier(struct evlist *evlist) static int test__checkevent_pmu(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); @@ -453,7 +453,7 @@ static int test__checkevent_pmu(struct evlist *evlist) static int test__checkevent_list(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries); @@ -492,7 +492,7 @@ static int test__checkevent_list(struct evlist *evlist) static int test__checkevent_pmu_name(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); /* cpu/config=1,name=krava/u */ TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); @@ -513,7 +513,7 @@ static int test__checkevent_pmu_name(struct evlist *evlist) static int test__checkevent_pmu_partial_time_callgraph(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); /* cpu/config=1,call-graph=fp,time,period=100000/ */ TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); @@ -546,7 +546,7 @@ static int test__checkevent_pmu_partial_time_callgraph(struct evlist *evlist) static int test__checkevent_pmu_events(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); @@ -564,7 +564,7 @@ static int test__checkevent_pmu_events(struct evlist *evlist) static int test__checkevent_pmu_events_mix(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); /* pmu-event:u */ TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); @@ -642,7 +642,7 @@ static int test__group1(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups); /* instructions:k */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); @@ -684,7 +684,7 @@ static int test__group2(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups); /* faults + :ku modifier */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_SW_PAGE_FAULTS == evsel->core.attr.config); @@ -739,7 +739,7 @@ static int test__group3(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong number of groups", 2 == evlist->nr_groups); /* group1 syscalls:sys_enter_openat:H */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->core.attr.type); TEST_ASSERT_VAL("wrong sample_type", PERF_TP_SAMPLE_TYPE == evsel->core.attr.sample_type); @@ -831,7 +831,7 @@ static int test__group4(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups); /* cycles:u + p */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -875,7 +875,7 @@ static int test__group5(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong number of groups", 2 == evlist->nr_groups); /* cycles + G */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -961,7 +961,7 @@ static int test__group_gh1(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups); /* cycles + :H group modifier */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -1001,7 +1001,7 @@ static int test__group_gh2(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups); /* cycles + :G group modifier */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -1041,7 +1041,7 @@ static int test__group_gh3(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups); /* cycles:G + :u group modifier */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -1081,7 +1081,7 @@ static int test__group_gh4(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups); /* cycles:G + :uG group modifier */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -1120,7 +1120,7 @@ static int test__leader_sample1(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries); /* cycles - sampling group leader */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -1173,7 +1173,7 @@ static int test__leader_sample2(struct evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); /* instructions - sampling group leader */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); @@ -1207,7 +1207,7 @@ static int test__leader_sample2(struct evlist *evlist __maybe_unused) static int test__checkevent_pinned_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -1225,7 +1225,7 @@ static int test__pinned_group(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries); /* cycles - group leader */ - evsel = leader = perf_evlist__first(evlist); + evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); @@ -1251,7 +1251,7 @@ static int test__pinned_group(struct evlist *evlist) static int test__checkevent_breakpoint_len(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); @@ -1266,7 +1266,7 @@ static int test__checkevent_breakpoint_len(struct evlist *evlist) static int test__checkevent_breakpoint_len_w(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); @@ -1282,7 +1282,7 @@ static int test__checkevent_breakpoint_len_w(struct evlist *evlist) static int test__checkevent_breakpoint_len_rw_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -1294,7 +1294,7 @@ test__checkevent_breakpoint_len_rw_modifier(struct evlist *evlist) static int test__checkevent_precise_max_modifier(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); @@ -1305,7 +1305,7 @@ static int test__checkevent_precise_max_modifier(struct evlist *evlist) static int test__checkevent_config_symbol(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "insn") == 0); return 0; @@ -1313,7 +1313,7 @@ static int test__checkevent_config_symbol(struct evlist *evlist) static int test__checkevent_config_raw(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "rawpmu") == 0); return 0; @@ -1321,7 +1321,7 @@ static int test__checkevent_config_raw(struct evlist *evlist) static int test__checkevent_config_num(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "numpmu") == 0); return 0; @@ -1329,7 +1329,7 @@ static int test__checkevent_config_num(struct evlist *evlist) static int test__checkevent_config_cache(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "cachepmu") == 0); return 0; @@ -1342,7 +1342,7 @@ static bool test__intel_pt_valid(void) static int test__intel_pt(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "intel_pt//u") == 0); return 0; @@ -1350,7 +1350,7 @@ static int test__intel_pt(struct evlist *evlist) static int test__checkevent_complex_name(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong complex name parsing", strcmp(evsel->name, "COMPLEX_CYCLES_NAME:orig=cycles,desc=chip-clock-ticks") == 0); return 0; @@ -1358,7 +1358,7 @@ static int test__checkevent_complex_name(struct evlist *evlist) static int test__sym_event_slash(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); TEST_ASSERT_VAL("wrong config", evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES); @@ -1368,7 +1368,7 @@ static int test__sym_event_slash(struct evlist *evlist) static int test__sym_event_dc(struct evlist *evlist) { - struct evsel *evsel = perf_evlist__first(evlist); + struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); TEST_ASSERT_VAL("wrong config", evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES); diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 199a66444e60..401e8d11427b 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -104,7 +104,7 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus /* * Config the evsels, setting attr->comm on the first one, etc. */ - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); perf_evsel__set_sample_bit(evsel, CPU); perf_evsel__set_sample_bit(evsel, TID); perf_evsel__set_sample_bit(evsel, TIME); diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 30a70db6473d..ffa592e0020e 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -367,7 +367,7 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ goto out_err; } - cpu_clocks_evsel = perf_evlist__last(evlist); + cpu_clocks_evsel = evlist__last(evlist); /* Second event */ err = parse_events(evlist, "cycles:u", NULL); @@ -376,7 +376,7 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ goto out_err; } - cycles_evsel = perf_evlist__last(evlist); + cycles_evsel = evlist__last(evlist); /* Third event */ if (!perf_evlist__can_select_event(evlist, sched_switch)) { @@ -391,7 +391,7 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ goto out_err; } - switch_evsel = perf_evlist__last(evlist); + switch_evsel = evlist__last(evlist); perf_evsel__set_sample_bit(switch_evsel, CPU); perf_evsel__set_sample_bit(switch_evsel, TIME); @@ -401,12 +401,12 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ switch_evsel->immediate = true; /* Test moving an event to the front */ - if (cycles_evsel == perf_evlist__first(evlist)) { + if (cycles_evsel == evlist__first(evlist)) { pr_debug("cycles event already at front"); goto out_err; } perf_evlist__to_front(evlist, cycles_evsel); - if (cycles_evsel != perf_evlist__first(evlist)) { + if (cycles_evsel != evlist__first(evlist)) { pr_debug("Failed to move cycles event to front"); goto out_err; } @@ -421,7 +421,7 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ goto out_err; } - tracking_evsel = perf_evlist__last(evlist); + tracking_evsel = evlist__last(evlist); perf_evlist__set_tracking_event(evlist, tracking_evsel); @@ -434,7 +434,7 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ perf_evlist__config(evlist, &opts, NULL); /* Check moved event is still at the front */ - if (cycles_evsel != perf_evlist__first(evlist)) { + if (cycles_evsel != evlist__first(evlist)) { pr_debug("Front event no longer at front"); goto out_err; } diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index 7fc39af48a76..24565f83e07d 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -88,7 +88,7 @@ int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused goto out_delete_evlist; } - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); evsel->core.attr.task = 1; #ifdef __s390x__ evsel->core.attr.sample_freq = 1000000; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 589168ca9f62..7a7187e069b4 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -3319,13 +3319,13 @@ browse_hists: switch (key) { case K_TAB: if (pos->core.node.next == &evlist->core.entries) - pos = perf_evlist__first(evlist); + pos = evlist__first(evlist); else pos = perf_evsel__next(pos); goto browse_hists; case K_UNTAB: if (pos->core.node.prev == &evlist->core.entries) - pos = perf_evlist__last(evlist); + pos = evlist__last(evlist); else pos = perf_evsel__prev(pos); goto browse_hists; @@ -3417,7 +3417,7 @@ int perf_evlist__tui_browse_hists(struct evlist *evlist, const char *help, single_entry: if (nr_entries == 1) { - struct evsel *first = perf_evlist__first(evlist); + struct evsel *first = evlist__first(evlist); return perf_evsel__hists_browse(first, nr_entries, help, false, hbt, min_pcnt, diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 37283e865352..10c187b8b8ea 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -1568,7 +1568,7 @@ struct evsel *bpf__setup_output_event(struct evlist *evlist, const char *name) return ERR_PTR(-err); } - evsel = perf_evlist__last(evlist); + evsel = evlist__last(evlist); } bpf__for_each_map_named(map, obj, tmp, name) { diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 559db38594a8..e8f0357b1532 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -105,7 +105,7 @@ struct evlist *perf_evlist__new_dummy(void) */ void perf_evlist__set_id_pos(struct evlist *evlist) { - struct evsel *first = perf_evlist__first(evlist); + struct evsel *first = evlist__first(evlist); evlist->id_pos = first->id_pos; evlist->is_pos = first->is_pos; @@ -559,14 +559,14 @@ struct evsel *perf_evlist__id2evsel(struct evlist *evlist, u64 id) struct perf_sample_id *sid; if (evlist->core.nr_entries == 1 || !id) - return perf_evlist__first(evlist); + return evlist__first(evlist); sid = perf_evlist__id2sid(evlist, id); if (sid) return container_of(sid->evsel, struct evsel, core); if (!perf_evlist__sample_id_all(evlist)) - return perf_evlist__first(evlist); + return evlist__first(evlist); return NULL; } @@ -610,7 +610,7 @@ static int perf_evlist__event2id(struct evlist *evlist, struct evsel *perf_evlist__event2evsel(struct evlist *evlist, union perf_event *event) { - struct evsel *first = perf_evlist__first(evlist); + struct evsel *first = evlist__first(evlist); struct hlist_head *head; struct perf_sample_id *sid; int hash; @@ -1222,7 +1222,7 @@ u64 perf_evlist__combined_branch_type(struct evlist *evlist) bool perf_evlist__valid_read_format(struct evlist *evlist) { - struct evsel *first = perf_evlist__first(evlist), *pos = first; + struct evsel *first = evlist__first(evlist), *pos = first; u64 read_format = first->core.attr.read_format; u64 sample_type = first->core.attr.sample_type; @@ -1242,13 +1242,13 @@ bool perf_evlist__valid_read_format(struct evlist *evlist) u64 perf_evlist__read_format(struct evlist *evlist) { - struct evsel *first = perf_evlist__first(evlist); + struct evsel *first = evlist__first(evlist); return first->core.attr.read_format; } u16 perf_evlist__id_hdr_size(struct evlist *evlist) { - struct evsel *first = perf_evlist__first(evlist); + struct evsel *first = evlist__first(evlist); struct perf_sample *data; u64 sample_type; u16 size = 0; @@ -1281,7 +1281,7 @@ out: bool perf_evlist__valid_sample_id_all(struct evlist *evlist) { - struct evsel *first = perf_evlist__first(evlist), *pos = first; + struct evsel *first = evlist__first(evlist), *pos = first; evlist__for_each_entry_continue(evlist, pos) { if (first->core.attr.sample_id_all != pos->core.attr.sample_id_all) @@ -1293,7 +1293,7 @@ bool perf_evlist__valid_sample_id_all(struct evlist *evlist) bool perf_evlist__sample_id_all(struct evlist *evlist) { - struct evsel *first = perf_evlist__first(evlist); + struct evsel *first = evlist__first(evlist); return first->core.attr.sample_id_all; } @@ -1568,7 +1568,7 @@ int perf_evlist__strerror_open(struct evlist *evlist, "Hint:\tThe current value is %d.", value); break; case EINVAL: { - struct evsel *first = perf_evlist__first(evlist); + struct evsel *first = evlist__first(evlist); int max_freq; if (sysctl__read_int("kernel/perf_event_max_sample_rate", &max_freq) < 0) @@ -1630,7 +1630,7 @@ void perf_evlist__to_front(struct evlist *evlist, struct evsel *evsel, *n; LIST_HEAD(move); - if (move_evsel == perf_evlist__first(evlist)) + if (move_evsel == evlist__first(evlist)) return; evlist__for_each_entry_safe(evlist, n, evsel) { @@ -1751,7 +1751,7 @@ bool perf_evlist__exclude_kernel(struct evlist *evlist) void perf_evlist__force_leader(struct evlist *evlist) { if (!evlist->nr_groups) { - struct evsel *leader = perf_evlist__first(evlist); + struct evsel *leader = evlist__first(evlist); perf_evlist__set_leader(evlist); leader->forced_leader = true; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index ad9c0ba57a91..6529ad2a9d97 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "events_stats.h" #include "evsel.h" #include @@ -242,14 +243,18 @@ static inline bool perf_evlist__empty(struct evlist *evlist) return list_empty(&evlist->core.entries); } -static inline struct evsel *perf_evlist__first(struct evlist *evlist) +static inline struct evsel *evlist__first(struct evlist *evlist) { - return list_entry(evlist->core.entries.next, struct evsel, core.node); + struct perf_evsel *evsel = perf_evlist__first(&evlist->core); + + return container_of(evsel, struct evsel, core); } -static inline struct evsel *perf_evlist__last(struct evlist *evlist) +static inline struct evsel *evlist__last(struct evlist *evlist) { - return list_entry(evlist->core.entries.prev, struct evsel, core.node); + struct perf_evsel *evsel = perf_evlist__last(&evlist->core); + + return container_of(evsel, struct evsel, core); } size_t perf_evlist__fprintf(struct evlist *evlist, FILE *fp); diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 9f9c6c6a2fd3..1bdf4c6ea3e5 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -777,7 +777,7 @@ jit_process(struct perf_session *session, * track sample_type to compute id_all layout * perf sets the same sample type to all events as of now */ - first = perf_evlist__first(session->evlist); + first = evlist__first(session->evlist); jd.sample_type = first->core.attr.sample_type; *nbytes = 0; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index d7aebe9b005d..d69ff746cda5 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1936,7 +1936,7 @@ int parse_events(struct evlist *evlist, const char *str, perf_evlist__splice_list_tail(evlist, &parse_state.list); evlist->nr_groups += parse_state.nr_groups; - last = perf_evlist__last(evlist); + last = evlist__last(evlist); last->cmdline_group_boundary = true; return 0; @@ -2050,7 +2050,7 @@ foreach_evsel_in_last_glob(struct evlist *evlist, * So no need to WARN here, let *func do this. */ if (evlist->core.nr_entries > 0) - last = perf_evlist__last(evlist); + last = evlist__last(evlist); do { err = (*func)(last, arg); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 8a015fc0aba0..8579505c29a4 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -30,7 +30,7 @@ static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str) if (parse_events(evlist, str, NULL)) goto out_delete; - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); while (1) { fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags); @@ -171,7 +171,7 @@ void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, use_sample_identifier = perf_can_sample_identifier(); sample_id = true; } else if (evlist->core.nr_entries > 1) { - struct evsel *first = perf_evlist__first(evlist); + struct evsel *first = evlist__first(evlist); evlist__for_each_entry(evlist, evsel) { if (evsel->core.attr.sample_type == first->core.attr.sample_type) @@ -276,7 +276,7 @@ bool perf_evlist__can_select_event(struct evlist *evlist, const char *str) if (err) goto out_delete; - evsel = perf_evlist__last(temp_evlist); + evsel = evlist__last(temp_evlist); if (!evlist || perf_cpu_map__empty(evlist->core.cpus)) { struct perf_cpu_map *cpus = perf_cpu_map__new(NULL); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index a2308eb77681..43d1d410854a 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2329,7 +2329,7 @@ static struct evsel *find_evsel(struct evlist *evlist, char *event_name) if (nr > evlist->core.nr_entries) return NULL; - evsel = perf_evlist__first(evlist); + evsel = evlist__first(evlist); while (--nr > 0) evsel = perf_evsel__next(evsel); diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index ef96e3dd6902..3dce2de9d005 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -71,7 +71,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) } if (top->evlist->core.nr_entries == 1) { - struct evsel *first = perf_evlist__first(top->evlist); + struct evsel *first = evlist__first(top->evlist); ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ", (uint64_t)first->core.attr.sample_period, opts->freq ? "Hz" : ""); From ff47d86a0d9bf618b185b49cb4bb9c6f957bb445 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 3 Sep 2019 10:54:48 +0200 Subject: [PATCH 1161/2880] libperf: Add perf_evlist__read_format() function Add the perf_evlist__read_format() function to libperf as internal function. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-30-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/evlist.c | 7 +++++++ tools/perf/lib/include/internal/evlist.h | 2 ++ tools/perf/util/evlist.c | 8 +------- tools/perf/util/evlist.h | 1 - 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tools/perf/lib/evlist.c b/tools/perf/lib/evlist.c index 2bdd9d2c79d9..ae861bf38976 100644 --- a/tools/perf/lib/evlist.c +++ b/tools/perf/lib/evlist.c @@ -161,3 +161,10 @@ void perf_evlist__disable(struct perf_evlist *evlist) perf_evlist__for_each_entry(evlist, evsel) perf_evsel__disable(evsel); } + +u64 perf_evlist__read_format(struct perf_evlist *evlist) +{ + struct perf_evsel *first = perf_evlist__first(evlist); + + return first->attr.read_format; +} diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 16ae6d6cfb39..63516fe14f4c 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -66,4 +66,6 @@ static inline struct perf_evsel *perf_evlist__last(struct perf_evlist *evlist) return list_entry(evlist->entries.prev, struct perf_evsel, node); } +u64 perf_evlist__read_format(struct perf_evlist *evlist); + #endif /* __LIBPERF_INTERNAL_EVLIST_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index e8f0357b1532..bc788192493f 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -503,7 +503,7 @@ int perf_evlist__id_add_fd(struct evlist *evlist, * This way does not work with group format read, so bail * out in that case. */ - if (perf_evlist__read_format(evlist) & PERF_FORMAT_GROUP) + if (perf_evlist__read_format(&evlist->core) & PERF_FORMAT_GROUP) return -1; if (!(evsel->core.attr.read_format & PERF_FORMAT_ID) || @@ -1240,12 +1240,6 @@ bool perf_evlist__valid_read_format(struct evlist *evlist) return true; } -u64 perf_evlist__read_format(struct evlist *evlist) -{ - struct evsel *first = evlist__first(evlist); - return first->core.attr.read_format; -} - u16 perf_evlist__id_hdr_size(struct evlist *evlist) { struct evsel *first = evlist__first(evlist); diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 6529ad2a9d97..3f89d9913a30 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -217,7 +217,6 @@ int perf_evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel); void __perf_evlist__set_leader(struct list_head *list); void perf_evlist__set_leader(struct evlist *evlist); -u64 perf_evlist__read_format(struct evlist *evlist); u64 __perf_evlist__combined_sample_type(struct evlist *evlist); u64 perf_evlist__combined_sample_type(struct evlist *evlist); u64 perf_evlist__combined_branch_type(struct evlist *evlist); From b0031c22819ab606a0cb648c0f0a7d80db3c3a89 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 3 Sep 2019 11:01:04 +0200 Subject: [PATCH 1162/2880] libperf: Add perf_evlist__id_add() function Add the perf_evlist__id_add() function to libperf as an internal function. We already have the 'heads' member in 'struct perf_evlist'. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-31-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/evlist.c | 26 ++++++++++++++++++++++++ tools/perf/lib/include/internal/evlist.h | 4 ++++ tools/perf/tests/event_update.c | 2 +- tools/perf/util/evlist.c | 22 +------------------- tools/perf/util/evlist.h | 2 -- tools/perf/util/header.c | 4 ++-- 6 files changed, 34 insertions(+), 26 deletions(-) diff --git a/tools/perf/lib/evlist.c b/tools/perf/lib/evlist.c index ae861bf38976..a29ee8a746d9 100644 --- a/tools/perf/lib/evlist.c +++ b/tools/perf/lib/evlist.c @@ -1,9 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include +#include #include #include +#include #include #include #include @@ -168,3 +171,26 @@ u64 perf_evlist__read_format(struct perf_evlist *evlist) return first->attr.read_format; } + +#define SID(e, x, y) xyarray__entry(e->sample_id, x, y) + +static void perf_evlist__id_hash(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, u64 id) +{ + int hash; + struct perf_sample_id *sid = SID(evsel, cpu, thread); + + sid->id = id; + sid->evsel = evsel; + hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS); + hlist_add_head(&sid->node, &evlist->heads[hash]); +} + +void perf_evlist__id_add(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, u64 id) +{ + perf_evlist__id_hash(evlist, evsel, cpu, thread, id); + evsel->id[evsel->ids++] = id; +} diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 63516fe14f4c..649406f717bc 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -68,4 +68,8 @@ static inline struct perf_evsel *perf_evlist__last(struct perf_evlist *evlist) u64 perf_evlist__read_format(struct perf_evlist *evlist); +void perf_evlist__id_add(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, u64 id); + #endif /* __LIBPERF_INTERNAL_EVLIST_H */ diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index cd6cae8e5137..c727379cf20e 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -97,7 +97,7 @@ int test__event_update(struct test *test __maybe_unused, int subtest __maybe_unu TEST_ASSERT_VAL("failed to allocate ids", !perf_evsel__alloc_id(&evsel->core, 1, 1)); - perf_evlist__id_add(evlist, evsel, 0, 0, 123); + perf_evlist__id_add(&evlist->core, &evsel->core, 0, 0, 123); evsel->unit = strdup("KRAVA"); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index bc788192493f..f2863b4c61d7 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -461,26 +461,6 @@ int perf_evlist__poll(struct evlist *evlist, int timeout) return fdarray__poll(&evlist->core.pollfd, timeout); } -static void perf_evlist__id_hash(struct evlist *evlist, - struct evsel *evsel, - int cpu, int thread, u64 id) -{ - int hash; - struct perf_sample_id *sid = SID(evsel, cpu, thread); - - sid->id = id; - sid->evsel = &evsel->core; - hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS); - hlist_add_head(&sid->node, &evlist->core.heads[hash]); -} - -void perf_evlist__id_add(struct evlist *evlist, struct evsel *evsel, - int cpu, int thread, u64 id) -{ - perf_evlist__id_hash(evlist, evsel, cpu, thread, id); - evsel->core.id[evsel->core.ids++] = id; -} - int perf_evlist__id_add_fd(struct evlist *evlist, struct evsel *evsel, int cpu, int thread, int fd) @@ -518,7 +498,7 @@ int perf_evlist__id_add_fd(struct evlist *evlist, id = read_data[id_idx]; add: - perf_evlist__id_add(evlist, evsel, cpu, thread, id); + perf_evlist__id_add(&evlist->core, &evsel->core, cpu, thread, id); return 0; } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 3f89d9913a30..eb35b4b1d86f 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -141,8 +141,6 @@ struct evsel * perf_evlist__find_tracepoint_by_name(struct evlist *evlist, const char *name); -void perf_evlist__id_add(struct evlist *evlist, struct evsel *evsel, - int cpu, int thread, u64 id); int perf_evlist__id_add_fd(struct evlist *evlist, struct evsel *evsel, int cpu, int thread, int fd); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index fc5eac41e102..02602bc8cabf 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3618,7 +3618,7 @@ int perf_session__read_header(struct perf_session *session) if (perf_header__getbuffer64(header, fd, &f_id, sizeof(f_id))) goto out_errno; - perf_evlist__id_add(session->evlist, evsel, 0, j, f_id); + perf_evlist__id_add(&session->evlist->core, &evsel->core, 0, j, f_id); } lseek(fd, tmp, SEEK_SET); @@ -3754,7 +3754,7 @@ int perf_event__process_attr(struct perf_tool *tool __maybe_unused, return -ENOMEM; for (i = 0; i < n_ids; i++) { - perf_evlist__id_add(evlist, evsel, 0, i, event->attr.id[i]); + perf_evlist__id_add(&evlist->core, &evsel->core, 0, i, event->attr.id[i]); } return 0; From d5a99483dece17dbde01968a7ffc03b7f575dc11 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 3 Sep 2019 11:19:56 +0200 Subject: [PATCH 1163/2880] libperf: Add perf_evlist__id_add_fd() function Add the perf_evlist__id_add_fd() function to libperf as an internal function. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-32-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/evlist.c | 44 ++++++++++++++++++++++++ tools/perf/lib/include/internal/evlist.h | 4 +++ tools/perf/util/evlist.c | 43 +---------------------- tools/perf/util/evlist.h | 4 --- tools/perf/util/evsel.c | 2 +- 5 files changed, 50 insertions(+), 47 deletions(-) diff --git a/tools/perf/lib/evlist.c b/tools/perf/lib/evlist.c index a29ee8a746d9..3a16dd0c044f 100644 --- a/tools/perf/lib/evlist.c +++ b/tools/perf/lib/evlist.c @@ -4,11 +4,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include #include @@ -194,3 +197,44 @@ void perf_evlist__id_add(struct perf_evlist *evlist, perf_evlist__id_hash(evlist, evsel, cpu, thread, id); evsel->id[evsel->ids++] = id; } + +int perf_evlist__id_add_fd(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, int fd) +{ + u64 read_data[4] = { 0, }; + int id_idx = 1; /* The first entry is the counter value */ + u64 id; + int ret; + + ret = ioctl(fd, PERF_EVENT_IOC_ID, &id); + if (!ret) + goto add; + + if (errno != ENOTTY) + return -1; + + /* Legacy way to get event id.. All hail to old kernels! */ + + /* + * This way does not work with group format read, so bail + * out in that case. + */ + if (perf_evlist__read_format(evlist) & PERF_FORMAT_GROUP) + return -1; + + if (!(evsel->attr.read_format & PERF_FORMAT_ID) || + read(fd, &read_data, sizeof(read_data)) == -1) + return -1; + + if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + ++id_idx; + if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + ++id_idx; + + id = read_data[id_idx]; + +add: + perf_evlist__id_add(evlist, evsel, cpu, thread, id); + return 0; +} diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 649406f717bc..7d64185cfabd 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -72,4 +72,8 @@ void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, int cpu, int thread, u64 id); +int perf_evlist__id_add_fd(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, int fd); + #endif /* __LIBPERF_INTERNAL_EVLIST_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index f2863b4c61d7..a37eccf65eae 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -461,47 +461,6 @@ int perf_evlist__poll(struct evlist *evlist, int timeout) return fdarray__poll(&evlist->core.pollfd, timeout); } -int perf_evlist__id_add_fd(struct evlist *evlist, - struct evsel *evsel, - int cpu, int thread, int fd) -{ - u64 read_data[4] = { 0, }; - int id_idx = 1; /* The first entry is the counter value */ - u64 id; - int ret; - - ret = ioctl(fd, PERF_EVENT_IOC_ID, &id); - if (!ret) - goto add; - - if (errno != ENOTTY) - return -1; - - /* Legacy way to get event id.. All hail to old kernels! */ - - /* - * This way does not work with group format read, so bail - * out in that case. - */ - if (perf_evlist__read_format(&evlist->core) & PERF_FORMAT_GROUP) - return -1; - - if (!(evsel->core.attr.read_format & PERF_FORMAT_ID) || - read(fd, &read_data, sizeof(read_data)) == -1) - return -1; - - if (evsel->core.attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - ++id_idx; - if (evsel->core.attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - ++id_idx; - - id = read_data[id_idx]; - - add: - perf_evlist__id_add(&evlist->core, &evsel->core, cpu, thread, id); - return 0; -} - static void perf_evlist__set_sid_idx(struct evlist *evlist, struct evsel *evsel, int idx, int cpu, int thread) @@ -776,7 +735,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, } if (evsel->core.attr.read_format & PERF_FORMAT_ID) { - if (perf_evlist__id_add_fd(evlist, evsel, cpu, thread, + if (perf_evlist__id_add_fd(&evlist->core, &evsel->core, cpu, thread, fd) < 0) return -1; perf_evlist__set_sid_idx(evlist, evsel, idx, cpu, diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index eb35b4b1d86f..80b3361613e5 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -141,10 +141,6 @@ struct evsel * perf_evlist__find_tracepoint_by_name(struct evlist *evlist, const char *name); -int perf_evlist__id_add_fd(struct evlist *evlist, - struct evsel *evsel, - int cpu, int thread, int fd); - int perf_evlist__add_pollfd(struct evlist *evlist, int fd); int perf_evlist__alloc_pollfd(struct evlist *evlist); int perf_evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a4a492f11849..9c284d2adcea 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2662,7 +2662,7 @@ static int store_evsel_ids(struct evsel *evsel, struct evlist *evlist) thread++) { int fd = FD(evsel, cpu, thread); - if (perf_evlist__id_add_fd(evlist, evsel, + if (perf_evlist__id_add_fd(&evlist->core, &evsel->core, cpu, thread, fd) < 0) return -1; } From 20f2be1d48ec293b5a935595bd0c2e2915ffa77c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 6 Aug 2019 15:25:25 +0200 Subject: [PATCH 1164/2880] libperf: Move 'page_size' global variable to libperf We need the 'page_size' variable in libperf, so move it there. Add a libperf_init() as a global libperf init function to obtain this value via sysconf() at tool start. Committer notes: Add internal/lib.h to tools/perf/ files using 'page_size', sometimes replacing util.h with it if that was the only reason for having util.h included. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-33-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 2 +- tools/perf/arch/arm64/util/arm-spe.c | 2 +- tools/perf/arch/s390/util/machine.c | 2 +- tools/perf/arch/x86/tests/intel-cqm.c | 1 + tools/perf/arch/x86/tests/rdpmc.c | 2 +- tools/perf/arch/x86/util/intel-bts.c | 2 +- tools/perf/arch/x86/util/intel-pt.c | 2 +- tools/perf/arch/x86/util/machine.c | 2 +- tools/perf/lib/core.c | 7 +++++++ tools/perf/lib/include/internal/lib.h | 2 ++ tools/perf/lib/include/perf/core.h | 1 + tools/perf/lib/lib.c | 2 ++ tools/perf/lib/libperf.map | 1 + tools/perf/perf.c | 7 ++++--- tools/perf/tests/mmap-thread-lookup.c | 2 +- tools/perf/tests/vmlinux-kallsyms.c | 2 +- tools/perf/util/auxtrace.c | 1 - tools/perf/util/evlist.c | 2 +- tools/perf/util/header.c | 2 +- tools/perf/util/machine.c | 1 + tools/perf/util/mmap.c | 2 +- tools/perf/util/python.c | 2 +- tools/perf/util/session.c | 1 - tools/perf/util/srccode.c | 2 +- tools/perf/util/synthetic-events.c | 2 +- tools/perf/util/trace-event-info.c | 2 +- tools/perf/util/util.c | 3 +-- tools/perf/util/util.h | 2 -- 28 files changed, 36 insertions(+), 25 deletions(-) diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 6654bcfc1224..2e4de211d881 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -25,7 +25,7 @@ #include "../../util/evsel.h" #include "../../util/pmu.h" #include "../../util/cs-etm.h" -#include "../../util/util.h" // page_size +#include // page_size #include "../../util/session.h" #include diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 745f2d50ee82..eba6541ec0f1 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -16,7 +16,7 @@ #include "../../util/evsel.h" #include "../../util/evlist.h" #include "../../util/session.h" -#include "../../util/util.h" // page_size +#include // page_size #include "../../util/pmu.h" #include "../../util/debug.h" #include "../../util/auxtrace.h" diff --git a/tools/perf/arch/s390/util/machine.c b/tools/perf/arch/s390/util/machine.c index df099d6cc3f5..724efb2d842d 100644 --- a/tools/perf/arch/s390/util/machine.c +++ b/tools/perf/arch/s390/util/machine.c @@ -2,7 +2,7 @@ #include #include #include -#include "util.h" // page_size +#include // page_size #include "machine.h" #include "api/fs/fs.h" #include "debug.h" diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c index 0329b9168fae..3ec562a2aaba 100644 --- a/tools/perf/arch/x86/tests/intel-cqm.c +++ b/tools/perf/arch/x86/tests/intel-cqm.c @@ -5,6 +5,7 @@ #include "evlist.h" #include "evsel.h" #include "arch-tests.h" +#include // page_size #include #include diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c index e7640fb047de..1ea916656a2d 100644 --- a/tools/perf/arch/x86/tests/rdpmc.c +++ b/tools/perf/arch/x86/tests/rdpmc.c @@ -13,7 +13,7 @@ #include "tests/tests.h" #include "cloexec.h" #include "event.h" -#include "util.h" // page_size +#include // page_size #include "arch-tests.h" static u64 rdpmc(unsigned int counter) diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index e2c7095327db..f7f68a50a5cd 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -23,7 +23,7 @@ #include "../../util/tsc.h" #include "../../util/auxtrace.h" #include "../../util/intel-bts.h" -#include "../../util/util.h" // page_size +#include // page_size #define KiB(x) ((x) * 1024) #define MiB(x) ((x) * 1024 * 1024) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 84a65524c418..d6d26256915f 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -27,7 +27,7 @@ #include "../../util/record.h" #include "../../util/target.h" #include "../../util/tsc.h" -#include "../../util/util.h" // page_size +#include // page_size #include "../../util/intel-pt.h" #define KiB(x) ((x) * 1024) diff --git a/tools/perf/arch/x86/util/machine.c b/tools/perf/arch/x86/util/machine.c index f0c289862f9f..e17e080e76f4 100644 --- a/tools/perf/arch/x86/util/machine.c +++ b/tools/perf/arch/x86/util/machine.c @@ -4,7 +4,7 @@ #include #include -#include "../../util/util.h" // page_size +#include // page_size #include "../../util/machine.h" #include "../../util/map.h" #include "../../util/symbol.h" diff --git a/tools/perf/lib/core.c b/tools/perf/lib/core.c index 29d5e3348718..6689d593c2d1 100644 --- a/tools/perf/lib/core.c +++ b/tools/perf/lib/core.c @@ -4,7 +4,9 @@ #include #include +#include #include +#include #include "internal.h" static int __base_pr(enum libperf_print_level level, const char *format, @@ -32,3 +34,8 @@ void libperf_print(enum libperf_print_level level, const char *format, ...) __libperf_pr(level, format, args); va_end(args); } + +void libperf_init(void) +{ + page_size = sysconf(_SC_PAGE_SIZE); +} diff --git a/tools/perf/lib/include/internal/lib.h b/tools/perf/lib/include/internal/lib.h index 0b56f1201dc9..9168b7d2a7e1 100644 --- a/tools/perf/lib/include/internal/lib.h +++ b/tools/perf/lib/include/internal/lib.h @@ -4,6 +4,8 @@ #include +extern unsigned int page_size; + ssize_t readn(int fd, void *buf, size_t n); ssize_t writen(int fd, const void *buf, size_t n); diff --git a/tools/perf/lib/include/perf/core.h b/tools/perf/lib/include/perf/core.h index c341a7b2c874..ba2f4e76a3e2 100644 --- a/tools/perf/lib/include/perf/core.h +++ b/tools/perf/lib/include/perf/core.h @@ -18,5 +18,6 @@ typedef int (*libperf_print_fn_t)(enum libperf_print_level level, const char *, va_list ap); LIBPERF_API void libperf_set_print(libperf_print_fn_t fn); +LIBPERF_API void libperf_init(void); #endif /* __LIBPERF_CORE_H */ diff --git a/tools/perf/lib/lib.c b/tools/perf/lib/lib.c index 2a81819c3b8c..18658931fc71 100644 --- a/tools/perf/lib/lib.c +++ b/tools/perf/lib/lib.c @@ -5,6 +5,8 @@ #include #include +unsigned int page_size; + static ssize_t ion(bool is_read, int fd, void *buf, size_t n) { void *buf_start = buf; diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map index cd0d17b996c8..724da66776ef 100644 --- a/tools/perf/lib/libperf.map +++ b/tools/perf/lib/libperf.map @@ -1,5 +1,6 @@ LIBPERF_0.0.1 { global: + libperf_init; libperf_set_print; perf_cpu_map__dummy_new; perf_cpu_map__get; diff --git a/tools/perf/perf.c b/tools/perf/perf.c index bb40a108b8f7..c012ceb64ff9 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -12,6 +12,7 @@ #include "util/build-id.h" #include "util/cache.h" #include "util/env.h" +#include // page_size #include #include "util/config.h" #include @@ -20,11 +21,12 @@ #include "util/bpf-loader.h" #include "util/debug.h" #include "util/event.h" -#include "util/util.h" // page_size, usage() +#include "util/util.h" // usage() #include "ui/ui.h" #include "perf-sys.h" #include #include +#include #include #include #include @@ -438,8 +440,7 @@ int main(int argc, const char **argv) exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT); pager_init(PERF_PAGER_ENVIRONMENT); - /* The page_size is placed in util object. */ - page_size = sysconf(_SC_PAGE_SIZE); + libperf_init(); cmd = extract_argv0_path(argv[0]); if (!cmd) diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c index 33b496d194f4..8d9d4cbff76d 100644 --- a/tools/perf/tests/mmap-thread-lookup.c +++ b/tools/perf/tests/mmap-thread-lookup.c @@ -16,7 +16,7 @@ #include "symbol.h" #include "util/synthetic-events.h" #include "thread.h" -#include "util.h" // page_size +#include // page_size #define THREADS 4 diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 7f28775875c2..aa296ffea6d1 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -7,7 +7,7 @@ #include "dso.h" #include "map.h" #include "symbol.h" -#include "util.h" // page_size +#include // page_size #include "tests.h" #include "debug.h" #include "machine.h" diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 4bd92f5bfb5f..8470dfe9fe97 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -50,7 +50,6 @@ #include "intel-bts.h" #include "arm-spe.h" #include "s390-cpumsf.h" -#include "util.h" // page_size #include "util/mmap.h" #include diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index a37eccf65eae..6069fb52661f 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -17,7 +17,7 @@ #include "evsel.h" #include "debug.h" #include "units.h" -#include "util.h" // page_size +#include // page_size #include "../perf.h" #include "asm/bug.h" #include "bpf-event.h" diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 02602bc8cabf..3b24b4974c5f 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -42,7 +42,7 @@ #include "tool.h" #include "time-utils.h" #include "units.h" -#include "util.h" // page_size, perf_exe() +#include "util/util.h" // perf_exe() #include "cputopo.h" #include "bpf-event.h" diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 0535338f2d7a..70a9f8716a4b 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -32,6 +32,7 @@ #include "linux/hash.h" #include "asm/bug.h" #include "bpf-event.h" +#include // page_size #include #include diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 4cc3b54b2f73..12671d089748 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -20,7 +20,7 @@ #include "event.h" #include "mmap.h" #include "../perf.h" -#include "util.h" /* page_size */ +#include /* page_size */ size_t perf_mmap__mmap_len(struct mmap *map) { diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index fcbafb1e8d9d..6c46d7dbd263 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -14,7 +14,7 @@ #include "thread_map.h" #include "trace-event.h" #include "mmap.h" -#include "util.h" +#include #include "../perf-sys.h" #if PY_MAJOR_VERSION < 3 diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 84a30ff3968a..061bb4d6a3f5 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -29,7 +29,6 @@ #include "thread-stack.h" #include "sample-raw.h" #include "stat.h" -#include "util.h" #include "ui/progress.h" #include "../perf.h" #include "arch/common.h" diff --git a/tools/perf/util/srccode.c b/tools/perf/util/srccode.c index b402f9ca89ab..d84ed8b6caaa 100644 --- a/tools/perf/util/srccode.c +++ b/tools/perf/util/srccode.c @@ -15,7 +15,7 @@ #include #include "srccode.h" #include "debug.h" -#include "util.h" // page_size +#include // page_size #define MAXSRCCACHE (32*1024*1024) #define MAXSRCFILES 64 diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 96ed008c2775..807cbca403a7 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -16,7 +16,6 @@ #include "util/synthetic-events.h" #include "util/target.h" #include "util/time-utils.h" -#include "util/util.h" #include #include #include @@ -26,6 +25,7 @@ #include #include #include +#include // page_size #include #include #include diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index fe07e7550930..086e98ff42a3 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2008,2009, Steven Rostedt */ -#include "util.h" // page_size #include #include #include @@ -19,6 +18,7 @@ #include #include #include +#include // page_size #include "trace-event.h" #include diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 32322a20a68b..cb6fa4c98470 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -3,6 +3,7 @@ #include "debug.h" #include "event.h" #include "namespaces.h" +#include #include #include #include @@ -41,8 +42,6 @@ void perf_set_multithreaded(void) perf_singlethreaded = false; } -unsigned int page_size; - int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack = PERF_MAX_CONTEXTS_PER_STACK; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 45a5c6f20197..d6ae394e67c4 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -33,8 +33,6 @@ int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size); size_t hex_width(u64 v); -extern unsigned int page_size; - int sysctl__max_stack(void); int fetch_kernel_version(unsigned int *puint, From 26049111c333fda4194e895f4cb719766b602256 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 23 Sep 2019 16:22:22 -0300 Subject: [PATCH 1165/2880] perf tools: No need to include internal/lib.h from util/util.h That was done just to have users of writen() and readn(), that before had their prototypes in util/util.h to get it without having to add an include for internal/lib.h, but the right way is to add it and by now all places already do it. Fix a fallout were readlink() was used but unistd.h was being obtained by luck thru util.h -> internal/lib.h, now to check why unistd.h is being included there... Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-lcnytgrtafey3kwlfog2rzzj@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/sdt.c | 1 + tools/perf/util/util.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/sdt.c b/tools/perf/tests/sdt.c index cf1bd57d3023..60f0e9ee04fb 100644 --- a/tools/perf/tests/sdt.c +++ b/tools/perf/tests/sdt.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index d6ae394e67c4..b78b73e5bb32 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -11,7 +11,6 @@ #include #include #include -#include /* General helper functions */ void usage(const char *err) __noreturn; From 7634d5336a6e66cba5befd8190b3994916424e08 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 23 Sep 2019 18:06:52 -0300 Subject: [PATCH 1166/2880] libperf: Use sys/types.h to get ssize_t, not unistd.h The sys/types.h header looks more sensible, from its name we can gather it should be there because of some needed typedef, and it is much smaller than unistd.h, so use it and fix up the fallout in places where it was being used for something else entirely but being obtained by sheer luck, indirectly. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-49bn251httu22ymwgipeavmy@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/include/internal/lib.h | 2 +- tools/perf/util/mmap.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/lib/include/internal/lib.h b/tools/perf/lib/include/internal/lib.h index 9168b7d2a7e1..5175d491b2d4 100644 --- a/tools/perf/lib/include/internal/lib.h +++ b/tools/perf/lib/include/internal/lib.h @@ -2,7 +2,7 @@ #ifndef __LIBPERF_INTERNAL_LIB_H #define __LIBPERF_INTERNAL_LIB_H -#include +#include extern unsigned int page_size; diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 12671d089748..a35dc57d5995 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -12,6 +12,7 @@ #include #include #include +#include // sysconf() #ifdef HAVE_LIBNUMA_SUPPORT #include #endif From fb4bf51fcc153f74c86e08ce1d17e9f1a1a71ba0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Sep 2019 10:35:47 +0200 Subject: [PATCH 1167/2880] libperf: Add libperf dependency for tests targets Add libperf dependency for tests targets. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-36-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/lib/Makefile b/tools/perf/lib/Makefile index 54466cc84544..85ccb8c439a4 100644 --- a/tools/perf/lib/Makefile +++ b/tools/perf/lib/Makefile @@ -138,7 +138,7 @@ clean: $(LIBAPI)-clean *.o *~ *.a *.so *.so.$(VERSION) *.so.$(LIBPERF_VERSION) .*.d .*.cmd LIBPERF-CFLAGS $(LIBPERF_PC) $(Q)$(MAKE) -C tests clean -tests: +tests: libs $(Q)$(MAKE) -C tests $(Q)$(MAKE) -C tests run From 428dab813a56ce9402f359dc35a3ccdfcaea9429 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 31 Aug 2019 20:44:13 +0200 Subject: [PATCH 1168/2880] libperf: Merge libperf_set_print() into libperf_init() The libperf_set_print() function needs to be called in any case so let's merge it with libperf_init(), so we have just one init function. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-34-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/core.c | 8 ++------ tools/perf/lib/include/perf/core.h | 3 +-- tools/perf/lib/libperf.map | 1 - tools/perf/perf.c | 8 +++++++- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/perf/lib/core.c b/tools/perf/lib/core.c index 6689d593c2d1..d0b9ae422b9f 100644 --- a/tools/perf/lib/core.c +++ b/tools/perf/lib/core.c @@ -17,11 +17,6 @@ static int __base_pr(enum libperf_print_level level, const char *format, static libperf_print_fn_t __libperf_pr = __base_pr; -void libperf_set_print(libperf_print_fn_t fn) -{ - __libperf_pr = fn; -} - __printf(2, 3) void libperf_print(enum libperf_print_level level, const char *format, ...) { @@ -35,7 +30,8 @@ void libperf_print(enum libperf_print_level level, const char *format, ...) va_end(args); } -void libperf_init(void) +void libperf_init(libperf_print_fn_t fn) { page_size = sysconf(_SC_PAGE_SIZE); + __libperf_pr = fn; } diff --git a/tools/perf/lib/include/perf/core.h b/tools/perf/lib/include/perf/core.h index ba2f4e76a3e2..cfd70e720c1c 100644 --- a/tools/perf/lib/include/perf/core.h +++ b/tools/perf/lib/include/perf/core.h @@ -17,7 +17,6 @@ enum libperf_print_level { typedef int (*libperf_print_fn_t)(enum libperf_print_level level, const char *, va_list ap); -LIBPERF_API void libperf_set_print(libperf_print_fn_t fn); -LIBPERF_API void libperf_init(void); +LIBPERF_API void libperf_init(libperf_print_fn_t fn); #endif /* __LIBPERF_CORE_H */ diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map index 724da66776ef..5eb0150ccdc6 100644 --- a/tools/perf/lib/libperf.map +++ b/tools/perf/lib/libperf.map @@ -1,7 +1,6 @@ LIBPERF_0.0.1 { global: libperf_init; - libperf_set_print; perf_cpu_map__dummy_new; perf_cpu_map__get; perf_cpu_map__put; diff --git a/tools/perf/perf.c b/tools/perf/perf.c index c012ceb64ff9..27f94b0bb874 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -430,6 +430,12 @@ void pthread__unblock_sigwinch(void) pthread_sigmask(SIG_UNBLOCK, &set, NULL); } +static int libperf_print(enum libperf_print_level level, + const char *fmt, va_list ap) +{ + return eprintf(level, verbose, fmt, ap); +} + int main(int argc, const char **argv) { int err; @@ -440,7 +446,7 @@ int main(int argc, const char **argv) exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT); pager_init(PERF_PAGER_ENVIRONMENT); - libperf_init(); + libperf_init(libperf_print); cmd = extract_argv0_path(argv[0]); if (!cmd) From 379dd98c3d77de920f09f1933592982d200e076a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Sep 2019 10:33:09 +0200 Subject: [PATCH 1169/2880] libperf: Add libperf_init() call to the tests Add libperf_init() call to the automated tests. Committer notes: Added missing stdarg.h and/or stdio.h to places using vfprintf. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-34-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/tests/test-cpumap.c | 10 ++++++++++ tools/perf/lib/tests/test-evlist.c | 10 ++++++++++ tools/perf/lib/tests/test-evsel.c | 10 ++++++++++ tools/perf/lib/tests/test-threadmap.c | 10 ++++++++++ 4 files changed, 40 insertions(+) diff --git a/tools/perf/lib/tests/test-cpumap.c b/tools/perf/lib/tests/test-cpumap.c index 76a43cfb83a1..aa34c20df07e 100644 --- a/tools/perf/lib/tests/test-cpumap.c +++ b/tools/perf/lib/tests/test-cpumap.c @@ -1,13 +1,23 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include +static int libperf_print(enum libperf_print_level level, + const char *fmt, va_list ap) +{ + return vfprintf(stderr, fmt, ap); +} + int main(int argc, char **argv) { struct perf_cpu_map *cpus; __T_START; + libperf_init(libperf_print); + cpus = perf_cpu_map__dummy_new(); if (!cpus) return -1; diff --git a/tools/perf/lib/tests/test-evlist.c b/tools/perf/lib/tests/test-evlist.c index 4e1407f20ffd..e6b2ab2e2bde 100644 --- a/tools/perf/lib/tests/test-evlist.c +++ b/tools/perf/lib/tests/test-evlist.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include #include @@ -6,6 +8,12 @@ #include #include +static int libperf_print(enum libperf_print_level level, + const char *fmt, va_list ap) +{ + return vfprintf(stderr, fmt, ap); +} + static int test_stat_cpu(void) { struct perf_cpu_map *cpus; @@ -177,6 +185,8 @@ int main(int argc, char **argv) { __T_START; + libperf_init(libperf_print); + test_stat_cpu(); test_stat_thread(); test_stat_thread_enable(); diff --git a/tools/perf/lib/tests/test-evsel.c b/tools/perf/lib/tests/test-evsel.c index 2c648fe5617e..1b6c4285ac2b 100644 --- a/tools/perf/lib/tests/test-evsel.c +++ b/tools/perf/lib/tests/test-evsel.c @@ -1,10 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include #include #include #include +static int libperf_print(enum libperf_print_level level, + const char *fmt, va_list ap) +{ + return vfprintf(stderr, fmt, ap); +} + static int test_stat_cpu(void) { struct perf_cpu_map *cpus; @@ -116,6 +124,8 @@ int main(int argc, char **argv) { __T_START; + libperf_init(libperf_print); + test_stat_cpu(); test_stat_thread(); test_stat_thread_enable(); diff --git a/tools/perf/lib/tests/test-threadmap.c b/tools/perf/lib/tests/test-threadmap.c index 10a4f4cbbdd5..8c5f47247d9e 100644 --- a/tools/perf/lib/tests/test-threadmap.c +++ b/tools/perf/lib/tests/test-threadmap.c @@ -1,13 +1,23 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include +static int libperf_print(enum libperf_print_level level, + const char *fmt, va_list ap) +{ + return vfprintf(stderr, fmt, ap); +} + int main(int argc, char **argv) { struct perf_thread_map *threads; __T_START; + libperf_init(libperf_print); + threads = perf_thread_map__new_dummy(); if (!threads) return -1; From 31f67fc462a9e5df3b900924c2bf649c7bc63af8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 6 Aug 2019 13:21:53 +0200 Subject: [PATCH 1170/2880] libperf: Add perf_evlist__alloc_pollfd() function Move perf_evlist__alloc_pollfd() from tools/perf to libperf, it will be used in the following patches. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-37-jolsa@kernel.org [ Added api/fd/array.h include to the lib/evlist.c file ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/lib/evlist.c | 22 ++++++++++++++++++++++ tools/perf/lib/include/internal/evlist.h | 2 ++ tools/perf/util/evlist.c | 23 +---------------------- tools/perf/util/evlist.h | 1 - 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/tools/perf/lib/evlist.c b/tools/perf/lib/evlist.c index 3a16dd0c044f..3c3b97f40be0 100644 --- a/tools/perf/lib/evlist.c +++ b/tools/perf/lib/evlist.c @@ -14,6 +14,7 @@ #include #include #include +#include void perf_evlist__init(struct perf_evlist *evlist) { @@ -238,3 +239,24 @@ add: perf_evlist__id_add(evlist, evsel, cpu, thread, id); return 0; } + +int perf_evlist__alloc_pollfd(struct perf_evlist *evlist) +{ + int nr_cpus = perf_cpu_map__nr(evlist->cpus); + int nr_threads = perf_thread_map__nr(evlist->threads); + int nfds = 0; + struct perf_evsel *evsel; + + perf_evlist__for_each_entry(evlist, evsel) { + if (evsel->system_wide) + nfds += nr_cpus; + else + nfds += nr_cpus * nr_threads; + } + + if (fdarray__available_entries(&evlist->pollfd) < nfds && + fdarray__grow(&evlist->pollfd, nfds) < 0) + return -ENOMEM; + + return 0; +} diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 7d64185cfabd..88c0dfaf0ddc 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -24,6 +24,8 @@ struct perf_evlist { struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; }; +int perf_evlist__alloc_pollfd(struct perf_evlist *evlist); + /** * __perf_evlist__for_each_entry - iterate thru all the evsels * @list: list_head instance to iterate diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6069fb52661f..c47f23e7f3c8 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -398,27 +398,6 @@ int perf_evlist__enable_event_idx(struct evlist *evlist, return perf_evlist__enable_event_thread(evlist, evsel, idx); } -int perf_evlist__alloc_pollfd(struct evlist *evlist) -{ - int nr_cpus = perf_cpu_map__nr(evlist->core.cpus); - int nr_threads = perf_thread_map__nr(evlist->core.threads); - int nfds = 0; - struct evsel *evsel; - - evlist__for_each_entry(evlist, evsel) { - if (evsel->core.system_wide) - nfds += nr_cpus; - else - nfds += nr_cpus * nr_threads; - } - - if (fdarray__available_entries(&evlist->core.pollfd) < nfds && - fdarray__grow(&evlist->core.pollfd, nfds) < 0) - return -ENOMEM; - - return 0; -} - static int __perf_evlist__add_pollfd(struct evlist *evlist, int fd, struct mmap *map, short revent) { @@ -944,7 +923,7 @@ int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, if (!evlist->mmap) return -ENOMEM; - if (evlist->core.pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0) + if (evlist->core.pollfd.entries == NULL && perf_evlist__alloc_pollfd(&evlist->core) < 0) return -ENOMEM; evlist->core.mmap_len = evlist__mmap_size(pages); diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 80b3361613e5..bebbaa9b6325 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -142,7 +142,6 @@ perf_evlist__find_tracepoint_by_name(struct evlist *evlist, const char *name); int perf_evlist__add_pollfd(struct evlist *evlist, int fd); -int perf_evlist__alloc_pollfd(struct evlist *evlist); int perf_evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask); int perf_evlist__poll(struct evlist *evlist, int timeout); From f4009e7bf7ba3375cb00e33ca901d61a5acd6c2b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 16 Aug 2019 16:00:45 +0200 Subject: [PATCH 1171/2880] libperf: Add perf_evlist__add_pollfd() function Move perf_evlist__add_pollfd() from tools/perf to libperf, it will be used in the following patches. Also rename perf's perf_evlist__add_pollfd()/perf_evlist__filter_pollfd() to evlist__add_pollfd()/evlist__filter_pollfd(). Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-38-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kvm.c | 4 ++-- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/lib/evlist.c | 16 +++++++++++++++ tools/perf/lib/include/internal/evlist.h | 2 ++ tools/perf/util/evlist.c | 25 ++++-------------------- tools/perf/util/evlist.h | 4 ++-- 7 files changed, 28 insertions(+), 27 deletions(-) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 710a2898ed6c..2227e2f42c09 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -967,10 +967,10 @@ static int kvm_events_live_report(struct perf_kvm_stat *kvm) goto out; } - if (perf_evlist__add_pollfd(kvm->evlist, kvm->timerfd) < 0) + if (evlist__add_pollfd(kvm->evlist, kvm->timerfd) < 0) goto out; - nr_stdin = perf_evlist__add_pollfd(kvm->evlist, fileno(stdin)); + nr_stdin = evlist__add_pollfd(kvm->evlist, fileno(stdin)); if (nr_stdin < 0) goto out; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 48600c90cc7e..d6daaad348b5 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1624,7 +1624,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) err = 0; waking++; - if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0) + if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0) draining = true; } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 97667287f573..8d9784b5d028 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3475,7 +3475,7 @@ again: int timeout = done ? 100 : -1; if (!draining && perf_evlist__poll(evlist, timeout) > 0) { - if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0) + if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0) draining = true; goto again; diff --git a/tools/perf/lib/evlist.c b/tools/perf/lib/evlist.c index 3c3b97f40be0..e4d8b3b7b8fc 100644 --- a/tools/perf/lib/evlist.c +++ b/tools/perf/lib/evlist.c @@ -12,6 +12,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -260,3 +263,16 @@ int perf_evlist__alloc_pollfd(struct perf_evlist *evlist) return 0; } + +int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd, + void *ptr, short revent) +{ + int pos = fdarray__add(&evlist->pollfd, fd, revent | POLLERR | POLLHUP); + + if (pos >= 0) { + evlist->pollfd.priv[pos].ptr = ptr; + fcntl(fd, F_SETFL, O_NONBLOCK); + } + + return pos; +} diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 88c0dfaf0ddc..9f440ab12b76 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -25,6 +25,8 @@ struct perf_evlist { }; int perf_evlist__alloc_pollfd(struct perf_evlist *evlist); +int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd, + void *ptr, short revent); /** * __perf_evlist__for_each_entry - iterate thru all the evsels diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index c47f23e7f3c8..051be9a31db9 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -398,26 +398,9 @@ int perf_evlist__enable_event_idx(struct evlist *evlist, return perf_evlist__enable_event_thread(evlist, evsel, idx); } -static int __perf_evlist__add_pollfd(struct evlist *evlist, int fd, - struct mmap *map, short revent) +int evlist__add_pollfd(struct evlist *evlist, int fd) { - int pos = fdarray__add(&evlist->core.pollfd, fd, revent | POLLERR | POLLHUP); - /* - * Save the idx so that when we filter out fds POLLHUP'ed we can - * close the associated evlist->mmap[] entry. - */ - if (pos >= 0) { - evlist->core.pollfd.priv[pos].ptr = map; - - fcntl(fd, F_SETFL, O_NONBLOCK); - } - - return pos; -} - -int perf_evlist__add_pollfd(struct evlist *evlist, int fd) -{ - return __perf_evlist__add_pollfd(evlist, fd, NULL, POLLIN); + return perf_evlist__add_pollfd(&evlist->core, fd, NULL, POLLIN); } static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd, @@ -429,7 +412,7 @@ static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd, perf_mmap__put(map); } -int perf_evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask) +int evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask) { return fdarray__filter(&evlist->core.pollfd, revents_and_mask, perf_evlist__munmap_filtered, NULL); @@ -708,7 +691,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, * Therefore don't add it for polling. */ if (!evsel->core.system_wide && - __perf_evlist__add_pollfd(evlist, fd, &maps[idx], revent) < 0) { + perf_evlist__add_pollfd(&evlist->core, fd, &maps[idx], revent) < 0) { perf_mmap__put(&maps[idx]); return -1; } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index bebbaa9b6325..2eac8aab24a3 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -141,8 +141,8 @@ struct evsel * perf_evlist__find_tracepoint_by_name(struct evlist *evlist, const char *name); -int perf_evlist__add_pollfd(struct evlist *evlist, int fd); -int perf_evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask); +int evlist__add_pollfd(struct evlist *evlist, int fd); +int evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask); int perf_evlist__poll(struct evlist *evlist, int timeout); From 80ab2987a016f774201d4f3509118047f9d58175 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 31 Aug 2019 22:48:33 +0200 Subject: [PATCH 1172/2880] libperf: Add perf_evlist__poll() function Move perf_evlist__poll() from tools/perf to libperf, it will be used in the following patches. And rename the existing perf's function to evlist__poll(). Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190913132355.21634-39-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-top.c | 4 ++-- tools/perf/builtin-trace.c | 2 +- tools/perf/lib/evlist.c | 5 +++++ tools/perf/lib/include/perf/evlist.h | 1 + tools/perf/lib/libperf.map | 1 + tools/perf/tests/openat-syscall-tp-fields.c | 2 +- tools/perf/tests/perf-record.c | 2 +- tools/perf/tests/task-exit.c | 2 +- tools/perf/util/evlist.c | 6 +++--- tools/perf/util/evlist.h | 2 +- tools/perf/util/python.c | 2 +- 12 files changed, 19 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d6daaad348b5..23332861de6e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1615,7 +1615,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) if (hits == rec->samples) { if (done || draining) break; - err = perf_evlist__poll(rec->evlist, -1); + err = evlist__poll(rec->evlist, -1); /* * Propagate error, only if there's any. Ignore positive * number of returned events and interrupt error. diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 73bf79053ae3..30d8eb614377 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1307,7 +1307,7 @@ static int __cmd_top(struct perf_top *top) } /* Wait for a minimal set of events before starting the snapshot */ - perf_evlist__poll(top->evlist, 100); + evlist__poll(top->evlist, 100); perf_top__mmap_read(top); @@ -1317,7 +1317,7 @@ static int __cmd_top(struct perf_top *top) perf_top__mmap_read(top); if (opts->overwrite || (hits == top->samples)) - ret = perf_evlist__poll(top->evlist, 100); + ret = evlist__poll(top->evlist, 100); if (resize) { perf_top__resize(top); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8d9784b5d028..c52c3120a811 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3474,7 +3474,7 @@ again: if (trace->nr_events == before) { int timeout = done ? 100 : -1; - if (!draining && perf_evlist__poll(evlist, timeout) > 0) { + if (!draining && evlist__poll(evlist, timeout) > 0) { if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0) draining = true; diff --git a/tools/perf/lib/evlist.c b/tools/perf/lib/evlist.c index e4d8b3b7b8fc..d1496fee810c 100644 --- a/tools/perf/lib/evlist.c +++ b/tools/perf/lib/evlist.c @@ -276,3 +276,8 @@ int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd, return pos; } + +int perf_evlist__poll(struct perf_evlist *evlist, int timeout) +{ + return fdarray__poll(&evlist->pollfd, timeout); +} diff --git a/tools/perf/lib/include/perf/evlist.h b/tools/perf/lib/include/perf/evlist.h index 38365f8f3fba..8a2ce0757ab2 100644 --- a/tools/perf/lib/include/perf/evlist.h +++ b/tools/perf/lib/include/perf/evlist.h @@ -31,5 +31,6 @@ LIBPERF_API void perf_evlist__disable(struct perf_evlist *evlist); LIBPERF_API void perf_evlist__set_maps(struct perf_evlist *evlist, struct perf_cpu_map *cpus, struct perf_thread_map *threads); +LIBPERF_API int perf_evlist__poll(struct perf_evlist *evlist, int timeout); #endif /* __LIBPERF_EVLIST_H */ diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map index 5eb0150ccdc6..ab8dbde1136c 100644 --- a/tools/perf/lib/libperf.map +++ b/tools/perf/lib/libperf.map @@ -39,6 +39,7 @@ LIBPERF_0.0.1 { perf_evlist__remove; perf_evlist__next; perf_evlist__set_maps; + perf_evlist__poll; local: *; }; diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index 4629fa33c8ad..2b5c46813053 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -127,7 +127,7 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest } if (nr_events == before) - perf_evlist__poll(evlist, 10); + evlist__poll(evlist, 10); if (++nr_polls > 5) { pr_debug("%s: no events!\n", __func__); diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 401e8d11427b..437426be29e9 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -287,7 +287,7 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus * perf_event_attr.wakeup_events, just PERF_EVENT_SAMPLE does. */ if (total_events == before && false) - perf_evlist__poll(evlist, -1); + evlist__poll(evlist, -1); sleep(1); if (++wakeups > 5) { diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index 24565f83e07d..bce3a4cb4c89 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -130,7 +130,7 @@ retry: out_init: if (!exited || !nr_exit) { - perf_evlist__poll(evlist, -1); + evlist__poll(evlist, -1); goto retry; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 051be9a31db9..d4c7fd125ce9 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -418,9 +418,9 @@ int evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask) perf_evlist__munmap_filtered, NULL); } -int perf_evlist__poll(struct evlist *evlist, int timeout) +int evlist__poll(struct evlist *evlist, int timeout) { - return fdarray__poll(&evlist->core.pollfd, timeout); + return perf_evlist__poll(&evlist->core, timeout); } static void perf_evlist__set_sid_idx(struct evlist *evlist, @@ -1736,7 +1736,7 @@ static void *perf_evlist__poll_thread(void *arg) draining = true; if (!draining) - perf_evlist__poll(evlist, 1000); + evlist__poll(evlist, 1000); for (i = 0; i < evlist->core.nr_mmaps; i++) { struct mmap *map = &evlist->mmap[i]; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 2eac8aab24a3..130d44d691b8 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -144,7 +144,7 @@ perf_evlist__find_tracepoint_by_name(struct evlist *evlist, int evlist__add_pollfd(struct evlist *evlist, int fd); int evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask); -int perf_evlist__poll(struct evlist *evlist, int timeout); +int evlist__poll(struct evlist *evlist, int timeout); struct evsel *perf_evlist__id2evsel(struct evlist *evlist, u64 id); struct evsel *perf_evlist__id2evsel_strict(struct evlist *evlist, diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 6c46d7dbd263..53f31053a27a 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -918,7 +918,7 @@ static PyObject *pyrf_evlist__poll(struct pyrf_evlist *pevlist, if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i", kwlist, &timeout)) return NULL; - n = perf_evlist__poll(evlist, timeout); + n = evlist__poll(evlist, timeout); if (n < 0) { PyErr_SetFromErrno(PyExc_OSError); return NULL; From 32ff3fec07b6d8e6c5cc2342f6cbbdcb224d484c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 24 Sep 2019 15:14:12 -0300 Subject: [PATCH 1173/2880] perf copyfile: Move copyfile routines to separate files Further reducing the util.c hodgepodge files. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-0i62zh7ok25znibyebgq0qs4@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/build-id.c | 3 +- tools/perf/util/copyfile.c | 144 +++++++++++++++++++++++++++++++++++ tools/perf/util/copyfile.h | 16 ++++ tools/perf/util/symbol-elf.c | 2 +- tools/perf/util/util.c | 135 -------------------------------- tools/perf/util/util.h | 5 -- 7 files changed, 164 insertions(+), 142 deletions(-) create mode 100644 tools/perf/util/copyfile.c create mode 100644 tools/perf/util/copyfile.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index fd89d6a8cd65..4d1894e38a81 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -3,6 +3,7 @@ perf-y += block-range.o perf-y += build-id.o perf-y += cacheline.o perf-y += config.o +perf-y += copyfile.o perf-y += ctype.o perf-y += db-export.o perf-y += env.o diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 7928c398a063..c076fc7fe025 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -7,12 +7,13 @@ * Copyright (C) 2009, 2010 Red Hat Inc. * Copyright (C) 2009, 2010 Arnaldo Carvalho de Melo */ -#include "util.h" // copyfile_ns(), lsdir(), mkdir_p(), rm_rf() +#include "util.h" // lsdir(), mkdir_p(), rm_rf() #include #include #include #include #include +#include "util/copyfile.h" #include "dso.h" #include "build-id.h" #include "event.h" diff --git a/tools/perf/util/copyfile.c b/tools/perf/util/copyfile.c new file mode 100644 index 000000000000..3fa0db136667 --- /dev/null +++ b/tools/perf/util/copyfile.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "util/copyfile.h" +#include "util/namespaces.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int slow_copyfile(const char *from, const char *to, struct nsinfo *nsi) +{ + int err = -1; + char *line = NULL; + size_t n; + FILE *from_fp, *to_fp; + struct nscookie nsc; + + nsinfo__mountns_enter(nsi, &nsc); + from_fp = fopen(from, "r"); + nsinfo__mountns_exit(&nsc); + if (from_fp == NULL) + goto out; + + to_fp = fopen(to, "w"); + if (to_fp == NULL) + goto out_fclose_from; + + while (getline(&line, &n, from_fp) > 0) + if (fputs(line, to_fp) == EOF) + goto out_fclose_to; + err = 0; +out_fclose_to: + fclose(to_fp); + free(line); +out_fclose_from: + fclose(from_fp); +out: + return err; +} + +int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size) +{ + void *ptr; + loff_t pgoff; + + pgoff = off_in & ~(page_size - 1); + off_in -= pgoff; + + ptr = mmap(NULL, off_in + size, PROT_READ, MAP_PRIVATE, ifd, pgoff); + if (ptr == MAP_FAILED) + return -1; + + while (size) { + ssize_t ret = pwrite(ofd, ptr + off_in, size, off_out); + if (ret < 0 && errno == EINTR) + continue; + if (ret <= 0) + break; + + size -= ret; + off_in += ret; + off_out += ret; + } + munmap(ptr, off_in + size); + + return size ? -1 : 0; +} + +static int copyfile_mode_ns(const char *from, const char *to, mode_t mode, + struct nsinfo *nsi) +{ + int fromfd, tofd; + struct stat st; + int err; + char *tmp = NULL, *ptr = NULL; + struct nscookie nsc; + + nsinfo__mountns_enter(nsi, &nsc); + err = stat(from, &st); + nsinfo__mountns_exit(&nsc); + if (err) + goto out; + err = -1; + + /* extra 'x' at the end is to reserve space for '.' */ + if (asprintf(&tmp, "%s.XXXXXXx", to) < 0) { + tmp = NULL; + goto out; + } + ptr = strrchr(tmp, '/'); + if (!ptr) + goto out; + ptr = memmove(ptr + 1, ptr, strlen(ptr) - 1); + *ptr = '.'; + + tofd = mkstemp(tmp); + if (tofd < 0) + goto out; + + if (fchmod(tofd, mode)) + goto out_close_to; + + if (st.st_size == 0) { /* /proc? do it slowly... */ + err = slow_copyfile(from, tmp, nsi); + goto out_close_to; + } + + nsinfo__mountns_enter(nsi, &nsc); + fromfd = open(from, O_RDONLY); + nsinfo__mountns_exit(&nsc); + if (fromfd < 0) + goto out_close_to; + + err = copyfile_offset(fromfd, 0, tofd, 0, st.st_size); + + close(fromfd); +out_close_to: + close(tofd); + if (!err) + err = link(tmp, to); + unlink(tmp); +out: + free(tmp); + return err; +} + +int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi) +{ + return copyfile_mode_ns(from, to, 0755, nsi); +} + +int copyfile_mode(const char *from, const char *to, mode_t mode) +{ + return copyfile_mode_ns(from, to, mode, NULL); +} + +int copyfile(const char *from, const char *to) +{ + return copyfile_mode(from, to, 0755); +} diff --git a/tools/perf/util/copyfile.h b/tools/perf/util/copyfile.h new file mode 100644 index 000000000000..e85d2f22f3cc --- /dev/null +++ b/tools/perf/util/copyfile.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef PERF_COPYFILE_H_ +#define PERF_COPYFILE_H_ + +#include +#include +#include + +struct nsinfo; + +int copyfile(const char *from, const char *to); +int copyfile_mode(const char *from, const char *to, mode_t mode); +int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi); +int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size); + +#endif // PERF_COPYFILE_H_ diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 6fbfdf8bf61f..66f4be1df573 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -17,7 +17,7 @@ #include "machine.h" #include "vdso.h" #include "debug.h" -#include "util.h" +#include "util/copyfile.h" #include #include #include diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index cb6fa4c98470..5eda6e19c947 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -2,10 +2,7 @@ #include "util.h" #include "debug.h" #include "event.h" -#include "namespaces.h" -#include #include -#include #include #include #include @@ -233,138 +230,6 @@ out: return list; } -static int slow_copyfile(const char *from, const char *to, struct nsinfo *nsi) -{ - int err = -1; - char *line = NULL; - size_t n; - FILE *from_fp, *to_fp; - struct nscookie nsc; - - nsinfo__mountns_enter(nsi, &nsc); - from_fp = fopen(from, "r"); - nsinfo__mountns_exit(&nsc); - if (from_fp == NULL) - goto out; - - to_fp = fopen(to, "w"); - if (to_fp == NULL) - goto out_fclose_from; - - while (getline(&line, &n, from_fp) > 0) - if (fputs(line, to_fp) == EOF) - goto out_fclose_to; - err = 0; -out_fclose_to: - fclose(to_fp); - free(line); -out_fclose_from: - fclose(from_fp); -out: - return err; -} - -int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size) -{ - void *ptr; - loff_t pgoff; - - pgoff = off_in & ~(page_size - 1); - off_in -= pgoff; - - ptr = mmap(NULL, off_in + size, PROT_READ, MAP_PRIVATE, ifd, pgoff); - if (ptr == MAP_FAILED) - return -1; - - while (size) { - ssize_t ret = pwrite(ofd, ptr + off_in, size, off_out); - if (ret < 0 && errno == EINTR) - continue; - if (ret <= 0) - break; - - size -= ret; - off_in += ret; - off_out += ret; - } - munmap(ptr, off_in + size); - - return size ? -1 : 0; -} - -static int copyfile_mode_ns(const char *from, const char *to, mode_t mode, - struct nsinfo *nsi) -{ - int fromfd, tofd; - struct stat st; - int err; - char *tmp = NULL, *ptr = NULL; - struct nscookie nsc; - - nsinfo__mountns_enter(nsi, &nsc); - err = stat(from, &st); - nsinfo__mountns_exit(&nsc); - if (err) - goto out; - err = -1; - - /* extra 'x' at the end is to reserve space for '.' */ - if (asprintf(&tmp, "%s.XXXXXXx", to) < 0) { - tmp = NULL; - goto out; - } - ptr = strrchr(tmp, '/'); - if (!ptr) - goto out; - ptr = memmove(ptr + 1, ptr, strlen(ptr) - 1); - *ptr = '.'; - - tofd = mkstemp(tmp); - if (tofd < 0) - goto out; - - if (fchmod(tofd, mode)) - goto out_close_to; - - if (st.st_size == 0) { /* /proc? do it slowly... */ - err = slow_copyfile(from, tmp, nsi); - goto out_close_to; - } - - nsinfo__mountns_enter(nsi, &nsc); - fromfd = open(from, O_RDONLY); - nsinfo__mountns_exit(&nsc); - if (fromfd < 0) - goto out_close_to; - - err = copyfile_offset(fromfd, 0, tofd, 0, st.st_size); - - close(fromfd); -out_close_to: - close(tofd); - if (!err) - err = link(tmp, to); - unlink(tmp); -out: - free(tmp); - return err; -} - -int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi) -{ - return copyfile_mode_ns(from, to, 0755, nsi); -} - -int copyfile_mode(const char *from, const char *to, mode_t mode) -{ - return copyfile_mode_ns(from, to, mode, NULL); -} - -int copyfile(const char *from, const char *to) -{ - return copyfile_mode(from, to, 0755); -} - size_t hex_width(u64 v) { size_t n = 1; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index b78b73e5bb32..9969b8b46f7c 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -17,7 +17,6 @@ void usage(const char *err) __noreturn; void die(const char *err, ...) __noreturn __printf(1, 2); struct dirent; -struct nsinfo; struct strlist; int mkdir_p(char *path, mode_t mode); @@ -25,10 +24,6 @@ int rm_rf(const char *path); int rm_rf_perf_data(const char *path); struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dirent *)); bool lsdir_no_dot_filter(const char *name, struct dirent *d); -int copyfile(const char *from, const char *to); -int copyfile_mode(const char *from, const char *to, mode_t mode); -int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi); -int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size); size_t hex_width(u64 v); From 90a48843a18663a25aa70c6d717d38a4443e5e29 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 25 Sep 2019 15:12:42 +0200 Subject: [PATCH 1174/2880] KVM: selftests: fix ucall on x86 After commit e8bb4755eea2("KVM: selftests: Split ucall.c into architecture specific files") selftests which use ucall on x86 started segfaulting and apparently it's gcc to blame: it "optimizes" ucall() function throwing away va_start/va_end part because it thinks the structure is not being used. Previously, it couldn't do that because the there was also MMIO version and the decision which particular implementation to use was done at runtime. With older gccs it's possible to solve the problem by adding 'volatile' to 'struct ucall' but at least with gcc-8.3 this trick doesn't work. 'memory' clobber seems to do the job. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/ucall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index 4bfc9a90b1de..da4d89ad5419 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -32,7 +32,7 @@ void ucall(uint64_t cmd, int nargs, ...) va_end(va); asm volatile("in %[port], %%al" - : : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax"); + : : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory"); } uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) From 4b526de50e39b38cd828396267379183c7c21354 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2019 13:41:06 -0700 Subject: [PATCH 1175/2880] KVM: x86: Check kvm_rebooting in kvm_spurious_fault() Explicitly check kvm_rebooting in kvm_spurious_fault() prior to invoking BUG(), as opposed to assuming the caller has already done so. Letting kvm_spurious_fault() be called "directly" will allow VMX to better optimize its low level assembly flows. As a happy side effect, kvm_spurious_fault() no longer needs to be marked as a dead end since it doesn't unconditionally BUG(). Acked-by: Paolo Bonzini Cc: Josh Poimboeuf Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 3 ++- tools/objtool/check.c | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c5ed92482e2c..de0c9d8097c2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1534,7 +1534,7 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -asmlinkage void __noreturn kvm_spurious_fault(void); +asmlinkage void kvm_spurious_fault(void); /* * Hardware virtualization extension instructions may fault if a diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9a3d8efe1c2..0ed07d8d2caa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -360,7 +360,8 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base); asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ - BUG(); + if (!kvm_rebooting) + BUG(); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 176f2f084060..044c9a3cb247 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -138,7 +138,6 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "do_task_dead", "__module_put_and_exit", "complete_and_exit", - "kvm_spurious_fault", "__reiserfs_panic", "lbug_with_loc", "fortify_panic", From 1fac4a54374f7ef385938f3c6cf7649c0fe4f6cd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 23 Sep 2019 14:56:14 +0800 Subject: [PATCH 1176/2880] btrfs: relocation: fix use-after-free on dead relocation roots [BUG] One user reported a reproducible KASAN report about use-after-free: BTRFS info (device sdi1): balance: start -dvrange=1256811659264..1256811659265 BTRFS info (device sdi1): relocating block group 1256811659264 flags data|raid0 ================================================================== BUG: KASAN: use-after-free in btrfs_init_reloc_root+0x2cd/0x340 [btrfs] Write of size 8 at addr ffff88856f671710 by task kworker/u24:10/261579 CPU: 2 PID: 261579 Comm: kworker/u24:10 Tainted: P OE 5.2.11-arch1-1-kasan #4 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X99 Extreme4, BIOS P3.80 04/06/2018 Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs] Call Trace: dump_stack+0x7b/0xba print_address_description+0x6c/0x22e ? btrfs_init_reloc_root+0x2cd/0x340 [btrfs] __kasan_report.cold+0x1b/0x3b ? btrfs_init_reloc_root+0x2cd/0x340 [btrfs] kasan_report+0x12/0x17 __asan_report_store8_noabort+0x17/0x20 btrfs_init_reloc_root+0x2cd/0x340 [btrfs] record_root_in_trans+0x2a0/0x370 [btrfs] btrfs_record_root_in_trans+0xf4/0x140 [btrfs] start_transaction+0x1ab/0xe90 [btrfs] btrfs_join_transaction+0x1d/0x20 [btrfs] btrfs_finish_ordered_io+0x7bf/0x18a0 [btrfs] ? lock_repin_lock+0x400/0x400 ? __kmem_cache_shutdown.cold+0x140/0x1ad ? btrfs_unlink_subvol+0x9b0/0x9b0 [btrfs] finish_ordered_fn+0x15/0x20 [btrfs] normal_work_helper+0x1bd/0xca0 [btrfs] ? process_one_work+0x819/0x1720 ? kasan_check_read+0x11/0x20 btrfs_endio_write_helper+0x12/0x20 [btrfs] process_one_work+0x8c9/0x1720 ? pwq_dec_nr_in_flight+0x2f0/0x2f0 ? worker_thread+0x1d9/0x1030 worker_thread+0x98/0x1030 kthread+0x2bb/0x3b0 ? process_one_work+0x1720/0x1720 ? kthread_park+0x120/0x120 ret_from_fork+0x35/0x40 Allocated by task 369692: __kasan_kmalloc.part.0+0x44/0xc0 __kasan_kmalloc.constprop.0+0xba/0xc0 kasan_kmalloc+0x9/0x10 kmem_cache_alloc_trace+0x138/0x260 btrfs_read_tree_root+0x92/0x360 [btrfs] btrfs_read_fs_root+0x10/0xb0 [btrfs] create_reloc_root+0x47d/0xa10 [btrfs] btrfs_init_reloc_root+0x1e2/0x340 [btrfs] record_root_in_trans+0x2a0/0x370 [btrfs] btrfs_record_root_in_trans+0xf4/0x140 [btrfs] start_transaction+0x1ab/0xe90 [btrfs] btrfs_start_transaction+0x1e/0x20 [btrfs] __btrfs_prealloc_file_range+0x1c2/0xa00 [btrfs] btrfs_prealloc_file_range+0x13/0x20 [btrfs] prealloc_file_extent_cluster+0x29f/0x570 [btrfs] relocate_file_extent_cluster+0x193/0xc30 [btrfs] relocate_data_extent+0x1f8/0x490 [btrfs] relocate_block_group+0x600/0x1060 [btrfs] btrfs_relocate_block_group+0x3a0/0xa00 [btrfs] btrfs_relocate_chunk+0x9e/0x180 [btrfs] btrfs_balance+0x14e4/0x2fc0 [btrfs] btrfs_ioctl_balance+0x47f/0x640 [btrfs] btrfs_ioctl+0x119d/0x8380 [btrfs] do_vfs_ioctl+0x9f5/0x1060 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x73/0xb0 do_syscall_64+0xa5/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 369692: __kasan_slab_free+0x14f/0x210 kasan_slab_free+0xe/0x10 kfree+0xd8/0x270 btrfs_drop_snapshot+0x154c/0x1eb0 [btrfs] clean_dirty_subvols+0x227/0x340 [btrfs] relocate_block_group+0x972/0x1060 [btrfs] btrfs_relocate_block_group+0x3a0/0xa00 [btrfs] btrfs_relocate_chunk+0x9e/0x180 [btrfs] btrfs_balance+0x14e4/0x2fc0 [btrfs] btrfs_ioctl_balance+0x47f/0x640 [btrfs] btrfs_ioctl+0x119d/0x8380 [btrfs] do_vfs_ioctl+0x9f5/0x1060 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x73/0xb0 do_syscall_64+0xa5/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff88856f671100 which belongs to the cache kmalloc-4k of size 4096 The buggy address is located 1552 bytes inside of 4096-byte region [ffff88856f671100, ffff88856f672100) The buggy address belongs to the page: page:ffffea0015bd9c00 refcount:1 mapcount:0 mapping:ffff88864400e600 index:0x0 compound_mapcount: 0 flags: 0x2ffff0000010200(slab|head) raw: 02ffff0000010200 dead000000000100 dead000000000200 ffff88864400e600 raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88856f671600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88856f671680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff88856f671700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88856f671780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88856f671800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== BTRFS info (device sdi1): 1 enospc errors during balance BTRFS info (device sdi1): balance: ended with status: -28 [CAUSE] The problem happens when finish_ordered_io() get called with balance still running, while the reloc root of that subvolume is already dead. (Tree is swap already done, but tree not yet deleted for possible qgroup usage.) That means root->reloc_root still exists, but that reloc_root can be under btrfs_drop_snapshot(), thus we shouldn't access it. The following race could cause the use-after-free problem: CPU1 | CPU2 -------------------------------------------------------------------------- | relocate_block_group() | |- unset_reloc_control(rc) | |- btrfs_commit_transaction() btrfs_finish_ordered_io() | |- clean_dirty_subvols() |- btrfs_join_transaction() | | |- record_root_in_trans() | | |- btrfs_init_reloc_root() | | |- if (root->reloc_root) | | | | |- root->reloc_root = NULL | | |- btrfs_drop_snapshot(reloc_root); |- reloc_root->last_trans| = trans->transid | ^^^^^^^^^^^^^^^^^^^^^^ Use after free [FIX] Fix it by the following modifications: - Test if the root has dead reloc tree before accessing root->reloc_root If the root has BTRFS_ROOT_DEAD_RELOC_TREE, then we don't need to create or update root->reloc_tree - Clear the BTRFS_ROOT_DEAD_RELOC_TREE flag until we have fully dropped reloc tree To co-operate with above modification, so as long as BTRFS_ROOT_DEAD_RELOC_TREE is still set, we won't try to re-create reloc tree at record_root_in_trans(). Reported-by: Cebtenzzre Fixes: d2311e698578 ("btrfs: relocation: Delay reloc tree deletion after merge_reloc_roots") CC: stable@vger.kernel.org # 5.1+ Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2f0e25afa486..00504657b602 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1435,6 +1435,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + /* + * The subvolume has reloc tree but the swap is finished, no need to + * create/update the dead reloc tree + */ + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return 0; + if (root->reloc_root) { reloc_root = root->reloc_root; reloc_root->last_trans = trans->transid; @@ -2187,7 +2194,6 @@ static int clean_dirty_subvols(struct reloc_control *rc) /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; - clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; if (reloc_root) { @@ -2196,6 +2202,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) if (ret2 < 0 && !ret) ret = ret2; } + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_fs_root(root); } else { /* Orphan reloc tree, just clean it up */ From 52a9fcbc73a30375cad8d9e7b519e1ad97b0880a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2019 13:41:07 -0700 Subject: [PATCH 1177/2880] KVM: VMX: Optimize VMX instruction error and fault handling Rework the VMX instruction helpers using asm-goto to branch directly to error/fault "handlers" in lieu of using __ex(), i.e. the generic ____kvm_handle_fault_on_reboot(). Branching directly to fault handling code during fixup avoids the extra JMP that is inserted after every VMX instruction when using the generic "fault on reboot" (see commit 3901336ed9887, "x86/kvm: Don't call kvm_spurious_fault() from .fixup"). Opportunistically clean up the helpers so that they all have consistent error handling and messages. Leave the usage of ____kvm_handle_fault_on_reboot() (via __ex()) in kvm_cpu_vmxoff() and nested_vmx_check_vmentry_hw() as is. The VMXOFF case is not a fast path, i.e. the cleanliness of __ex() is worth the JMP, and the extra JMP in nested_vmx_check_vmentry_hw() is unavoidable. Note, VMREAD cannot get the asm-goto treatment as output operands aren't compatible with GCC's asm-goto due to internal compiler restrictions. Acked-by: Paolo Bonzini Cc: Josh Poimboeuf Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/ops.h | 72 +++++++++++++++++++++++------------------- arch/x86/kvm/vmx/vmx.c | 34 ++++++++++++++++++++ 2 files changed, 74 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 2200fb698dd0..79e25d49d4d9 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -14,6 +14,12 @@ #define __ex_clear(x, reg) \ ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) +void vmwrite_error(unsigned long field, unsigned long value); +void vmclear_error(struct vmcs *vmcs, u64 phys_addr); +void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); +void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); +void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); + static __always_inline void vmcs_check16(unsigned long field) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, @@ -103,21 +109,39 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) return __vmcs_readl(field); } -static noinline void vmwrite_error(unsigned long field, unsigned long value) -{ - printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); - dump_stack(); -} +#define vmx_asm1(insn, op1, error_args...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + ".byte 0x2e\n\t" /* branch not taken hint */ \ + "jna %l[error]\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + : : op1 : "cc" : error, fault); \ + return; \ +error: \ + insn##_error(error_args); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +#define vmx_asm2(insn, op1, op2, error_args...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + ".byte 0x2e\n\t" /* branch not taken hint */ \ + "jna %l[error]\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + : : op1, op2 : "cc" : error, fault); \ + return; \ +error: \ + insn##_error(error_args); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { - bool error; - - asm volatile (__ex("vmwrite %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(field), "rm"(value)); - if (unlikely(error)) - vmwrite_error(field, value); + vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value); } static __always_inline void vmcs_write16(unsigned long field, u16 value) @@ -182,28 +206,18 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) static inline void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - bool error; - asm volatile (__ex("vmclear %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", - vmcs, phys_addr); + vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr); } static inline void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - bool error; if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex("vmptrld %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); + vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); } static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) @@ -213,11 +227,8 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) u64 rsvd : 48; u64 gva; } operand = { vpid, 0, gva }; - bool error; - asm volatile (__ex("invvpid %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); + vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva); } static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) @@ -225,11 +236,8 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) struct { u64 eptp, gpa; } operand = {eptp, gpa}; - bool error; - asm volatile (__ex("invept %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); + vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa); } static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a7c9922e3905..52c7614b5ecd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -343,6 +343,40 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit void vmx_vmexit(void); +#define vmx_insn_failed(fmt...) \ +do { \ + WARN_ONCE(1, fmt); \ + pr_warn_ratelimited(fmt); \ +} while (0) + +noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + +noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) +{ + vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); +} + +noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) +{ + vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); +} + +noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) +{ + vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", + ext, vpid, gva); +} + +noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) +{ + vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", + ext, eptp, gpa); +} + static DEFINE_PER_CPU(struct vmcs *, vmxarea); DEFINE_PER_CPU(struct vmcs *, current_vmcs); /* From 6e2020977e3e692a8bb31258c1f8251521f4fdb7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2019 13:41:08 -0700 Subject: [PATCH 1178/2880] KVM: VMX: Add error handling to VMREAD helper Now that VMREAD flows require a taken branch, courtesy of commit 3901336ed9887 ("x86/kvm: Don't call kvm_spurious_fault() from .fixup") bite the bullet and add full error handling to VMREAD, i.e. replace the JMP added by __ex()/____kvm_handle_fault_on_reboot() with a hinted Jcc. To minimize the code footprint, add a helper function, vmread_error(), to handle both faults and failures so that the inline flow has a single CALL. Acked-by: Paolo Bonzini Cc: Josh Poimboeuf Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/ops.h | 21 +++++++++++++++++---- arch/x86/kvm/vmx/vmx.c | 8 ++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 79e25d49d4d9..45eaedee2ac0 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -11,9 +11,8 @@ #include "vmcs.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) +asmlinkage void vmread_error(unsigned long field, bool fault); void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); @@ -68,8 +67,22 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; - asm volatile (__ex_clear("vmread %1, %0", "%k0") - : "=r"(value) : "r"(field)); + asm volatile("1: vmread %2, %1\n\t" + ".byte 0x3e\n\t" /* branch taken hint */ + "ja 3f\n\t" + "mov %2, %%" _ASM_ARG1 "\n\t" + "xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t" + "2: call vmread_error\n\t" + "xor %k1, %k1\n\t" + "3:\n\t" + + ".pushsection .fixup, \"ax\"\n\t" + "4: mov %2, %%" _ASM_ARG1 "\n\t" + "mov $1, %%" _ASM_ARG2 "\n\t" + "jmp 2b\n\t" + ".popsection\n\t" + _ASM_EXTABLE(1b, 4b) + : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc"); return value; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 52c7614b5ecd..1852706e6acc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -349,6 +349,14 @@ do { \ pr_warn_ratelimited(fmt); \ } while (0) +asmlinkage void vmread_error(unsigned long field, bool fault) +{ + if (fault) + kvm_spurious_fault(); + else + vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); +} + noinline void vmwrite_error(unsigned long field, unsigned long value) { vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", From 98cd382d509061a9c6ae4cac38ff0721be05adef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2019 13:41:09 -0700 Subject: [PATCH 1179/2880] KVM: x86: Drop ____kvm_handle_fault_on_reboot() Remove the variation of __kvm_handle_fault_on_reboot() that accepts a post-fault cleanup instruction now that its sole user (VMREAD) uses a different method for handling faults. Acked-by: Paolo Bonzini Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index de0c9d8097c2..202f4d4a892a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1542,7 +1542,7 @@ asmlinkage void kvm_spurious_fault(void); * Usually after catching the fault we just panic; during reboot * instead the instruction is ignored. */ -#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ +#define __kvm_handle_fault_on_reboot(insn) \ "666: \n\t" \ insn "\n\t" \ "jmp 668f \n\t" \ @@ -1551,16 +1551,12 @@ asmlinkage void kvm_spurious_fault(void); "668: \n\t" \ ".pushsection .fixup, \"ax\" \n\t" \ "700: \n\t" \ - cleanup_insn "\n\t" \ "cmpb $0, kvm_rebooting\n\t" \ "je 667b \n\t" \ "jmp 668b \n\t" \ ".popsection \n\t" \ _ASM_EXTABLE(666b, 700b) -#define __kvm_handle_fault_on_reboot(insn) \ - ____kvm_handle_fault_on_reboot(insn, "") - #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); From f209a26dd5a5038c53097b5c38e313b8cd042adb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2019 13:41:10 -0700 Subject: [PATCH 1180/2880] KVM: x86: Don't check kvm_rebooting in __kvm_handle_fault_on_reboot() Remove the kvm_rebooting check from VMX/SVM instruction exception fixup now that kvm_spurious_fault() conditions its BUG() on !kvm_rebooting. Because the 'cleanup_insn' functionally is also gone, deferring to kvm_spurious_fault() means __kvm_handle_fault_on_reboot() can eliminate its .fixup code entirely and have its exception table entry branch directly to the call to kvm_spurious_fault(). Cc: Paolo Bonzini Cc: Josh Poimboeuf Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 202f4d4a892a..23edf56cf577 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1549,13 +1549,7 @@ asmlinkage void kvm_spurious_fault(void); "667: \n\t" \ "call kvm_spurious_fault \n\t" \ "668: \n\t" \ - ".pushsection .fixup, \"ax\" \n\t" \ - "700: \n\t" \ - "cmpb $0, kvm_rebooting\n\t" \ - "je 667b \n\t" \ - "jmp 668b \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(666b, 700b) + _ASM_EXTABLE(666b, 667b) #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); From cab01850277a827c57270bba48be1d8fe312643d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 25 Sep 2019 15:30:35 +0200 Subject: [PATCH 1181/2880] KVM: vmx: fix build warnings in hv_enable_direct_tlbflush() on i386 The following was reported on i386: arch/x86/kvm/vmx/vmx.c: In function 'hv_enable_direct_tlbflush': arch/x86/kvm/vmx/vmx.c:503:10: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] pr_debugs() in this function are more or less useless, let's just remove them. evmcs->hv_vm_id can use 'unsigned long' instead of 'u64'. Also, simplify the code a little bit. Reported-by: kbuild test robot Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1852706e6acc..d4575ffb3cec 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -537,23 +537,19 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) * Synthetic VM-Exit is not enabled in current code and so All * evmcs in singe VM shares same assist page. */ - if (!*p_hv_pa_pg) { + if (!*p_hv_pa_pg) *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!*p_hv_pa_pg) - return -ENOMEM; - pr_debug("KVM: Hyper-V: allocated PA_PG for %llx\n", - (u64)&vcpu->kvm); - } + + if (!*p_hv_pa_pg) + return -ENOMEM; evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; evmcs->partition_assist_page = __pa(*p_hv_pa_pg); - evmcs->hv_vm_id = (u64)vcpu->kvm; + evmcs->hv_vm_id = (unsigned long)vcpu->kvm; evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; - pr_debug("KVM: Hyper-V: enabled DIRECT flush for %llx\n", - (u64)vcpu->kvm); return 0; } From fab273595507a9ec7035df6d5512a955d80a80ba Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 25 Sep 2019 10:13:27 +0800 Subject: [PATCH 1182/2880] btrfs: Fix a regression which we can't convert to SINGLE profile [BUG] With v5.3 kernel, we can't convert to SINGLE profile: # btrfs balance start -f -dconvert=single $mnt ERROR: error during balancing '/mnt/btrfs': Invalid argument # dmesg -t | tail validate_convert_profile: data profile=0x1000000000000 allowed=0x20 is_valid=1 final=0x1000000000000 ret=1 BTRFS error (device dm-3): balance: invalid convert data profile single [CAUSE] With the extra debug output added, it shows that the @allowed bit is lacking the special in-memory only SINGLE profile bit. Thus we fail at that (profile & ~allowed) check. This regression is caused by commit 081db89b13cb ("btrfs: use raid_attr to get allowed profiles for balance conversion") and the fact that we don't use any bit to indicate SINGLE profile on-disk, but uses special in-memory only bit to help distinguish different profiles. [FIX] Add that BTRFS_AVAIL_ALLOC_BIT_SINGLE to @allowed, so the code should be the same as it was and fix the regression. Reported-by: Chris Murphy Fixes: 081db89b13cb ("btrfs: use raid_attr to get allowed profiles for balance conversion") CC: stable@vger.kernel.org # 5.3+ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a324480bc88b..cdd7af424033 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4063,7 +4063,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } num_devices = btrfs_num_devices(fs_info); - allowed = 0; + + /* + * SINGLE profile on-disk has no profile bit, but in-memory we have a + * special bit for it, to make it easier to distinguish. Thus we need + * to set it manually, or balance would refuse the profile. + */ + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; From e41f9efb85d38d95744b9f35b9903109032b93d4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 25 Sep 2019 14:09:30 +0100 Subject: [PATCH 1183/2880] sunrpc: clean up indentation issue There are statements that are indented incorrectly, remove the extraneous spacing. Signed-off-by: Colin Ian King Signed-off-by: J. Bruce Fields --- net/sunrpc/svc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 220b79988000..d11b70552c33 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1233,8 +1233,8 @@ svc_generic_init_request(struct svc_rqst *rqstp, if (rqstp->rq_vers >= progp->pg_nvers ) goto err_bad_vers; - versp = progp->pg_vers[rqstp->rq_vers]; - if (!versp) + versp = progp->pg_vers[rqstp->rq_vers]; + if (!versp) goto err_bad_vers; /* From 3fbd7ee285b2bbc6eebd15a3c8786d9776a402a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:33:34 -0500 Subject: [PATCH 1184/2880] tasks: Add a count of task RCU users Add a count of the number of RCU users (currently 1) of the task struct so that we can later add the scheduler case and get rid of the very subtle task_rcu_dereference(), and just use rcu_dereference(). As suggested by Oleg have the count overlap rcu_head so that no additional space in task_struct is required. Inspired-by: Linus Torvalds Inspired-by: Oleg Nesterov Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87woebdplt.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++++- include/linux/sched/task.h | 1 + kernel/exit.c | 7 ++++++- kernel/fork.c | 7 +++---- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index e2e91960d79f..8e43e54a02c7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1147,7 +1147,10 @@ struct task_struct { struct tlbflush_unmap_batch tlb_ubc; - struct rcu_head rcu; + union { + refcount_t rcu_users; + struct rcu_head rcu; + }; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 3d90ed8f75f0..153a683646ac 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -120,6 +120,7 @@ static inline void put_task_struct(struct task_struct *t) } struct task_struct *task_rcu_dereference(struct task_struct **ptask); +void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; diff --git a/kernel/exit.c b/kernel/exit.c index 22ab6a4bdc51..3bcaec2ea3ba 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(tsk); } +void put_task_struct_rcu_user(struct task_struct *task) +{ + if (refcount_dec_and_test(&task->rcu_users)) + call_rcu(&task->rcu, delayed_put_task_struct); +} void release_task(struct task_struct *p) { @@ -222,7 +227,7 @@ repeat: write_unlock_irq(&tasklist_lock); release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); + put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) diff --git a/kernel/fork.c b/kernel/fork.c index 1d1cd06edbc1..7eefe338d7a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -902,10 +902,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; - /* - * One for us, one for whoever does the "release_task()" (usually - * parent) - */ + /* One for the user space visible state that goes away when reaped. */ + refcount_set(&tsk->rcu_users, 1); + /* One for the rcu users, and one for the scheduler */ refcount_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; From 0ff7b2cfbae36ebcd216c6a5ad7f8534eebeaee2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:33:58 -0500 Subject: [PATCH 1185/2880] tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue In the ordinary case today the RCU grace period for a task_struct is triggered when another process wait's for it's zombine and causes the kernel to call release_task(). As the waiting task has to receive a signal and then act upon it before this happens, typically this will occur after the original task as been removed from the runqueue. Unfortunaty in some cases such as self reaping tasks it can be shown that release_task() will be called starting the grace period for task_struct long before the task leaves the runqueue. Therefore use put_task_struct_rcu_user() in finish_task_switch() to guarantee that the there is a RCU lifetime after the task leaves the runqueue. Besides the change in the start of the RCU grace period for the task_struct this change may cause perf_event_delayed_put and trace_sched_process_free. The function perf_event_delayed_put boils down to just a WARN_ON for cases that I assume never show happen. So I don't see any problem with delaying it. The function trace_sched_process_free is a trace point and thus visible to user space. Occassionally userspace has the strangest dependencies so this has a miniscule chance of causing a regression. This change only changes the timing of when the tracepoint is called. The change in timing arguably gives userspace a more accurate picture of what is going on. So I don't expect there to be a regression. In the case where a task self reaps we are pretty much guaranteed that the RCU grace period is delayed. So we should get quite a bit of coverage in of this worst case for the change in a normal threaded workload. So I expect any issues to turn up quickly or not at all. I have lightly tested this change and everything appears to work fine. Inspired-by: Linus Torvalds Inspired-by: Oleg Nesterov Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87r24jdpl5.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 11 +++++++---- kernel/sched/core.c | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 7eefe338d7a2..d6e552552e52 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -902,10 +902,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; - /* One for the user space visible state that goes away when reaped. */ - refcount_set(&tsk->rcu_users, 1); - /* One for the rcu users, and one for the scheduler */ - refcount_set(&tsk->usage, 2); + /* + * One for the user space visible state that goes away when reaped. + * One for the scheduler. + */ + refcount_set(&tsk->rcu_users, 2); + /* One for the rcu users */ + refcount_set(&tsk->usage, 1); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 06961b997ed6..5e5fefbd4cc7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3254,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) /* Task is done with its stack. */ put_task_stack(prev); - put_task_struct(prev); + put_task_struct_rcu_user(prev); } tick_nohz_task_switch(); From 154abafc68bfb7c2ef2ad5308a3b2de8968c3f61 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:34:30 -0500 Subject: [PATCH 1186/2880] tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code Remove work arounds that were written before there was a grace period after tasks left the runqueue in finish_task_switch(). In particular now that there tasks exiting the runqueue exprience a RCU grace period none of the work performed by task_rcu_dereference() excpet the rcu_dereference() is necessary so replace task_rcu_dereference() with rcu_dereference(). Remove the code in rcuwait_wait_event() that checks to ensure the current task has not exited. It is no longer necessary as it is guaranteed that any running task will experience a RCU grace period after it leaves the run queueue. Remove the comment in rcuwait_wake_up() as it is no longer relevant. Ref: 8f95c90ceb54 ("sched/wait, RCU: Introduce rcuwait machinery") Ref: 150593bf8693 ("sched/api: Introduce task_rcu_dereference() and try_get_task_struct()") Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87lfurdpk9.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- include/linux/rcuwait.h | 20 +++--------- include/linux/sched/task.h | 1 - kernel/exit.c | 67 -------------------------------------- kernel/sched/fair.c | 2 +- kernel/sched/membarrier.c | 4 +-- 5 files changed, 7 insertions(+), 87 deletions(-) diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 563290fc194f..75c97e4bbc57 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -6,16 +6,11 @@ /* * rcuwait provides a way of blocking and waking up a single - * task in an rcu-safe manner; where it is forbidden to use - * after exit_notify(). task_struct is not properly rcu protected, - * unless dealing with rcu-aware lists, ie: find_task_by_*(). + * task in an rcu-safe manner. * - * Alternatively we have task_rcu_dereference(), but the return - * semantics have different implications which would break the - * wakeup side. The only time @task is non-nil is when a user is - * blocked (or checking if it needs to) on a condition, and reset - * as soon as we know that the condition has succeeded and are - * awoken. + * The only time @task is non-nil is when a user is blocked (or + * checking if it needs to) on a condition, and reset as soon as we + * know that the condition has succeeded and are awoken. */ struct rcuwait { struct task_struct __rcu *task; @@ -37,13 +32,6 @@ extern void rcuwait_wake_up(struct rcuwait *w); */ #define rcuwait_wait_event(w, condition) \ ({ \ - /* \ - * Complain if we are called after do_exit()/exit_notify(), \ - * as we cannot rely on the rcu critical region for the \ - * wakeup side. \ - */ \ - WARN_ON(current->exit_state); \ - \ rcu_assign_pointer((w)->task, current); \ for (;;) { \ /* \ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 153a683646ac..4b1c3b664f51 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -119,7 +119,6 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -struct task_struct *task_rcu_dereference(struct task_struct **ptask); void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT diff --git a/kernel/exit.c b/kernel/exit.c index 3bcaec2ea3ba..a46a50d67002 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -234,69 +234,6 @@ repeat: goto repeat; } -/* - * Note that if this function returns a valid task_struct pointer (!NULL) - * task->usage must remain >0 for the duration of the RCU critical section. - */ -struct task_struct *task_rcu_dereference(struct task_struct **ptask) -{ - struct sighand_struct *sighand; - struct task_struct *task; - - /* - * We need to verify that release_task() was not called and thus - * delayed_put_task_struct() can't run and drop the last reference - * before rcu_read_unlock(). We check task->sighand != NULL, - * but we can read the already freed and reused memory. - */ -retry: - task = rcu_dereference(*ptask); - if (!task) - return NULL; - - probe_kernel_address(&task->sighand, sighand); - - /* - * Pairs with atomic_dec_and_test() in put_task_struct(). If this task - * was already freed we can not miss the preceding update of this - * pointer. - */ - smp_rmb(); - if (unlikely(task != READ_ONCE(*ptask))) - goto retry; - - /* - * We've re-checked that "task == *ptask", now we have two different - * cases: - * - * 1. This is actually the same task/task_struct. In this case - * sighand != NULL tells us it is still alive. - * - * 2. This is another task which got the same memory for task_struct. - * We can't know this of course, and we can not trust - * sighand != NULL. - * - * In this case we actually return a random value, but this is - * correct. - * - * If we return NULL - we can pretend that we actually noticed that - * *ptask was updated when the previous task has exited. Or pretend - * that probe_slab_address(&sighand) reads NULL. - * - * If we return the new task (because sighand is not NULL for any - * reason) - this is fine too. This (new) task can't go away before - * another gp pass. - * - * And note: We could even eliminate the false positive if re-read - * task->sighand once again to avoid the falsely NULL. But this case - * is very unlikely so we don't care. - */ - if (!sighand) - return NULL; - - return task; -} - void rcuwait_wake_up(struct rcuwait *w) { struct task_struct *task; @@ -316,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w) */ smp_mb(); /* (B) */ - /* - * Avoid using task_rcu_dereference() magic as long as we are careful, - * see comment in rcuwait_wait_event() regarding ->exit_state. - */ task = rcu_dereference(w->task); if (task) wake_up_process(task); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3101c662426d..5bc23996ffae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1602,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env, return; rcu_read_lock(); - cur = task_rcu_dereference(&dst_rq->curr); + cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index aa8d75804108..b14250a11608 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -71,7 +71,7 @@ static int membarrier_global_expedited(void) continue; rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); + p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { if (!fallback) @@ -150,7 +150,7 @@ static int membarrier_private_expedited(int flags) if (cpu == raw_smp_processor_id()) continue; rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); + p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == current->mm) { if (!fallback) __cpumask_set_cpu(cpu, tmpmask); From 5311a98fef7d0dc2e8040ae0e18f5568d6d1dd5a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:35:02 -0500 Subject: [PATCH 1187/2880] tasks, sched/core: RCUify the assignment of rq->curr The current task on the runqueue is currently read with rcu_dereference(). To obtain ordinary RCU semantics for an rcu_dereference() of rq->curr it needs to be paired with rcu_assign_pointer() of rq->curr. Which provides the memory barrier necessary to order assignments to the task_struct and the assignment to rq->curr. Unfortunately the assignment of rq->curr in __schedule is a hot path, and it has already been show that additional barriers in that code will reduce the performance of the scheduler. So I will attempt to describe below why you can effectively have ordinary RCU semantics without any additional barriers. The assignment of rq->curr in init_idle is a slow path called once per cpu and that can use rcu_assign_pointer() without any concerns. As I write this there are effectively two users of rcu_dereference() on rq->curr. There is the membarrier code in kernel/sched/membarrier.c that only looks at "->mm" after the rcu_dereference(). Then there is task_numa_compare() in kernel/sched/fair.c. My best reading of the code shows that task_numa_compare only access: "->flags", "->cpus_ptr", "->numa_group", "->numa_faults[]", "->total_numa_faults", and "->se.cfs_rq". The code in __schedule() essentially does: rq_lock(...); smp_mb__after_spinlock(); next = pick_next_task(...); rq->curr = next; context_switch(prev, next); At the start of the function the rq_lock/smp_mb__after_spinlock pair provides a full memory barrier. Further there is a full memory barrier in context_switch(). This means that any task that has already run and modified itself (the common case) has already seen two memory barriers before __schedule() runs and begins executing. A task that modifies itself then sees a third full memory barrier pair with the rq_lock(); For a brand new task that is enqueued with wake_up_new_task() there are the memory barriers present from the taking and release the pi_lock and the rq_lock as the processes is enqueued as well as the full memory barrier at the start of __schedule() assuming __schedule() happens on the same cpu. This means that by the time we reach the assignment of rq->curr except for values on the task struct modified in pick_next_task the code has the same guarantees as if it used rcu_assign_pointer(). Reading through all of the implementations of pick_next_task it appears pick_next_task is limited to modifying the task_struct fields "->se", "->rt", "->dl". These fields are the sched_entity structures of the varies schedulers. Further "->se.cfs_rq" is only changed in cgroup attach/move operations initialized by userspace. Unless I have missed something this means that in practice that the users of "rcu_dereference(rq->curr)" get normal RCU semantics of rcu_dereference() for the fields the care about, despite the assignment of rq->curr in __schedule() ot using rcu_assign_pointer. Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20190903200603.GW2349@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e5fefbd4cc7..84c71160beb1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4033,7 +4033,11 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; - rq->curr = next; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). + */ + RCU_INIT_POINTER(rq->curr, next); /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6060,7 +6064,8 @@ void init_idle(struct task_struct *idle, int cpu) __set_task_cpu(idle, cpu); rcu_read_unlock(); - rq->curr = rq->idle = idle; + rq->idle = idle; + rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP idle->on_cpu = 1; From fc0d77387cb5ae883fd774fc559e056a8dde024c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:36:59 -0400 Subject: [PATCH 1188/2880] sched/membarrier: Fix private expedited registration check Fix a logic flaw in the way membarrier_register_private_expedited() handles ready state checks for private expedited sync core and private expedited registrations. If a private expedited membarrier registration is first performed, and then a private expedited sync_core registration is performed, the ready state check will skip the second registration when it really should not. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-2-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index b14250a11608..d48b95fc4026 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -226,7 +226,7 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if (atomic_read(&mm->membarrier_state) & state) + if ((atomic_read(&mm->membarrier_state) & state) == state) return 0; atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) From 09554009c0cad4cb2223dd943c813c9257c6883a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:00 -0400 Subject: [PATCH 1189/2880] sched/membarrier: Remove redundant check Checking that the number of threads is 1 is redundant with checking mm_users == 1. No change in functionality intended. Suggested-by: Oleg Nesterov Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-3-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index d48b95fc4026..7ccbd0e19626 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -186,7 +186,7 @@ static int membarrier_register_global_expedited(void) MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { + if (atomic_read(&mm->mm_users) == 1) { /* * For single mm user, single threaded process, we can * simply issue a memory barrier after setting @@ -232,7 +232,7 @@ static int membarrier_register_private_expedited(int flags) if (flags & MEMBARRIER_FLAG_SYNC_CORE) atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, &mm->membarrier_state); - if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { + if (atomic_read(&mm->mm_users) != 1) { /* * Ensure all future scheduler executions will observe the * new thread flag state for this process. From 2840cf02fae627860156737e83326df354ee4ec6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:01 -0400 Subject: [PATCH 1190/2880] sched/membarrier: Call sync_core only before usermode for same mm When the prev and next task's mm change, switch_mm() provides the core serializing guarantees before returning to usermode. The only case where an explicit core serialization is needed is when the scheduler keeps the same mm for prev and next. Suggested-by: Oleg Nesterov Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-4-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- include/linux/sched/mm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 4a7944078cc3..8557ec664213 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -362,6 +362,8 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + if (current->mm != mm) + return; if (likely(!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) return; From 227a4aadc75ba22fcb6c4e1c078817b8cbaae4ce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:02 -0400 Subject: [PATCH 1191/2880] sched/membarrier: Fix p->mm->membarrier_state racy load The membarrier_state field is located within the mm_struct, which is not guaranteed to exist when used from runqueue-lock-free iteration on runqueues by the membarrier system call. Copy the membarrier_state from the mm_struct into the scheduler runqueue when the scheduler switches between mm. When registering membarrier for mm, after setting the registration bit in the mm membarrier state, issue a synchronize_rcu() to ensure the scheduler observes the change. In order to take care of the case where a runqueue keeps executing the target mm without swapping to other mm, iterate over each runqueue and issue an IPI to copy the membarrier_state from the mm_struct into each runqueue which have the same mm which state has just been modified. Move the mm membarrier_state field closer to pgd in mm_struct to use a cache line already touched by the scheduler switch_mm. The membarrier_execve() (now membarrier_exec_mmap) hook now needs to clear the runqueue's membarrier state in addition to clear the mm membarrier state, so move its implementation into the scheduler membarrier code so it can access the runqueue structure. Add memory barrier in membarrier_exec_mmap() prior to clearing the membarrier state, ensuring memory accesses executed prior to exec are not reordered with the stores clearing the membarrier state. As suggested by Linus, move all membarrier.c RCU read-side locks outside of the for each cpu loops. Suggested-by: Linus Torvalds Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-5-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- fs/exec.c | 2 +- include/linux/mm_types.h | 14 ++- include/linux/sched/mm.h | 8 +- kernel/sched/core.c | 4 +- kernel/sched/membarrier.c | 175 ++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 34 ++++++++ 6 files changed, 183 insertions(+), 54 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index f7f6a140856a..555e93c7dec8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm) } task_lock(tsk); active_mm = tsk->active_mm; + membarrier_exec_mmap(mm); tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); @@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - membarrier_execve(current); rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a7a1083b6fb..ec9bd3a6c827 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -383,6 +383,16 @@ struct mm_struct { unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; +#ifdef CONFIG_MEMBARRIER + /** + * @membarrier_state: Flags controlling membarrier behavior. + * + * This field is close to @pgd to hopefully fit in the same + * cache-line, which needs to be touched by switch_mm(). + */ + atomic_t membarrier_state; +#endif + /** * @mm_users: The number of users including userspace. * @@ -452,9 +462,7 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access */ struct core_state *core_state; /* coredumping support */ -#ifdef CONFIG_MEMBARRIER - atomic_t membarrier_state; -#endif + #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8557ec664213..e6770012db18 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -370,10 +370,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) sync_core_before_usermode(); } -static inline void membarrier_execve(struct task_struct *t) -{ - atomic_set(&t->mm->membarrier_state, 0); -} +extern void membarrier_exec_mmap(struct mm_struct *mm); + #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, @@ -382,7 +380,7 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, { } #endif -static inline void membarrier_execve(struct task_struct *t) +static inline void membarrier_exec_mmap(struct mm_struct *mm) { } static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84c71160beb1..2d9a3947bef4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev, else prev->active_mm = NULL; } else { // to user + membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting - * rq->curr and returning to userspace. + * rq->curr / membarrier_switch_mm() and returning to userspace. * * The below provides this either through switch_mm(), or in * case 'prev->active_mm == next->mm' through * finish_task_switch()'s mmdrop(). */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); if (!prev->mm) { // from kernel diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 7ccbd0e19626..070cf433bb9a 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -30,6 +30,39 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_rq_state(void *info) +{ + struct mm_struct *mm = (struct mm_struct *) info; + + if (current->mm != mm) + return; + this_cpu_write(runqueues.membarrier_state, + atomic_read(&mm->membarrier_state)); + /* + * Issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to + * guarantee that no memory access following registration is reordered + * before registration. + */ + smp_mb(); +} + +void membarrier_exec_mmap(struct mm_struct *mm) +{ + /* + * Issue a memory barrier before clearing membarrier_state to + * guarantee that no memory access prior to exec is reordered after + * clearing this state. + */ + smp_mb(); + atomic_set(&mm->membarrier_state, 0); + /* + * Keep the runqueue membarrier_state in sync with this mm + * membarrier_state. + */ + this_cpu_write(runqueues.membarrier_state, 0); +} + static int membarrier_global_expedited(void) { int cpu; @@ -56,6 +89,7 @@ static int membarrier_global_expedited(void) } cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -70,17 +104,25 @@ static int membarrier_global_expedited(void) if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); + if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) + continue; + + /* + * Skip the CPU if it runs a kernel thread. The scheduler + * leaves the prior task mm in place as an optimization when + * scheduling a kthread. + */ p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & - MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); + if (p->flags & PF_KTHREAD) + continue; + + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); } + rcu_read_unlock(); if (!fallback) { preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); @@ -136,6 +178,7 @@ static int membarrier_private_expedited(int flags) } cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -157,8 +200,8 @@ static int membarrier_private_expedited(int flags) else smp_call_function_single(cpu, ipi_mb, NULL, 1); } - rcu_read_unlock(); } + rcu_read_unlock(); if (!fallback) { preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); @@ -177,32 +220,78 @@ static int membarrier_private_expedited(int flags) return 0; } +static int sync_runqueues_membarrier_state(struct mm_struct *mm) +{ + int membarrier_state = atomic_read(&mm->membarrier_state); + cpumask_var_t tmpmask; + int cpu; + + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) { + this_cpu_write(runqueues.membarrier_state, membarrier_state); + + /* + * For single mm user, we can simply issue a memory barrier + * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the + * mm and in the current runqueue to guarantee that no memory + * access following registration is reordered before + * registration. + */ + smp_mb(); + return 0; + } + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + /* + * For mm with multiple users, we need to ensure all future + * scheduler executions will observe @mm's new membarrier + * state. + */ + synchronize_rcu(); + + /* + * For each cpu runqueue, if the task's mm match @mm, ensure that all + * @mm's membarrier state set bits are also set in in the runqueue's + * membarrier state. This ensures that a runqueue scheduling + * between threads which are users of @mm has its membarrier state + * updated. + */ + cpus_read_lock(); + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + p = rcu_dereference(&rq->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); + cpus_read_unlock(); + + return 0; +} + static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + int ret; if (atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1) { - /* - * For single mm user, single threaded process, we can - * simply issue a memory barrier after setting - * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that - * no memory access following registration is reordered - * before registration. - */ - smp_mb(); - } else { - /* - * For multi-mm user threads, we need to ensure all - * future scheduler executions will observe the new - * thread flag state for this mm. - */ - synchronize_rcu(); - } + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -213,12 +302,15 @@ static int membarrier_register_private_expedited(int flags) { struct task_struct *p = current; struct mm_struct *mm = p->mm; - int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, + ret; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; } /* @@ -226,20 +318,15 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if ((atomic_read(&mm->membarrier_state) & state) == state) + if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state) return 0; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, - &mm->membarrier_state); - if (atomic_read(&mm->mm_users) != 1) { - /* - * Ensure all future scheduler executions will observe the - * new thread flag state for this process. - */ - synchronize_rcu(); - } - atomic_or(state, &mm->membarrier_state); + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + atomic_or(set_state, &mm->membarrier_state); + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; + atomic_or(ready_state, &mm->membarrier_state); return 0; } @@ -253,8 +340,10 @@ static int membarrier_register_private_expedited(int flags) * command specified does not exist, not available on the running * kernel, or if the command argument is invalid, this system call * returns -EINVAL. For a given command, with flags argument set to 0, - * this system call is guaranteed to always return the same value until - * reboot. + * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to + * always return the same value until reboot. In addition, it can return + * -ENOMEM if there is not enough memory available to perform the system + * call. * * All memory accesses performed in program order from each targeted thread * is guaranteed to be ordered with respect to sys_membarrier(). If we use diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3cb895d14a2..0db2c1b3361e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -911,6 +911,10 @@ struct rq { atomic_t nr_iowait; +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void) static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. + */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif From 19a4ff534bb09686f53800564cb977bad2177c00 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:03 -0400 Subject: [PATCH 1192/2880] selftests, sched/membarrier: Add multi-threaded test membarrier commands cover very different code paths if they are in a single-threaded vs multi-threaded process. Therefore, exercise both scenarios in the kernel selftests to increase coverage of this selftest. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Shuah Khan Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-6-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- tools/testing/selftests/membarrier/.gitignore | 3 +- tools/testing/selftests/membarrier/Makefile | 5 +- ...mbarrier_test.c => membarrier_test_impl.h} | 40 +++++----- .../membarrier/membarrier_test_multi_thread.c | 73 +++++++++++++++++++ .../membarrier_test_single_thread.c | 24 ++++++ 5 files changed, 124 insertions(+), 21 deletions(-) rename tools/testing/selftests/membarrier/{membarrier_test.c => membarrier_test_impl.h} (95%) create mode 100644 tools/testing/selftests/membarrier/membarrier_test_multi_thread.c create mode 100644 tools/testing/selftests/membarrier/membarrier_test_single_thread.c diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore index 020c44f49a9e..f2f7ec0a99b4 100644 --- a/tools/testing/selftests/membarrier/.gitignore +++ b/tools/testing/selftests/membarrier/.gitignore @@ -1 +1,2 @@ -membarrier_test +membarrier_test_multi_thread +membarrier_test_single_thread diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile index 97e3bdf3d1e9..34d1c81a2324 100644 --- a/tools/testing/selftests/membarrier/Makefile +++ b/tools/testing/selftests/membarrier/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g -I../../../../usr/include/ +LDLIBS += -lpthread -TEST_GEN_PROGS := membarrier_test +TEST_GEN_PROGS := membarrier_test_single_thread \ + membarrier_test_multi_thread include ../lib.mk - diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test_impl.h similarity index 95% rename from tools/testing/selftests/membarrier/membarrier_test.c rename to tools/testing/selftests/membarrier/membarrier_test_impl.h index 70b4ddbf126b..186be69f0a59 100644 --- a/tools/testing/selftests/membarrier/membarrier_test.c +++ b/tools/testing/selftests/membarrier/membarrier_test_impl.h @@ -1,10 +1,11 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include +#include #include "../kselftest.h" @@ -223,7 +224,7 @@ static int test_membarrier_global_expedited_success(void) return 0; } -static int test_membarrier(void) +static int test_membarrier_fail(void) { int status; @@ -233,10 +234,27 @@ static int test_membarrier(void) status = test_membarrier_flags_fail(); if (status) return status; - status = test_membarrier_global_success(); + status = test_membarrier_private_expedited_fail(); if (status) return status; - status = test_membarrier_private_expedited_fail(); + status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0); + if (status < 0) { + ksft_test_result_fail("sys_membarrier() failed\n"); + return status; + } + if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) { + status = test_membarrier_private_expedited_sync_core_fail(); + if (status) + return status; + } + return 0; +} + +static int test_membarrier_success(void) +{ + int status; + + status = test_membarrier_global_success(); if (status) return status; status = test_membarrier_register_private_expedited_success(); @@ -251,9 +269,6 @@ static int test_membarrier(void) return status; } if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) { - status = test_membarrier_private_expedited_sync_core_fail(); - if (status) - return status; status = test_membarrier_register_private_expedited_sync_core_success(); if (status) return status; @@ -300,14 +315,3 @@ static int test_membarrier_query(void) ksft_test_result_pass("sys_membarrier available\n"); return 0; } - -int main(int argc, char **argv) -{ - ksft_print_header(); - ksft_set_plan(13); - - test_membarrier_query(); - test_membarrier(); - - return ksft_exit_pass(); -} diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c new file mode 100644 index 000000000000..ac5613e5b0eb --- /dev/null +++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "membarrier_test_impl.h" + +static int thread_ready, thread_quit; +static pthread_mutex_t test_membarrier_thread_mutex = + PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t test_membarrier_thread_cond = + PTHREAD_COND_INITIALIZER; + +void *test_membarrier_thread(void *arg) +{ + pthread_mutex_lock(&test_membarrier_thread_mutex); + thread_ready = 1; + pthread_cond_broadcast(&test_membarrier_thread_cond); + pthread_mutex_unlock(&test_membarrier_thread_mutex); + + pthread_mutex_lock(&test_membarrier_thread_mutex); + while (!thread_quit) + pthread_cond_wait(&test_membarrier_thread_cond, + &test_membarrier_thread_mutex); + pthread_mutex_unlock(&test_membarrier_thread_mutex); + + return NULL; +} + +static int test_mt_membarrier(void) +{ + int i; + pthread_t test_thread; + + pthread_create(&test_thread, NULL, + test_membarrier_thread, NULL); + + pthread_mutex_lock(&test_membarrier_thread_mutex); + while (!thread_ready) + pthread_cond_wait(&test_membarrier_thread_cond, + &test_membarrier_thread_mutex); + pthread_mutex_unlock(&test_membarrier_thread_mutex); + + test_membarrier_fail(); + + test_membarrier_success(); + + pthread_mutex_lock(&test_membarrier_thread_mutex); + thread_quit = 1; + pthread_cond_broadcast(&test_membarrier_thread_cond); + pthread_mutex_unlock(&test_membarrier_thread_mutex); + + pthread_join(test_thread, NULL); + + return 0; +} + +int main(int argc, char **argv) +{ + ksft_print_header(); + ksft_set_plan(13); + + test_membarrier_query(); + + /* Multi-threaded */ + test_mt_membarrier(); + + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c new file mode 100644 index 000000000000..c1c963902854 --- /dev/null +++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "membarrier_test_impl.h" + +int main(int argc, char **argv) +{ + ksft_print_header(); + ksft_set_plan(13); + + test_membarrier_query(); + + test_membarrier_fail(); + + test_membarrier_success(); + + return ksft_exit_pass(); +} From c6d68c1c4a4d6611fc0f8145d764226571d737ca Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:04 -0400 Subject: [PATCH 1193/2880] sched/membarrier: Skip IPIs when mm->mm_users == 1 If there is only a single mm_user for the mm, the private expedited membarrier command can skip the IPIs, because only a single thread is using the mm. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-7-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 070cf433bb9a..fced54ad0f3d 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -145,20 +145,21 @@ static int membarrier_private_expedited(int flags) int cpu; bool fallback = false; cpumask_var_t tmpmask; + struct mm_struct *mm = current->mm; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; } else { - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; } - if (num_online_cpus() == 1) + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) return 0; /* @@ -194,7 +195,7 @@ static int membarrier_private_expedited(int flags) continue; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == current->mm) { + if (p && p->mm == mm) { if (!fallback) __cpumask_set_cpu(cpu, tmpmask); else From c172e0a3e8e65a4c6fffec5bc4d6de08d6f894f7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:05 -0400 Subject: [PATCH 1194/2880] sched/membarrier: Return -ENOMEM to userspace on memory allocation failure Remove the IPI fallback code from membarrier to deal with very infrequent cpumask memory allocation failure. Use GFP_KERNEL rather than GFP_NOWAIT, and relax the blocking guarantees for the expedited membarrier system call commands, allowing it to block if waiting for memory to be made available. In addition, now -ENOMEM can be returned to user-space if the cpumask memory allocation fails. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-8-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 63 +++++++++++++-------------------------- 1 file changed, 20 insertions(+), 43 deletions(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index fced54ad0f3d..a39bed2c784f 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -66,7 +66,6 @@ void membarrier_exec_mmap(struct mm_struct *mm) static int membarrier_global_expedited(void) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; if (num_online_cpus() == 1) @@ -78,15 +77,8 @@ static int membarrier_global_expedited(void) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); rcu_read_lock(); @@ -117,18 +109,15 @@ static int membarrier_global_expedited(void) if (p->flags & PF_KTHREAD) continue; - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); + __cpumask_set_cpu(cpu, tmpmask); } rcu_read_unlock(); - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); - } + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -143,7 +132,6 @@ static int membarrier_global_expedited(void) static int membarrier_private_expedited(int flags) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; @@ -168,15 +156,8 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); rcu_read_lock(); @@ -195,20 +176,16 @@ static int membarrier_private_expedited(int flags) continue; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); } rcu_read_unlock(); - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); - } + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -264,7 +241,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) struct rq *rq = cpu_rq(cpu); struct task_struct *p; - p = rcu_dereference(&rq->curr); + p = rcu_dereference(rq->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); } From 714e501e16cd473538b609b3e351b2cc9f7f09ed Mon Sep 17 00:00:00 2001 From: KeMeng Shi Date: Mon, 16 Sep 2019 06:53:28 +0000 Subject: [PATCH 1195/2880] sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr() An oops can be triggered in the scheduler when running qemu on arm64: Unable to handle kernel paging request at virtual address ffff000008effe40 Internal error: Oops: 96000007 [#1] SMP Process migration/0 (pid: 12, stack limit = 0x00000000084e3736) pstate: 20000085 (nzCv daIf -PAN -UAO) pc : __ll_sc___cmpxchg_case_acq_4+0x4/0x20 lr : move_queued_task.isra.21+0x124/0x298 ... Call trace: __ll_sc___cmpxchg_case_acq_4+0x4/0x20 __migrate_task+0xc8/0xe0 migration_cpu_stop+0x170/0x180 cpu_stopper_thread+0xec/0x178 smpboot_thread_fn+0x1ac/0x1e8 kthread+0x134/0x138 ret_from_fork+0x10/0x18 __set_cpus_allowed_ptr() will choose an active dest_cpu in affinity mask to migrage the process if process is not currently running on any one of the CPUs specified in affinity mask. __set_cpus_allowed_ptr() will choose an invalid dest_cpu (dest_cpu >= nr_cpu_ids, 1024 in my virtual machine) if CPUS in an affinity mask are deactived by cpu_down after cpumask_intersects check. cpumask_test_cpu() of dest_cpu afterwards is overflown and may pass if corresponding bit is coincidentally set. As a consequence, kernel will access an invalid rq address associate with the invalid CPU in migration_cpu_stop->__migrate_task->move_queued_task and the Oops occurs. The reproduce the crash: 1) A process repeatedly binds itself to cpu0 and cpu1 in turn by calling sched_setaffinity. 2) A shell script repeatedly does "echo 0 > /sys/devices/system/cpu/cpu1/online" and "echo 1 > /sys/devices/system/cpu/cpu1/online" in turn. 3) Oops appears if the invalid CPU is set in memory after tested cpumask. Signed-off-by: KeMeng Shi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/1568616808-16808-1-git-send-email-shikemeng@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d9a3947bef4..83ea23e9e91f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1656,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } @@ -1677,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ From 763a9ec06c409dcde2a761aac4bb83ff3938e0b3 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 20 Aug 2019 14:40:55 -0400 Subject: [PATCH 1196/2880] sched/fair: Fix -Wunused-but-set-variable warnings Commit: de53fd7aedb1 ("sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices") introduced a few compilation warnings: kernel/sched/fair.c: In function '__refill_cfs_bandwidth_runtime': kernel/sched/fair.c:4365:6: warning: variable 'now' set but not used [-Wunused-but-set-variable] kernel/sched/fair.c: In function 'start_cfs_bandwidth': kernel/sched/fair.c:4992:6: warning: variable 'overrun' set but not used [-Wunused-but-set-variable] Also, __refill_cfs_bandwidth_runtime() does no longer update the expiration time, so fix the comments accordingly. Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Dave Chiluk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: pauld@redhat.com Fixes: de53fd7aedb1 ("sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices") Link: https://lkml.kernel.org/r/1566326455-8038-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5bc23996ffae..dfdac90fd211 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4353,21 +4353,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. * * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4983,15 +4978,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - u64 overrun; - lockdep_assert_held(&cfs_b->lock); if (cfs_b->period_active) return; cfs_b->period_active = 1; - overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } From a49b4f4012ef233143c5f7ce44f97851e54d5ef9 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Sep 2019 15:36:12 +0100 Subject: [PATCH 1197/2880] sched/core: Fix preempt_schedule() interrupt return comment preempt_schedule_irq() is the one that should be called on return from interrupt, clean up the comment to avoid any ambiguity. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190923143620.29334-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83ea23e9e91f..00ef44c8f7b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4218,9 +4218,8 @@ static void __sched notrace preempt_schedule_common(void) #ifdef CONFIG_PREEMPTION /* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. + * This is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. */ asmlinkage __visible void __sched notrace preempt_schedule(void) { @@ -4291,7 +4290,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #endif /* CONFIG_PREEMPTION */ /* - * this is the entry point to schedule() from kernel preemption + * This is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. From 9fc41acc89e58262c917b4be89ef00a87485804f Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Sep 2019 10:30:17 +0100 Subject: [PATCH 1198/2880] sched/core: Remove double update_max_interval() call on CPU startup update_max_interval() is called in both CPUHP_AP_SCHED_STARTING's startup and teardown callbacks, but it turns out it's also called at the end of the startup callback of CPUHP_AP_ACTIVE (which is further down the startup sequence). There's no point in repeating this interval update in the startup sequence since the CPU will remain online until it goes down the teardown path. Remove the redundant call in sched_cpu_activate() (CPUHP_AP_ACTIVE). Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20190923093017.11755-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00ef44c8f7b5..1b61cf48eee8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6425,8 +6425,6 @@ int sched_cpu_activate(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); - update_max_interval(); - return 0; } From 4892f51ad54ddff2883a60b6ad4323c1f632a9d6 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 20 Sep 2019 11:41:15 +0200 Subject: [PATCH 1199/2880] sched/fair: Avoid redundant EAS calculation The EAS wake-up path computes the system energy for several CPU candidates: the CPU with maximum spare capacity in each performance domain, and the prev_cpu. However, if prev_cpu also happens to be the CPU with maximum spare capacity in its performance domain, the energy calculation is still done twice, unnecessarily. Add a condition to filter out this corner case before doing the energy calculation. Reported-by: Pavan Kondeti Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: qais.yousef@arm.com Cc: rjw@rjwysocki.net Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Fixes: eb92692b2544 ("sched/fair: Speed-up energy-aware wake-ups") Link: https://lkml.kernel.org/r/20190920094115.GA11503@qperret.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dfdac90fd211..83ab35e2374f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6389,7 +6389,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0) { + if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { cur_delta = compute_energy(p, max_spare_cap_cpu, pd); cur_delta -= base_energy_pd; if (cur_delta < best_delta) { From 8ad8041b98c665b6147e607b749586d6e20ba73a Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 24 Sep 2019 09:25:52 -0700 Subject: [PATCH 1200/2880] ARM: OMAP2+: Fix missing reset done flag for am3 and am43 For ti,sysc-omap4 compatible devices with no sysstatus register, we do have reset done status available in the SOFTRESET bit that clears when the reset is done. This is documented for example in am437x TRM for DMTIMER_TIOCP_CFG register. The am335x TRM just says that SOFTRESET bit value 1 means reset is ongoing, but it behaves the same way clearing after reset is done. With the ti-sysc driver handling this automatically based on no sysstatus register defined, we see warnings if SYSC_HAS_RESET_STATUS is missing in the legacy platform data: ti-sysc 48042000.target-module: sysc_flags 00000222 != 00000022 ti-sysc 48044000.target-module: sysc_flags 00000222 != 00000022 ti-sysc 48046000.target-module: sysc_flags 00000222 != 00000022 ... Let's fix these warnings by adding SYSC_HAS_RESET_STATUS. Let's also remove the useless parentheses while at it. If it turns out we do have ti,sysc-omap4 compatible devices without a working SOFTRESET bit we can set up additional quirk handling for it. Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/omap_hwmod_33xx_43xx_ipblock_data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-omap2/omap_hwmod_33xx_43xx_ipblock_data.c b/arch/arm/mach-omap2/omap_hwmod_33xx_43xx_ipblock_data.c index adb6271f819b..7773876d165f 100644 --- a/arch/arm/mach-omap2/omap_hwmod_33xx_43xx_ipblock_data.c +++ b/arch/arm/mach-omap2/omap_hwmod_33xx_43xx_ipblock_data.c @@ -811,7 +811,8 @@ static struct omap_hwmod_class_sysconfig am33xx_timer_sysc = { .rev_offs = 0x0000, .sysc_offs = 0x0010, .syss_offs = 0x0014, - .sysc_flags = (SYSC_HAS_SIDLEMODE | SYSC_HAS_SOFTRESET), + .sysc_flags = SYSC_HAS_SIDLEMODE | SYSC_HAS_SOFTRESET | + SYSC_HAS_RESET_STATUS, .idlemodes = (SIDLE_FORCE | SIDLE_NO | SIDLE_SMART | SIDLE_SMART_WKUP), .sysc_fields = &omap_hwmod_sysc_type2, From 17529d43b21c72466e9109d602c6f5c360a1a9e8 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 24 Sep 2019 09:25:51 -0700 Subject: [PATCH 1201/2880] ARM: OMAP2+: Add missing LCDC midlemode for am335x TRM "Table 13-34. SYSCONFIG Register Field Descriptions" lists both standbymode and idlemode that should be just the sidle and midle registers where midle is currently unconfigured for lcdc_sysc. As the dts data has been generated based on lcdc_sysc, we now have an empty "ti,sysc-midle" property. And so we currently get a warning for lcdc because of a difference with dts provided configuration compared to the legacy platform data. This is because lcdc has SYSC_HAS_MIDLEMODE configured in the platform data without configuring the modes. Let's fix the issue by adding the missing midlemode to lcdc_sysc, and configuring the "ti,sysc-midle" property based on the TRM values. Fixes: f711c575cfec ("ARM: dts: am335x: Add l4 interconnect hierarchy and ti-sysc data") Cc: Jyri Sarha Cc: Keerthy Cc: Robert Nelson Cc: Suman Anna Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am33xx-l4.dtsi | 4 +++- arch/arm/mach-omap2/omap_hwmod_33xx_data.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/am33xx-l4.dtsi b/arch/arm/boot/dts/am33xx-l4.dtsi index 1515f4f91499..3287cf695b5a 100644 --- a/arch/arm/boot/dts/am33xx-l4.dtsi +++ b/arch/arm/boot/dts/am33xx-l4.dtsi @@ -2038,7 +2038,9 @@ reg = <0xe000 0x4>, <0xe054 0x4>; reg-names = "rev", "sysc"; - ti,sysc-midle ; + ti,sysc-midle = , + , + ; ti,sysc-sidle = , , ; diff --git a/arch/arm/mach-omap2/omap_hwmod_33xx_data.c b/arch/arm/mach-omap2/omap_hwmod_33xx_data.c index c965af275e34..81d9912f17c8 100644 --- a/arch/arm/mach-omap2/omap_hwmod_33xx_data.c +++ b/arch/arm/mach-omap2/omap_hwmod_33xx_data.c @@ -231,8 +231,9 @@ static struct omap_hwmod am33xx_control_hwmod = { static struct omap_hwmod_class_sysconfig lcdc_sysc = { .rev_offs = 0x0, .sysc_offs = 0x54, - .sysc_flags = (SYSC_HAS_SIDLEMODE | SYSC_HAS_MIDLEMODE), - .idlemodes = (SIDLE_FORCE | SIDLE_NO | SIDLE_SMART), + .sysc_flags = SYSC_HAS_SIDLEMODE | SYSC_HAS_MIDLEMODE, + .idlemodes = SIDLE_FORCE | SIDLE_NO | SIDLE_SMART | + MSTANDBY_FORCE | MSTANDBY_NO | MSTANDBY_SMART, .sysc_fields = &omap_hwmod_sysc_type2, }; From cf395f7ddb9ebc6b2d28d83b53d18aa4e7c19701 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 24 Sep 2019 16:19:00 -0700 Subject: [PATCH 1202/2880] ARM: OMAP2+: Fix warnings with broken omap2_set_init_voltage() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code is currently unable to find the dts opp tables as ti-cpufreq needs to set them up first based on speed binning. We stopped initializing the opp tables with platform code years ago for device tree based booting with commit 92d51856d740 ("ARM: OMAP3+: do not register non-dt OPP tables for device tree boot"), and all of mach-omap2 is now booting using device tree. We currently get the following errors on init: omap2_set_init_voltage: unable to find boot up OPP for vdd_mpu omap2_set_init_voltage: unable to set vdd_mpu omap2_set_init_voltage: unable to find boot up OPP for vdd_core omap2_set_init_voltage: unable to set vdd_core omap2_set_init_voltage: unable to find boot up OPP for vdd_iva omap2_set_init_voltage: unable to set vdd_iva Let's just drop the unused code. Nowadays ti-cpufreq should be used to to initialize things properly. Cc: Adam Ford Cc: André Roth Cc: "H. Nikolaus Schaller" Cc: Nishanth Menon Cc: Tero Kristo Tested-by: Adam Ford #logicpd-torpedo-37xx-devkit Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/pm.c | 100 --------------------------------------- 1 file changed, 100 deletions(-) diff --git a/arch/arm/mach-omap2/pm.c b/arch/arm/mach-omap2/pm.c index 1fde1bf53fb6..7ac9af56762d 100644 --- a/arch/arm/mach-omap2/pm.c +++ b/arch/arm/mach-omap2/pm.c @@ -74,83 +74,6 @@ int omap_pm_clkdms_setup(struct clockdomain *clkdm, void *unused) return 0; } -/* - * This API is to be called during init to set the various voltage - * domains to the voltage as per the opp table. Typically we boot up - * at the nominal voltage. So this function finds out the rate of - * the clock associated with the voltage domain, finds out the correct - * opp entry and sets the voltage domain to the voltage specified - * in the opp entry - */ -static int __init omap2_set_init_voltage(char *vdd_name, char *clk_name, - const char *oh_name) -{ - struct voltagedomain *voltdm; - struct clk *clk; - struct dev_pm_opp *opp; - unsigned long freq, bootup_volt; - struct device *dev; - - if (!vdd_name || !clk_name || !oh_name) { - pr_err("%s: invalid parameters\n", __func__); - goto exit; - } - - if (!strncmp(oh_name, "mpu", 3)) - /* - * All current OMAPs share voltage rail and clock - * source, so CPU0 is used to represent the MPU-SS. - */ - dev = get_cpu_device(0); - else - dev = omap_device_get_by_hwmod_name(oh_name); - - if (IS_ERR(dev)) { - pr_err("%s: Unable to get dev pointer for hwmod %s\n", - __func__, oh_name); - goto exit; - } - - voltdm = voltdm_lookup(vdd_name); - if (!voltdm) { - pr_err("%s: unable to get vdd pointer for vdd_%s\n", - __func__, vdd_name); - goto exit; - } - - clk = clk_get(NULL, clk_name); - if (IS_ERR(clk)) { - pr_err("%s: unable to get clk %s\n", __func__, clk_name); - goto exit; - } - - freq = clk_get_rate(clk); - clk_put(clk); - - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - if (IS_ERR(opp)) { - pr_err("%s: unable to find boot up OPP for vdd_%s\n", - __func__, vdd_name); - goto exit; - } - - bootup_volt = dev_pm_opp_get_voltage(opp); - dev_pm_opp_put(opp); - - if (!bootup_volt) { - pr_err("%s: unable to find voltage corresponding to the bootup OPP for vdd_%s\n", - __func__, vdd_name); - goto exit; - } - - voltdm_scale(voltdm, bootup_volt); - return 0; - -exit: - pr_err("%s: unable to set vdd_%s\n", __func__, vdd_name); - return -EINVAL; -} - #ifdef CONFIG_SUSPEND static int omap_pm_enter(suspend_state_t suspend_state) { @@ -208,25 +131,6 @@ void omap_common_suspend_init(void *pm_suspend) } #endif /* CONFIG_SUSPEND */ -static void __init omap3_init_voltages(void) -{ - if (!soc_is_omap34xx()) - return; - - omap2_set_init_voltage("mpu_iva", "dpll1_ck", "mpu"); - omap2_set_init_voltage("core", "l3_ick", "l3_main"); -} - -static void __init omap4_init_voltages(void) -{ - if (!soc_is_omap44xx()) - return; - - omap2_set_init_voltage("mpu", "dpll_mpu_ck", "mpu"); - omap2_set_init_voltage("core", "l3_div_ck", "l3_main_1"); - omap2_set_init_voltage("iva", "dpll_iva_m5x2_ck", "iva"); -} - int __maybe_unused omap_pm_nop_init(void) { return 0; @@ -246,10 +150,6 @@ int __init omap2_common_pm_late_init(void) omap4_twl_init(); omap_voltage_late_init(); - /* Initialize the voltages */ - omap3_init_voltages(); - omap4_init_voltages(); - /* Smartreflex device init */ omap_devinit_smartreflex(); From fd3edd4a9066f28de99a16685a586d68a9f551f8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 25 Sep 2019 18:33:53 +0200 Subject: [PATCH 1203/2880] KVM: nVMX: cleanup and fix host 64-bit mode checks KVM was incorrectly checking vmcs12->host_ia32_efer even if the "load IA32_EFER" exit control was reset. Also, some checks were not using the new CC macro for tracing. Cleanup everything so that the vCPU's 64-bit mode is determined directly from EFER_LMA and the VMCS checks are based on that, which matches section 26.2.4 of the SDM. Cc: Sean Christopherson Cc: Krish Sadhukhan Fixes: 5845038c111db27902bc220a4f70070fe945871c Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 50 +++++++++++++++------------------------ 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 70d59d9304f2..41abc62c9a8a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2664,8 +2664,23 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; - ia32e = (vmcs12->vm_exit_controls & - VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; +#ifdef CONFIG_X86_64 + ia32e = !!(vcpu->arch.efer & EFER_LMA); +#else + ia32e = false; +#endif + + if (ia32e) { + if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || + CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) + return -EINVAL; + } else { + if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || + CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || + CC((vmcs12->host_rip) >> 32)) + return -EINVAL; + } if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || @@ -2684,35 +2699,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu))) - return -EINVAL; - - if (!(vmcs12->host_ia32_efer & EFER_LMA) && - ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || - (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE))) { - return -EINVAL; - } - - if ((vmcs12->host_ia32_efer & EFER_LMA) && - !(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) { - return -EINVAL; - } - - if (!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) && - ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || - (vmcs12->host_cr4 & X86_CR4_PCIDE) || - (((vmcs12->host_rip) >> 32) & 0xffffffff))) { - return -EINVAL; - } - - if ((vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) && - ((!(vmcs12->host_cr4 & X86_CR4_PAE)) || - (is_noncanonical_address(vmcs12->host_rip, vcpu)))) { - return -EINVAL; - } -#else - if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE || - vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) return -EINVAL; #endif From 9620bc361ac6e292ad2d6997b2f59f41f4e17862 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 25 Sep 2019 15:06:59 -0300 Subject: [PATCH 1204/2880] perf evsel: Remove need for symbol_conf in evsel_fprintf.c So that we an later link it to the python binding without having to drag the symbol object files. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-8823tveyasocnuoelq4qopwf@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 2 +- tools/perf/builtin-script.c | 6 ++++-- tools/perf/builtin-trace.c | 2 +- tools/perf/util/evsel.h | 7 ++++--- tools/perf/util/evsel_fprintf.c | 14 ++++++-------- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 079e67a36904..f9706306fea0 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2055,7 +2055,7 @@ static void timehist_print_sample(struct perf_sched *sched, EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE | EVSEL__PRINT_CALLCHAIN_ARROW | EVSEL__PRINT_SKIP_IGNORED, - &callchain_cursor, stdout); + &callchain_cursor, symbol_conf.bt_stop_list, stdout); out: printf("\n"); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 22c1d114014c..0d43e2e5afff 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1325,7 +1325,8 @@ static int perf_sample__fprintf_bts(struct perf_sample *sample, } else printed += fprintf(fp, "\n"); - printed += sample__fprintf_sym(sample, al, 0, print_opts, cursor, fp); + printed += sample__fprintf_sym(sample, al, 0, print_opts, cursor, + symbol_conf.bt_stop_list, fp); } /* print branch_to information */ @@ -1867,7 +1868,8 @@ static void process_event(struct perf_script *script, cursor = &callchain_cursor; fputc(cursor ? '\n' : ' ', fp); - sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor, fp); + sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor, + symbol_conf.bt_stop_list, fp); } if (PRINT_FIELD(IREGS)) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index c52c3120a811..318225c8d7a7 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2076,7 +2076,7 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sam EVSEL__PRINT_DSO | EVSEL__PRINT_UNKNOWN_AS_ADDR; - return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); + return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output); } static const char *errno_to_name(struct evsel *evsel, int err) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a5b29ac10da0..4a4c64833893 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -428,12 +428,13 @@ int perf_evsel__fprintf(struct evsel *evsel, struct callchain_cursor; int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, - unsigned int print_opts, - struct callchain_cursor *cursor, FILE *fp); + unsigned int print_opts, struct callchain_cursor *cursor, + struct strlist *bt_stop_list, FILE *fp); int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, int left_alignment, unsigned int print_opts, - struct callchain_cursor *cursor, FILE *fp); + struct callchain_cursor *cursor, + struct strlist *bt_stop_list, FILE *fp); bool perf_evsel__fallback(struct evsel *evsel, int err, char *msg, size_t msgsize); diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index a8cbdf4c2753..756b1e852db7 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -102,7 +102,7 @@ out: int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, unsigned int print_opts, struct callchain_cursor *cursor, - FILE *fp) + struct strlist *bt_stop_list, FILE *fp) { int printed = 0; struct callchain_cursor_node *node; @@ -175,10 +175,8 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, printed += fprintf(fp, "\n"); /* Add srccode here too? */ - if (symbol_conf.bt_stop_list && - node->sym && - strlist__has_entry(symbol_conf.bt_stop_list, - node->sym->name)) { + if (bt_stop_list && node->sym && + strlist__has_entry(bt_stop_list, node->sym->name)) { break; } @@ -193,7 +191,7 @@ next: int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, int left_alignment, unsigned int print_opts, - struct callchain_cursor *cursor, FILE *fp) + struct callchain_cursor *cursor, struct strlist *bt_stop_list, FILE *fp) { int printed = 0; int print_ip = print_opts & EVSEL__PRINT_IP; @@ -204,8 +202,8 @@ int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; if (cursor != NULL) { - printed += sample__fprintf_callchain(sample, left_alignment, - print_opts, cursor, fp); + printed += sample__fprintf_callchain(sample, left_alignment, print_opts, + cursor, bt_stop_list, fp); } else { printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); From ca1252779f48ece225c6003e01c675abb91cf1b4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 24 Sep 2019 15:41:51 -0300 Subject: [PATCH 1205/2880] perf evsel: Introduce evsel_fprintf.h We already had evsel_fprintf.c, add its counterpart, so that we can reduce evsel.h a bit more. We needed a new perf_event_attr_fprintf.c file so as to have a separate object to link with the python binding in tools/perf/util/python-ext-sources and not drag symbol_conf, etc into the python binding. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-06bdmt1062d9unzgqmxwlv88@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-evlist.c | 1 + tools/perf/builtin-sched.c | 1 + tools/perf/builtin-script.c | 1 + tools/perf/builtin-trace.c | 2 + tools/perf/util/Build | 1 + tools/perf/util/evsel.c | 153 +--------------------- tools/perf/util/evsel.h | 51 +------- tools/perf/util/evsel_fprintf.c | 1 + tools/perf/util/evsel_fprintf.h | 50 +++++++ tools/perf/util/header.c | 1 + tools/perf/util/perf_event_attr_fprintf.c | 148 +++++++++++++++++++++ tools/perf/util/python-ext-sources | 1 + 12 files changed, 218 insertions(+), 193 deletions(-) create mode 100644 tools/perf/util/evsel_fprintf.h create mode 100644 tools/perf/util/perf_event_attr_fprintf.c diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 60509ce4dd28..440501994931 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -10,6 +10,7 @@ #include "perf.h" #include "util/evlist.h" #include "util/evsel.h" +#include "util/evsel_fprintf.h" #include "util/parse-events.h" #include #include "util/session.h" diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index f9706306fea0..5cacc4f84c8d 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -6,6 +6,7 @@ #include "util/cpumap.h" #include "util/evlist.h" #include "util/evsel.h" +#include "util/evsel_fprintf.h" #include "util/symbol.h" #include "util/thread.h" #include "util/header.h" diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 0d43e2e5afff..286fc70d7402 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -17,6 +17,7 @@ #include "util/trace-event.h" #include "util/evlist.h" #include "util/evsel.h" +#include "util/evsel_fprintf.h" #include "util/evswitch.h" #include "util/sort.h" #include "util/data.h" diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 318225c8d7a7..bb5130d02155 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -28,6 +28,8 @@ #include "util/dso.h" #include "util/env.h" #include "util/event.h" +#include "util/evsel.h" +#include "util/evsel_fprintf.h" #include "util/synthetic-events.h" #include "util/evlist.h" #include "util/evswitch.h" diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 4d1894e38a81..8dcfca1a882f 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -11,6 +11,7 @@ perf-y += event.o perf-y += evlist.o perf-y += evsel.o perf-y += evsel_fprintf.o +perf-y += perf_event_attr_fprintf.o perf-y += evswitch.o perf-y += find_bit.o perf-y += get_current_dir_name.o diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9c284d2adcea..6323b0c60f6c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -30,6 +30,7 @@ #include "counts.h" #include "event.h" #include "evsel.h" +#include "util/evsel_fprintf.h" #include "evlist.h" #include #include "thread_map.h" @@ -1443,152 +1444,6 @@ static int get_group_fd(struct evsel *evsel, int cpu, int thread) return fd; } -struct bit_names { - int bit; - const char *name; -}; - -static void __p_bits(char *buf, size_t size, u64 value, struct bit_names *bits) -{ - bool first_bit = true; - int i = 0; - - do { - if (value & bits[i].bit) { - buf += scnprintf(buf, size, "%s%s", first_bit ? "" : "|", bits[i].name); - first_bit = false; - } - } while (bits[++i].name != NULL); -} - -static void __p_sample_type(char *buf, size_t size, u64 value) -{ -#define bit_name(n) { PERF_SAMPLE_##n, #n } - struct bit_names bits[] = { - bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR), - bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU), - bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), - bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), - bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), - bit_name(WEIGHT), bit_name(PHYS_ADDR), - { .name = NULL, } - }; -#undef bit_name - __p_bits(buf, size, value, bits); -} - -static void __p_branch_sample_type(char *buf, size_t size, u64 value) -{ -#define bit_name(n) { PERF_SAMPLE_BRANCH_##n, #n } - struct bit_names bits[] = { - bit_name(USER), bit_name(KERNEL), bit_name(HV), bit_name(ANY), - bit_name(ANY_CALL), bit_name(ANY_RETURN), bit_name(IND_CALL), - bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), - bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), - bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), - { .name = NULL, } - }; -#undef bit_name - __p_bits(buf, size, value, bits); -} - -static void __p_read_format(char *buf, size_t size, u64 value) -{ -#define bit_name(n) { PERF_FORMAT_##n, #n } - struct bit_names bits[] = { - bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING), - bit_name(ID), bit_name(GROUP), - { .name = NULL, } - }; -#undef bit_name - __p_bits(buf, size, value, bits); -} - -#define BUF_SIZE 1024 - -#define p_hex(val) snprintf(buf, BUF_SIZE, "%#"PRIx64, (uint64_t)(val)) -#define p_unsigned(val) snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val)) -#define p_signed(val) snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val)) -#define p_sample_type(val) __p_sample_type(buf, BUF_SIZE, val) -#define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val) -#define p_read_format(val) __p_read_format(buf, BUF_SIZE, val) - -#define PRINT_ATTRn(_n, _f, _p) \ -do { \ - if (attr->_f) { \ - _p(attr->_f); \ - ret += attr__fprintf(fp, _n, buf, priv);\ - } \ -} while (0) - -#define PRINT_ATTRf(_f, _p) PRINT_ATTRn(#_f, _f, _p) - -int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, - attr__fprintf_f attr__fprintf, void *priv) -{ - char buf[BUF_SIZE]; - int ret = 0; - - PRINT_ATTRf(type, p_unsigned); - PRINT_ATTRf(size, p_unsigned); - PRINT_ATTRf(config, p_hex); - PRINT_ATTRn("{ sample_period, sample_freq }", sample_period, p_unsigned); - PRINT_ATTRf(sample_type, p_sample_type); - PRINT_ATTRf(read_format, p_read_format); - - PRINT_ATTRf(disabled, p_unsigned); - PRINT_ATTRf(inherit, p_unsigned); - PRINT_ATTRf(pinned, p_unsigned); - PRINT_ATTRf(exclusive, p_unsigned); - PRINT_ATTRf(exclude_user, p_unsigned); - PRINT_ATTRf(exclude_kernel, p_unsigned); - PRINT_ATTRf(exclude_hv, p_unsigned); - PRINT_ATTRf(exclude_idle, p_unsigned); - PRINT_ATTRf(mmap, p_unsigned); - PRINT_ATTRf(comm, p_unsigned); - PRINT_ATTRf(freq, p_unsigned); - PRINT_ATTRf(inherit_stat, p_unsigned); - PRINT_ATTRf(enable_on_exec, p_unsigned); - PRINT_ATTRf(task, p_unsigned); - PRINT_ATTRf(watermark, p_unsigned); - PRINT_ATTRf(precise_ip, p_unsigned); - PRINT_ATTRf(mmap_data, p_unsigned); - PRINT_ATTRf(sample_id_all, p_unsigned); - PRINT_ATTRf(exclude_host, p_unsigned); - PRINT_ATTRf(exclude_guest, p_unsigned); - PRINT_ATTRf(exclude_callchain_kernel, p_unsigned); - PRINT_ATTRf(exclude_callchain_user, p_unsigned); - PRINT_ATTRf(mmap2, p_unsigned); - PRINT_ATTRf(comm_exec, p_unsigned); - PRINT_ATTRf(use_clockid, p_unsigned); - PRINT_ATTRf(context_switch, p_unsigned); - PRINT_ATTRf(write_backward, p_unsigned); - PRINT_ATTRf(namespaces, p_unsigned); - PRINT_ATTRf(ksymbol, p_unsigned); - PRINT_ATTRf(bpf_event, p_unsigned); - PRINT_ATTRf(aux_output, p_unsigned); - - PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned); - PRINT_ATTRf(bp_type, p_unsigned); - PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex); - PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex); - PRINT_ATTRf(branch_sample_type, p_branch_sample_type); - PRINT_ATTRf(sample_regs_user, p_hex); - PRINT_ATTRf(sample_stack_user, p_unsigned); - PRINT_ATTRf(clockid, p_signed); - PRINT_ATTRf(sample_regs_intr, p_hex); - PRINT_ATTRf(aux_watermark, p_unsigned); - PRINT_ATTRf(sample_max_stack, p_unsigned); - - return ret; -} - -static int __open_attr__fprintf(FILE *fp, const char *name, const char *val, - void *priv __maybe_unused) -{ - return fprintf(fp, " %-32s %s\n", name, val); -} - static void perf_evsel__remove_fd(struct evsel *pos, int nr_cpus, int nr_threads, int thread_idx) @@ -1659,6 +1514,12 @@ static bool ignore_missing_thread(struct evsel *evsel, return true; } +static int __open_attr__fprintf(FILE *fp, const char *name, const char *val, + void *priv __maybe_unused) +{ + return fprintf(fp, " %-32s %s\n", name, val); +} + static void display_attr(struct perf_event_attr *attr) { if (verbose >= 2) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4a4c64833893..48183b5f5f83 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -13,12 +12,6 @@ #include "symbol_conf.h" #include -struct addr_location; -struct evsel; -union perf_event; - -struct cgroup; - /* * The 'struct perf_evsel_config_term' is used to pass event * specific configuration data to perf_evsel__config routine. @@ -62,7 +55,11 @@ struct perf_evsel_config_term { bool weak; }; +struct bpf_object; +struct cgroup; +struct perf_counts; struct perf_stat_evsel; +union perf_event; typedef int (perf_evsel__sb_cb_t)(union perf_event *event, void *data); @@ -71,9 +68,6 @@ enum perf_tool_event { PERF_TOOL_DURATION_TIME = 1, }; -struct bpf_object; -struct perf_counts; - /** struct evsel - event selector * * @evlist - evlist this evsel is in, if it is in one. @@ -404,38 +398,6 @@ static inline bool perf_evsel__is_clock(struct evsel *evsel) perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK); } -struct perf_attr_details { - bool freq; - bool verbose; - bool event_group; - bool force; - bool trace_fields; -}; - -int perf_evsel__fprintf(struct evsel *evsel, - struct perf_attr_details *details, FILE *fp); - -#define EVSEL__PRINT_IP (1<<0) -#define EVSEL__PRINT_SYM (1<<1) -#define EVSEL__PRINT_DSO (1<<2) -#define EVSEL__PRINT_SYMOFFSET (1<<3) -#define EVSEL__PRINT_ONELINE (1<<4) -#define EVSEL__PRINT_SRCLINE (1<<5) -#define EVSEL__PRINT_UNKNOWN_AS_ADDR (1<<6) -#define EVSEL__PRINT_CALLCHAIN_ARROW (1<<7) -#define EVSEL__PRINT_SKIP_IGNORED (1<<8) - -struct callchain_cursor; - -int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, - unsigned int print_opts, struct callchain_cursor *cursor, - struct strlist *bt_stop_list, FILE *fp); - -int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, - int left_alignment, unsigned int print_opts, - struct callchain_cursor *cursor, - struct strlist *bt_stop_list, FILE *fp); - bool perf_evsel__fallback(struct evsel *evsel, int err, char *msg, size_t msgsize); int perf_evsel__open_strerror(struct evsel *evsel, struct target *target, @@ -468,11 +430,6 @@ static inline bool evsel__has_callchain(const struct evsel *evsel) return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0; } -typedef int (*attr__fprintf_f)(FILE *, const char *, const char *, void *); - -int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, - attr__fprintf_f attr__fprintf, void *priv); - struct perf_env *perf_evsel__env(struct evsel *evsel); int perf_evsel__store_ids(struct evsel *evsel, struct evlist *evlist); diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index 756b1e852db7..028df7afb0dc 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -4,6 +4,7 @@ #include #include #include "evsel.h" +#include "util/evsel_fprintf.h" #include "util/event.h" #include "callchain.h" #include "map.h" diff --git a/tools/perf/util/evsel_fprintf.h b/tools/perf/util/evsel_fprintf.h new file mode 100644 index 000000000000..47e6c8456bb1 --- /dev/null +++ b/tools/perf/util/evsel_fprintf.h @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __PERF_EVSEL_FPRINTF_H +#define __PERF_EVSEL_FPRINTF_H 1 + +#include +#include + +struct evsel; + +struct perf_attr_details { + bool freq; + bool verbose; + bool event_group; + bool force; + bool trace_fields; +}; + +int perf_evsel__fprintf(struct evsel *evsel, + struct perf_attr_details *details, FILE *fp); + +#define EVSEL__PRINT_IP (1<<0) +#define EVSEL__PRINT_SYM (1<<1) +#define EVSEL__PRINT_DSO (1<<2) +#define EVSEL__PRINT_SYMOFFSET (1<<3) +#define EVSEL__PRINT_ONELINE (1<<4) +#define EVSEL__PRINT_SRCLINE (1<<5) +#define EVSEL__PRINT_UNKNOWN_AS_ADDR (1<<6) +#define EVSEL__PRINT_CALLCHAIN_ARROW (1<<7) +#define EVSEL__PRINT_SKIP_IGNORED (1<<8) + +struct addr_location; +struct perf_event_attr; +struct perf_sample; +struct callchain_cursor; +struct strlist; + +int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, + unsigned int print_opts, struct callchain_cursor *cursor, + struct strlist *bt_stop_list, FILE *fp); + +int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, + int left_alignment, unsigned int print_opts, + struct callchain_cursor *cursor, + struct strlist *bt_stop_list, FILE *fp); + +typedef int (*attr__fprintf_f)(FILE *, const char *, const char *, void *); + +int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, + attr__fprintf_f attr__fprintf, void *priv); +#endif // __PERF_EVSEL_H diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3b24b4974c5f..86d9396cb131 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -25,6 +25,7 @@ #include "dso.h" #include "evlist.h" #include "evsel.h" +#include "util/evsel_fprintf.h" #include "header.h" #include "memswap.h" #include "trace-event.h" diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c new file mode 100644 index 000000000000..d4ad3f04923a --- /dev/null +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include "util/evsel_fprintf.h" + +struct bit_names { + int bit; + const char *name; +}; + +static void __p_bits(char *buf, size_t size, u64 value, struct bit_names *bits) +{ + bool first_bit = true; + int i = 0; + + do { + if (value & bits[i].bit) { + buf += scnprintf(buf, size, "%s%s", first_bit ? "" : "|", bits[i].name); + first_bit = false; + } + } while (bits[++i].name != NULL); +} + +static void __p_sample_type(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_SAMPLE_##n, #n } + struct bit_names bits[] = { + bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR), + bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU), + bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), + bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), + bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), + bit_name(WEIGHT), bit_name(PHYS_ADDR), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} + +static void __p_branch_sample_type(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_SAMPLE_BRANCH_##n, #n } + struct bit_names bits[] = { + bit_name(USER), bit_name(KERNEL), bit_name(HV), bit_name(ANY), + bit_name(ANY_CALL), bit_name(ANY_RETURN), bit_name(IND_CALL), + bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), + bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), + bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} + +static void __p_read_format(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_FORMAT_##n, #n } + struct bit_names bits[] = { + bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING), + bit_name(ID), bit_name(GROUP), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} + +#define BUF_SIZE 1024 + +#define p_hex(val) snprintf(buf, BUF_SIZE, "%#"PRIx64, (uint64_t)(val)) +#define p_unsigned(val) snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val)) +#define p_signed(val) snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val)) +#define p_sample_type(val) __p_sample_type(buf, BUF_SIZE, val) +#define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val) +#define p_read_format(val) __p_read_format(buf, BUF_SIZE, val) + +#define PRINT_ATTRn(_n, _f, _p) \ +do { \ + if (attr->_f) { \ + _p(attr->_f); \ + ret += attr__fprintf(fp, _n, buf, priv);\ + } \ +} while (0) + +#define PRINT_ATTRf(_f, _p) PRINT_ATTRn(#_f, _f, _p) + +int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, + attr__fprintf_f attr__fprintf, void *priv) +{ + char buf[BUF_SIZE]; + int ret = 0; + + PRINT_ATTRf(type, p_unsigned); + PRINT_ATTRf(size, p_unsigned); + PRINT_ATTRf(config, p_hex); + PRINT_ATTRn("{ sample_period, sample_freq }", sample_period, p_unsigned); + PRINT_ATTRf(sample_type, p_sample_type); + PRINT_ATTRf(read_format, p_read_format); + + PRINT_ATTRf(disabled, p_unsigned); + PRINT_ATTRf(inherit, p_unsigned); + PRINT_ATTRf(pinned, p_unsigned); + PRINT_ATTRf(exclusive, p_unsigned); + PRINT_ATTRf(exclude_user, p_unsigned); + PRINT_ATTRf(exclude_kernel, p_unsigned); + PRINT_ATTRf(exclude_hv, p_unsigned); + PRINT_ATTRf(exclude_idle, p_unsigned); + PRINT_ATTRf(mmap, p_unsigned); + PRINT_ATTRf(comm, p_unsigned); + PRINT_ATTRf(freq, p_unsigned); + PRINT_ATTRf(inherit_stat, p_unsigned); + PRINT_ATTRf(enable_on_exec, p_unsigned); + PRINT_ATTRf(task, p_unsigned); + PRINT_ATTRf(watermark, p_unsigned); + PRINT_ATTRf(precise_ip, p_unsigned); + PRINT_ATTRf(mmap_data, p_unsigned); + PRINT_ATTRf(sample_id_all, p_unsigned); + PRINT_ATTRf(exclude_host, p_unsigned); + PRINT_ATTRf(exclude_guest, p_unsigned); + PRINT_ATTRf(exclude_callchain_kernel, p_unsigned); + PRINT_ATTRf(exclude_callchain_user, p_unsigned); + PRINT_ATTRf(mmap2, p_unsigned); + PRINT_ATTRf(comm_exec, p_unsigned); + PRINT_ATTRf(use_clockid, p_unsigned); + PRINT_ATTRf(context_switch, p_unsigned); + PRINT_ATTRf(write_backward, p_unsigned); + PRINT_ATTRf(namespaces, p_unsigned); + PRINT_ATTRf(ksymbol, p_unsigned); + PRINT_ATTRf(bpf_event, p_unsigned); + PRINT_ATTRf(aux_output, p_unsigned); + + PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned); + PRINT_ATTRf(bp_type, p_unsigned); + PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex); + PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex); + PRINT_ATTRf(branch_sample_type, p_branch_sample_type); + PRINT_ATTRf(sample_regs_user, p_hex); + PRINT_ATTRf(sample_stack_user, p_unsigned); + PRINT_ATTRf(clockid, p_signed); + PRINT_ATTRf(sample_regs_intr, p_hex); + PRINT_ATTRf(aux_watermark, p_unsigned); + PRINT_ATTRf(sample_max_stack, p_unsigned); + + return ret; +} diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index c6dd478956f1..9af183860fbd 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -10,6 +10,7 @@ util/python.c util/cap.c util/evlist.c util/evsel.c +util/perf_event_attr_fprintf.c util/cpumap.c util/memswap.c util/mmap.c From bd70462062f36bec9d93d3900addfca4f8ec718f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 24 Sep 2019 15:45:21 -0300 Subject: [PATCH 1206/2880] perf evlist: Remove unused perf_evlist__fprintf() method Ditch it, noone is using it, one more stdio.h include in a hot header. Fix the fallout in parse-events.y, where we end up using a FILE pointer, I think due to YYDEBUG being set and in some places, like Amazon Linux 1 we don't get stdio.h included by luck, like in most other places, add a explicit stdio.h include directive. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-37k5q0lhdbo2hvvfbnnzn7og@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 13 ------------- tools/perf/util/evlist.h | 3 --- tools/perf/util/parse-events.y | 1 + 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d4c7fd125ce9..84c42790dab3 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1421,19 +1421,6 @@ int perf_evlist__parse_sample_timestamp(struct evlist *evlist, return perf_evsel__parse_sample_timestamp(evsel, event, timestamp); } -size_t perf_evlist__fprintf(struct evlist *evlist, FILE *fp) -{ - struct evsel *evsel; - size_t printed = 0; - - evlist__for_each_entry(evlist, evsel) { - printed += fprintf(fp, "%s%s", evsel->idx ? ", " : "", - perf_evsel__name(evsel)); - } - - return printed + fprintf(fp, "\n"); -} - int perf_evlist__strerror_open(struct evlist *evlist, int err, char *buf, size_t size) { diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 130d44d691b8..7cfe75522ba5 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include "events_stats.h" @@ -249,8 +248,6 @@ static inline struct evsel *evlist__last(struct evlist *evlist) return container_of(evsel, struct evsel, core); } -size_t perf_evlist__fprintf(struct evlist *evlist, FILE *fp); - int perf_evlist__strerror_open(struct evlist *evlist, int err, char *buf, size_t size); int perf_evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index f1c36ed1cf36..65809ad67808 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -9,6 +9,7 @@ #define YYDEBUG 1 #include +#include #include #include #include From 95be9d197da6f9006f6a70a0d141498ea2488858 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 24 Sep 2019 15:56:14 -0300 Subject: [PATCH 1207/2880] perf evsel: Move config terms to a separate header Further reducing the size of util/evsel.h. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-20zr7di9eynm0272mtjfdhfc@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 1 + tools/perf/builtin-top.c | 1 + tools/perf/util/evsel.c | 1 + tools/perf/util/evsel.h | 43 -------------------------- tools/perf/util/evsel_config.h | 50 +++++++++++++++++++++++++++++++ tools/perf/util/parse-events.c | 1 + 6 files changed, 54 insertions(+), 43 deletions(-) create mode 100644 tools/perf/util/evsel_config.h diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 2e4de211d881..ede040cf82ad 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -23,6 +23,7 @@ #include "../../util/event.h" #include "../../util/evlist.h" #include "../../util/evsel.h" +#include "../../util/evsel_config.h" #include "../../util/pmu.h" #include "../../util/cs-etm.h" #include // page_size diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 30d8eb614377..1f60124eb19b 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -27,6 +27,7 @@ #include "util/dso.h" #include "util/evlist.h" #include "util/evsel.h" +#include "util/evsel_config.h" #include "util/event.h" #include "util/machine.h" #include "util/map.h" diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 6323b0c60f6c..5591af81a070 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -30,6 +30,7 @@ #include "counts.h" #include "event.h" #include "evsel.h" +#include "util/evsel_config.h" #include "util/evsel_fprintf.h" #include "evlist.h" #include diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 48183b5f5f83..ddc5ee6f6592 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -12,49 +12,6 @@ #include "symbol_conf.h" #include -/* - * The 'struct perf_evsel_config_term' is used to pass event - * specific configuration data to perf_evsel__config routine. - * It is allocated within event parsing and attached to - * perf_evsel::config_terms list head. -*/ -enum term_type { - PERF_EVSEL__CONFIG_TERM_PERIOD, - PERF_EVSEL__CONFIG_TERM_FREQ, - PERF_EVSEL__CONFIG_TERM_TIME, - PERF_EVSEL__CONFIG_TERM_CALLGRAPH, - PERF_EVSEL__CONFIG_TERM_STACK_USER, - PERF_EVSEL__CONFIG_TERM_INHERIT, - PERF_EVSEL__CONFIG_TERM_MAX_STACK, - PERF_EVSEL__CONFIG_TERM_MAX_EVENTS, - PERF_EVSEL__CONFIG_TERM_OVERWRITE, - PERF_EVSEL__CONFIG_TERM_DRV_CFG, - PERF_EVSEL__CONFIG_TERM_BRANCH, - PERF_EVSEL__CONFIG_TERM_PERCORE, - PERF_EVSEL__CONFIG_TERM_AUX_OUTPUT, -}; - -struct perf_evsel_config_term { - struct list_head list; - enum term_type type; - union { - u64 period; - u64 freq; - bool time; - char *callgraph; - char *drv_cfg; - u64 stack_user; - int max_stack; - bool inherit; - bool overwrite; - char *branch; - unsigned long max_events; - bool percore; - bool aux_output; - } val; - bool weak; -}; - struct bpf_object; struct cgroup; struct perf_counts; diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h new file mode 100644 index 000000000000..8a7648037c18 --- /dev/null +++ b/tools/perf/util/evsel_config.h @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __PERF_EVSEL_CONFIG_H +#define __PERF_EVSEL_CONFIG_H 1 + +#include +#include + +/* + * The 'struct perf_evsel_config_term' is used to pass event + * specific configuration data to perf_evsel__config routine. + * It is allocated within event parsing and attached to + * perf_evsel::config_terms list head. +*/ +enum evsel_term_type { + PERF_EVSEL__CONFIG_TERM_PERIOD, + PERF_EVSEL__CONFIG_TERM_FREQ, + PERF_EVSEL__CONFIG_TERM_TIME, + PERF_EVSEL__CONFIG_TERM_CALLGRAPH, + PERF_EVSEL__CONFIG_TERM_STACK_USER, + PERF_EVSEL__CONFIG_TERM_INHERIT, + PERF_EVSEL__CONFIG_TERM_MAX_STACK, + PERF_EVSEL__CONFIG_TERM_MAX_EVENTS, + PERF_EVSEL__CONFIG_TERM_OVERWRITE, + PERF_EVSEL__CONFIG_TERM_DRV_CFG, + PERF_EVSEL__CONFIG_TERM_BRANCH, + PERF_EVSEL__CONFIG_TERM_PERCORE, + PERF_EVSEL__CONFIG_TERM_AUX_OUTPUT, +}; + +struct perf_evsel_config_term { + struct list_head list; + enum evsel_term_type type; + union { + u64 period; + u64 freq; + bool time; + char *callgraph; + char *drv_cfg; + u64 stack_user; + int max_stack; + bool inherit; + bool overwrite; + char *branch; + unsigned long max_events; + bool percore; + bool aux_output; + } val; + bool weak; +}; +#endif // __PERF_EVSEL_CONFIG_H diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index d69ff746cda5..50737046fa63 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -34,6 +34,7 @@ #include "asm/bug.h" #include "util/parse-branch-options.h" #include "metricgroup.h" +#include "util/evsel_config.h" #include "util/mmap.h" #define MAX_NAME_LEN 100 From 252a2fdc742bc34b94da203282773952d3b54279 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 24 Sep 2019 16:07:59 -0300 Subject: [PATCH 1208/2880] perf tools: Replace needless mmap.h with what is needed, event.h The perf_sample struct definition and the event_attr_init() are in util/event.h, but some places were getting it thru an otherwise needless util/mmap.h header, fix it by including util/event.h directly. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-p1anwyjdbbvghrkl9dlxv7h5@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 +- tools/perf/util/parse-events.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 69d0a1991b29..e830eadfca2a 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -34,7 +34,7 @@ #include "bpf-event.h" #include "block-range.h" #include "string2.h" -#include "util/mmap.h" +#include "util/event.h" #include "arch/common.h" #include #include diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 50737046fa63..b5e2adef49de 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -35,7 +35,7 @@ #include "util/parse-branch-options.h" #include "metricgroup.h" #include "util/evsel_config.h" -#include "util/mmap.h" +#include "util/event.h" #define MAX_NAME_LEN 100 From 6f6473c37d34b00676a152ec7e60770c78ed7c2f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 23 Sep 2019 16:33:39 -0700 Subject: [PATCH 1209/2880] perf stat: Fix free memory access / memory leaks in metrics Make sure to not free the name passed in by the caller, but free all the allocated ids when parsing expressions. The loop at the end knows that the first entry shouldn't be freed, so make sure the caller name is the first entry. Fixes % perf stat -M IpB,IpCall,IpTB,IPC,Retiring_SMT,Frontend_Bound_SMT,Kernel_Utilization,CPU_Utilization --metric-only -a -I 1000 sleep 2 valgrind: 1.009943231 ==21527== Invalid read of size 1 ==21527== at 0x483CB74: strcmp (vg_replace_strmem.c:849) ==21527== by 0x582CF8: collect_all_aliases (stat-display.c:554) ==21527== by 0x582EB3: collect_data (stat-display.c:577) ==21527== by 0x583A32: print_counter_aggr (stat-display.c:806) ==21527== by 0x584FAD: perf_evlist__print_counters (stat-display.c:1200) ==21527== by 0x45133A: print_counters (builtin-stat.c:655) ==21527== by 0x450629: process_interval (builtin-stat.c:353) ==21527== by 0x450FBD: __run_perf_stat (builtin-stat.c:564) ==21527== by 0x451285: run_perf_stat (builtin-stat.c:636) ==21527== by 0x454619: cmd_stat (builtin-stat.c:1966) ==21527== by 0x4D557D: run_builtin (perf.c:310) ==21527== by 0x4D57EA: handle_internal_command (perf.c:362) ==21527== Address 0x12826cd0 is 0 bytes inside a block of size 25 free'd ==21527== at 0x4839A0C: free (vg_replace_malloc.c:540) ==21527== by 0x627041: __zfree (zalloc.c:13) ==21527== by 0x57F66A: generic_metric (stat-shadow.c:814) ==21527== by 0x580B21: perf_stat__print_shadow_stats (stat-shadow.c:1057) ==21527== by 0x58418E: print_metric_headers (stat-display.c:943) ==21527== by 0x5844BC: print_interval (stat-display.c:1004) ==21527== by 0x584DEB: perf_evlist__print_counters (stat-display.c:1172) ==21527== by 0x45133A: print_counters (builtin-stat.c:655) ==21527== by 0x450629: process_interval (builtin-stat.c:353) ==21527== by 0x450FBD: __run_perf_stat (builtin-stat.c:564) ==21527== by 0x451285: run_perf_stat (builtin-stat.c:636) ==21527== by 0x454619: cmd_stat (builtin-stat.c:1966) ==21527== Block was alloc'd at ==21527== at 0x483880B: malloc (vg_replace_malloc.c:309) ==21527== by 0x51677DE: strdup (in /usr/lib64/libc-2.29.so) ==21527== by 0x506457: parse_events_name (parse-events.c:1754) ==21527== by 0x5550BB: parse_events_parse (parse-events.y:214) ==21527== by 0x50694D: parse_events__scanner (parse-events.c:1887) ==21527== by 0x506AEF: parse_events (parse-events.c:1927) ==21527== by 0x521D8B: metricgroup__parse_groups (metricgroup.c:527) ==21527== by 0x45156F: parse_metric_groups (builtin-stat.c:721) ==21527== by 0x6228A9: get_value (parse-options.c:243) ==21527== by 0x62363F: parse_short_opt (parse-options.c:348) ==21527== by 0x62363F: parse_options_step (parse-options.c:536) ==21527== by 0x62363F: parse_options_subcommand (parse-options.c:651) ==21527== by 0x453C1D: cmd_stat (builtin-stat.c:1718) ==21527== by 0x4D557D: run_builtin (perf.c:310) and also a leak report. Committer testing: Before: # perf stat -M IpB,IpCall,IpTB,IPC,Retiring_SMT,Frontend_Bound_SMT,Kernel_Utilization,CPU_Utilization --metric-only -a -I 1000 sleep 2 # time CPU_Utilization 1.000470810 free(): double free detected in tcache 2 Aborted (core dumped) # After: # perf stat -M IpB,IpCall,IpTB,IPC,Retiring_SMT,Frontend_Bound_SMT,Kernel_Utilization,CPU_Utilization --metric-only -a -I 1000 sleep 2 # time CPU_Utilization 1.000494752 0.1 2.001105112 0.1 # Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Link: http://lore.kernel.org/lkml/20190923233339.25326-3-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 70c87fdb2a43..2c41d47f6f83 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -738,6 +738,8 @@ static void generic_metric(struct perf_stat_config *config, char *n, *pn; expr__ctx_init(&pctx); + /* Must be first id entry */ + expr__add_id(&pctx, name, avg); for (i = 0; metric_events[i]; i++) { struct saved_value *v; struct stats *stats; @@ -776,8 +778,6 @@ static void generic_metric(struct perf_stat_config *config, expr__add_id(&pctx, n, avg_stats(stats)*scale); } - expr__add_id(&pctx, name, avg); - if (!metric_events[i]) { const char *p = metric_expr; From 7834fa948bebe648e8ce03393fb6b213c33f0a3a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 23 Sep 2019 16:33:37 -0700 Subject: [PATCH 1210/2880] perf evlist: Fix access of freed id arrays I'm not fully sure if this is the correct fix, but without this I get crashes on more complex perf stat metric usages. The problem is that part of the state gets freed when a weak group fails, but then is later still used. Just don't free the ids, we're going to reuse them anyways on the weak group retry. For example: % perf stat -M IpB,IpCall,IpTB,IPC,Retiring_SMT,Frontend_Bound_SMT,Kernel_Utilization,CPU_Utilization --metric-only -a -I 1000 sleep 2 crashes and gives in valgrind: =21527== Invalid write of size 8 ==21527== at 0x4EE582: hlist_add_head (list.h:644) ==21527== by 0x4EFD3C: perf_evlist__id_hash (evlist.c:477) ==21527== by 0x4EFD99: perf_evlist__id_add (evlist.c:483) ==21527== by 0x4EFF15: perf_evlist__id_add_fd (evlist.c:524) ==21527== by 0x4FC693: store_evsel_ids (evsel.c:2969) ==21527== by 0x4FC76C: perf_evsel__store_ids (evsel.c:2986) ==21527== by 0x450DA7: __run_perf_stat (builtin-stat.c:519) ==21527== by 0x451285: run_perf_stat (builtin-stat.c:636) ==21527== by 0x454619: cmd_stat (builtin-stat.c:1966) ==21527== by 0x4D557D: run_builtin (perf.c:310) ==21527== by 0x4D57EA: handle_internal_command (perf.c:362) ==21527== by 0x4D5931: run_argv (perf.c:406) ==21527== Address 0x12e3f008 is 104 bytes inside a block of size 2,056 free'd ==21527== at 0x4839A0C: free (vg_replace_malloc.c:540) ==21527== by 0x627139: xyarray__delete (xyarray.c:32) ==21527== by 0x4F6BE4: perf_evsel__free_id (evsel.c:1253) ==21527== by 0x4FA11F: evsel__close (evsel.c:1994) ==21527== by 0x4F30A3: perf_evlist__reset_weak_group (evlist.c:1783) ==21527== by 0x450B47: __run_perf_stat (builtin-stat.c:466) ==21527== by 0x451285: run_perf_stat (builtin-stat.c:636) ==21527== by 0x454619: cmd_stat (builtin-stat.c:1966) ==21527== by 0x4D557D: run_builtin (perf.c:310) ==21527== by 0x4D57EA: handle_internal_command (perf.c:362) ==21527== by 0x4D5931: run_argv (perf.c:406) ==21527== by 0x4D5CAE: main (perf.c:531) ==21527== Block was alloc'd at ==21527== at 0x483AB1A: calloc (vg_replace_malloc.c:762) ==21527== by 0x627024: zalloc (zalloc.c:8) ==21527== by 0x627088: xyarray__new (xyarray.c:10) ==21527== by 0x4F6B20: perf_evsel__alloc_id (evsel.c:1237) ==21527== by 0x4FC74E: perf_evsel__store_ids (evsel.c:2983) ==21527== by 0x450DA7: __run_perf_stat (builtin-stat.c:519) ==21527== by 0x451285: run_perf_stat (builtin-stat.c:636) ==21527== by 0x454619: cmd_stat (builtin-stat.c:1966) ==21527== by 0x4D557D: run_builtin (perf.c:310) ==21527== by 0x4D57EA: handle_internal_command (perf.c:362) ==21527== by 0x4D5931: run_argv (perf.c:406) ==21527== by 0x4D5CAE: main (perf.c:531) Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Link: http://lore.kernel.org/lkml/20190923233339.25326-1-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 84c42790dab3..d277a98e62df 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1659,7 +1659,7 @@ struct evsel *perf_evlist__reset_weak_group(struct evlist *evsel_list, is_open = false; if (c2->leader == leader) { if (is_open) - evsel__close(c2); + perf_evsel__close(&evsel->core); c2->leader = c2; c2->core.nr_members = 0; } From 28b951760cebdf1de0d32f38f325b667c2494564 Mon Sep 17 00:00:00 2001 From: Mamatha Inamdar Date: Mon, 9 Sep 2019 12:33:33 +0530 Subject: [PATCH 1211/2880] perf vendor events: Remove P8 HW events which are not supported This patch is to remove following hardware events from JSON file which are not supported on POWER8. pm_l3_p0_grp_pump pm_l3_p0_lco_data pm_l3_p0_lco_no_data pm_l3_p0_lco_rty Note: Unfortunately power8 event list is not publicly available. Fixes: c3b4d5c4afb0 ("perf vendor events: Remove P8 HW events which are not supported") Signed-off-by: Mamatha Inamdar Acked-by: Ravi Bangoria Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20190909065624.11956.3992.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/powerpc/power8/other.json | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/tools/perf/pmu-events/arch/powerpc/power8/other.json b/tools/perf/pmu-events/arch/powerpc/power8/other.json index 9dc2f6b70354..b2a3df07fbc4 100644 --- a/tools/perf/pmu-events/arch/powerpc/power8/other.json +++ b/tools/perf/pmu-events/arch/powerpc/power8/other.json @@ -1775,30 +1775,6 @@ "BriefDescription": "L3 Load Prefetches", "PublicDescription": "" }, - {, - "EventCode": "0xa29084", - "EventName": "PM_L3_P0_GRP_PUMP", - "BriefDescription": "L3 pf sent with grp scope port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0x528084", - "EventName": "PM_L3_P0_LCO_DATA", - "BriefDescription": "lco sent with data port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0x518080", - "EventName": "PM_L3_P0_LCO_NO_DATA", - "BriefDescription": "dataless l3 lco sent port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0xa4908c", - "EventName": "PM_L3_P0_LCO_RTY", - "BriefDescription": "L3 LCO received retry port 0", - "PublicDescription": "" - }, {, "EventCode": "0x84908d", "EventName": "PM_L3_PF0_ALLOC", From 61bf4ee29d5a44399bacec9cda9ef314acdbecda Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 9 Sep 2019 13:41:15 +0200 Subject: [PATCH 1212/2880] perf jvmti: Include JVMTI support for s390 Enable JVMTI support for s390 perf tool chain. Signed-off-by: Thomas Richter Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20190909114116.50469-3-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/Makefile | 1 + tools/perf/util/genelf.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile index cb198787570a..6ac8887be7c9 100644 --- a/tools/perf/arch/s390/Makefile +++ b/tools/perf/arch/s390/Makefile @@ -4,6 +4,7 @@ PERF_HAVE_DWARF_REGS := 1 endif HAVE_KVM_STAT_SUPPORT := 1 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 +PERF_HAVE_JITDUMP := 1 # # Syscall table generation for perf diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h index b72440bf9a79..d4137559be05 100644 --- a/tools/perf/util/genelf.h +++ b/tools/perf/util/genelf.h @@ -35,6 +35,9 @@ int jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_ent #elif defined(__sparc__) #define GEN_ELF_ARCH EM_SPARC #define GEN_ELF_CLASS ELFCLASS32 +#elif defined(__s390x__) +#define GEN_ELF_ARCH EM_S390 +#define GEN_ELF_CLASS ELFCLASS64 #else #error "unsupported architecture" #endif From 815c1560bf8fd522b8d93a1d727868b910c1cc24 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 9 Sep 2019 13:41:16 +0200 Subject: [PATCH 1213/2880] perf build: Add detection of java-11-openjdk-devel package With Java 11 there is no seperate JRE anymore. Details: https://coderanch.com/t/701603/java/JRE-JDK Therefore the detection of the JRE needs to be adapted. This change works for s390 and x86. I have not tested other platforms. Committer testing: Continues to work with the OpenJDK 8: $ rm -f ~acme/lib64/libperf-jvmti.so $ rpm -qa | grep jdk-devel java-1.8.0-openjdk-devel-1.8.0.222.b10-0.fc30.x86_64 $ git log --oneline -1 a51937170f33 (HEAD -> perf/core) perf build: Add detection of java-11-openjdk-devel package $ rm -rf /tmp/build/perf ; mkdir -p /tmp/build/perf ; make -C tools/perf O=/tmp/build/perf install > /dev/null 2>1 $ ls -la ~acme/lib64/libperf-jvmti.so -rwxr-xr-x. 1 acme acme 230744 Sep 24 16:46 /home/acme/lib64/libperf-jvmti.so $ Suggested-by: Andreas Krebbel Signed-off-by: Thomas Richter Tested-by: Arnaldo Carvalho de Melo Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20190909114116.50469-4-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index a269d78456b6..46f7fba2306c 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -924,7 +924,7 @@ ifndef NO_JVMTI JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | awk '{print $$3}') else ifneq (,$(wildcard /usr/sbin/alternatives)) - JDIR=$(shell /usr/sbin/alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g') + JDIR=$(shell /usr/sbin/alternatives --display java | tail -1 | cut -d' ' -f 5 | sed -e 's%/jre/bin/java.%%g' -e 's%/bin/java.%%g') endif endif ifndef JDIR From d6840d87b2d148e19e244ad2b44d28ba07f437a0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 25 Sep 2019 09:27:05 -0300 Subject: [PATCH 1214/2880] perf parser: Remove needless include directives They go on accumulating there like the debug.h one, that was introduced here: f23610245c1a ("perf list: Add debug support for outputing alias string") But then, when that need is removed via: 2073ad3326b7 ("perf tools: Factor out PMU matching in parser") The thing stays there, so continue the house cleaning spree... list.h not needed, no macros from there are used, and 'struct list_head' is in linux/types.h, ditto for util.h, no need for that as well. Cc: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-zkxr3mf6inun8m5mbnil4u0d@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.y | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 65809ad67808..48126ae4cd13 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -11,12 +11,9 @@ #include #include #include -#include #include -#include "util.h" #include "pmu.h" #include "evsel.h" -#include "debug.h" #include "parse-events.h" #include "parse-events-bison.h" From ddef29578a81a1d4d8f2b26a7adbfe21407ee3ea Mon Sep 17 00:00:00 2001 From: "Wunderlich, Mark" Date: Wed, 18 Sep 2019 23:36:37 +0000 Subject: [PATCH 1215/2880] nvme-tcp: fix wrong stop condition in io_work Allow the do/while statement to continue if current time is not after the proposed time 'deadline'. Intent is to allow loop to proceed for a specific time period. Currently the loop, as coded, will exit after first pass. Signed-off-by: Mark Wunderlich Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4ffd5957637a..385a5212c10f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1042,7 +1042,7 @@ static void nvme_tcp_io_work(struct work_struct *w) { struct nvme_tcp_queue *queue = container_of(w, struct nvme_tcp_queue, io_work); - unsigned long start = jiffies + msecs_to_jiffies(1); + unsigned long deadline = jiffies + msecs_to_jiffies(1); do { bool pending = false; @@ -1067,7 +1067,7 @@ static void nvme_tcp_io_work(struct work_struct *w) if (!pending) return; - } while (time_after(jiffies, start)); /* quota is exhausted */ + } while (!time_after(jiffies, deadline)); /* quota is exhausted */ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } From 7cbb5c6f9aa7cfda7175d82a9cf77a92965b0c5e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Sep 2019 13:15:55 -0500 Subject: [PATCH 1216/2880] nvme-pci: Save PCI state before putting drive into deepest state The action of saving the PCI state will cause numerous PCI configuration space reads which depending upon the vendor implementation may cause the drive to exit the deepest NVMe state. In these cases ASPM will typically resolve the PCIe link state and APST may resolve the NVMe power state. However it has also been observed that this register access after quiesced will cause PC10 failure on some device combinations. To resolve this, move the PCI state saving to before SetFeatures has been called. This has been proven to resolve the issue across a 5000 sample test on previously failing disk/system combinations. Signed-off-by: Mario Limonciello Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6b4d7b064b38..d3f63f0c212a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2944,11 +2944,21 @@ static int nvme_suspend(struct device *dev) if (ret < 0) goto unfreeze; + /* + * A saved state prevents pci pm from generically controlling the + * device's power. If we're using protocol specific settings, we don't + * want pci interfering. + */ + pci_save_state(pdev); + ret = nvme_set_power_state(ctrl, ctrl->npss); if (ret < 0) goto unfreeze; if (ret) { + /* discard the saved state */ + pci_load_saved_state(pdev, NULL); + /* * Clearing npss forces a controller reset on resume. The * correct value will be resdicovered then. @@ -2956,14 +2966,7 @@ static int nvme_suspend(struct device *dev) nvme_dev_disable(ndev, true); ctrl->npss = 0; ret = 0; - goto unfreeze; } - /* - * A saved state prevents pci pm from generically controlling the - * device's power. If we're using protocol specific settings, we don't - * want pci interfering. - */ - pci_save_state(pdev); unfreeze: nvme_unfreeze(ctrl); return ret; From bc4f6e06a90ea016855fc67212b4d500145f0b8a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 23 Sep 2019 17:18:36 +0300 Subject: [PATCH 1217/2880] nvme: fix an error code in nvme_init_subsystem() "ret" should be a negative error code here, but it's either success or possibly uninitialized. Fixes: 32fd90c40768 ("nvme: change locking for the per-subsystem controller list") Signed-off-by: Dan Carpenter Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0c385b1994fe..d3c9df62a9d5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2543,8 +2543,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) list_add_tail(&subsys->entry, &nvme_subsystems); } - if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, - dev_name(ctrl->device))) { + ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, + dev_name(ctrl->device)); + if (ret) { dev_err(ctrl->device, "failed to create sysfs link from subsystem.\n"); goto out_put_subsystem; From ff13c1b87c97275b82b2af99044b4abf6861b28f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sat, 21 Sep 2019 23:58:19 +0300 Subject: [PATCH 1218/2880] nvme-rdma: Fix max_hw_sectors calculation By default, the NVMe/RDMA driver should support max io_size of 1MiB (or upto the maximum supported size by the HCA). Currently, one will see that /sys/class/block//queue/max_hw_sectors_kb is 1020 instead of 1024. A non power of 2 value can cause performance degradation due to unnecessary splitting of IO requests and unoptimized allocation units. The number of pages per MR has been fixed here, so there is no longer any need to reduce max_sectors by 1. Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index dfa07bb9dfeb..9d16dfc29368 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -427,7 +427,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev) { return min_t(u32, NVME_RDMA_MAX_SEGMENTS, - ibdev->attrs.max_fast_reg_page_list_len); + ibdev->attrs.max_fast_reg_page_list_len - 1); } static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) @@ -437,7 +437,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) const int cq_factor = send_wr_factor + 1; /* + RECV */ int comp_vector, idx = nvme_rdma_queue_idx(queue); enum ib_poll_context poll_ctx; - int ret; + int ret, pages_per_mr; queue->device = nvme_rdma_find_get_device(queue->cm_id); if (!queue->device) { @@ -479,10 +479,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) goto out_destroy_qp; } + /* + * Currently we don't use SG_GAPS MR's so if the first entry is + * misaligned we'll end up using two entries for a single data page, + * so one additional entry is required. + */ + pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1; ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, queue->queue_size, IB_MR_TYPE_MEM_REG, - nvme_rdma_get_max_fr_pages(ibdev), 0); + pages_per_mr, 0); if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize MR pool sized %d for QID %d\n", @@ -820,8 +826,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) goto out_stop_queue; - ctrl->ctrl.max_hw_sectors = - (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); + ctrl->ctrl.max_segments = ctrl->max_fr_pages; + ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); From f03e42c6af60f778a6d1ccfb857db9b2ec835279 Mon Sep 17 00:00:00 2001 From: Gabriel Craciunescu Date: Mon, 23 Sep 2019 20:22:56 +0200 Subject: [PATCH 1219/2880] Added QUIRKs for ADATA XPG SX8200 Pro 512GB Booting with default_ps_max_latency_us >6000 makes the device fail. Also SUBNQN is NULL and gives a warning on each boot/resume. $ nvme id-ctrl /dev/nvme0 | grep ^subnqn subnqn : (null) I use this device with an Acer Nitro 5 (AN515-43-R8BF) Laptop. To be sure is not a Laptop issue only, I tested the device on my server board with the same results. ( with 2x,4x link on the board and 4x link on a PCI-E card ). Signed-off-by: Gabriel Craciunescu Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d3f63f0c212a..7ab07f4dce83 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3091,6 +3091,9 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, From 30f27d57c06e0199a9d3e0775113e13e2ae9078e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 13 Sep 2019 10:36:40 -0700 Subject: [PATCH 1220/2880] nvmet-tcp: remove superflous check on request sgl Now that sgl_free is null safe, drop the superflous check. Signed-off-by: Sagi Grimberg --- drivers/nvme/target/tcp.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index bf4f03474e89..d535080b781f 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -348,8 +348,7 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) return 0; err: - if (cmd->req.sg_cnt) - sgl_free(cmd->req.sg); + sgl_free(cmd->req.sg); return NVME_SC_INTERNAL; } @@ -554,8 +553,7 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd) if (queue->nvme_sq.sqhd_disabled) { kfree(cmd->iov); - if (cmd->req.sg_cnt) - sgl_free(cmd->req.sg); + sgl_free(cmd->req.sg); } return 1; @@ -586,8 +584,7 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, return -EAGAIN; kfree(cmd->iov); - if (cmd->req.sg_cnt) - sgl_free(cmd->req.sg); + sgl_free(cmd->req.sg); cmd->queue->snd_cmd = NULL; nvmet_tcp_put_cmd(cmd); return 1; @@ -1310,8 +1307,7 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) nvmet_req_uninit(&cmd->req); nvmet_tcp_unmap_pdu_iovec(cmd); kfree(cmd->iov); - if (cmd->req.sg_cnt) - sgl_free(cmd->req.sg); + sgl_free(cmd->req.sg); } static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) From 19ea025e1d28c629b369c3532a85b3df478cc5c6 Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Tue, 24 Sep 2019 17:42:41 +0800 Subject: [PATCH 1221/2880] nvme: Add quirk for Kingston NVME SSD running FW E8FK11.T Kingston NVME SSD with firmware version E8FK11.T has no interrupt after resume with actions related to suspend to idle. This patch applied NVME_QUIRK_SIMPLE_SUSPEND quirk to fix this issue. Fixes: d916b1be94b6 ("nvme-pci: use host managed power state for suspend") Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=204887 Signed-off-by: Jian-Hong Pan Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d3c9df62a9d5..adff57ea149e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2292,6 +2292,16 @@ static const struct nvme_core_quirk_entry core_quirks[] = { .vid = 0x14a4, .fr = "22301111", .quirks = NVME_QUIRK_SIMPLE_SUSPEND, + }, + { + /* + * This Kingston E8FK11.T firmware version has no interrupt + * after resume with actions related to suspend to idle + * https://bugzilla.kernel.org/show_bug.cgi?id=204887 + */ + .vid = 0x2646, + .fr = "E8FK11.T", + .quirks = NVME_QUIRK_SIMPLE_SUSPEND, } }; From 65e68edce0db433aa0c2b26d7dc14fbbbeb89fbb Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Tue, 24 Sep 2019 15:14:52 +0200 Subject: [PATCH 1222/2880] nvme: allow 64-bit results in passthru commands It is not possible to get 64-bit results from the passthru commands, what prevents from getting for the Capabilities (CAP) property value. As a result, it is not possible to implement IOL's NVMe Conformance test 4.3 Case 1 for Fabrics targets [1] (page 123). This issue has been already discussed [2], but without a solution. This patch solves the problem by adding new ioctls with a new passthru structure, including 64-bit results. The older ioctls stay unchanged. [1] https://www.iol.unh.edu/sites/default/files/testsuites/nvme/UNH-IOL_NVMe_Conformance_Test_Suite_v11.0.pdf [2] http://lists.infradead.org/pipermail/linux-nvme/2018-June/018791.html Signed-off-by: Marta Rybczynska Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 108 +++++++++++++++++++++++++++----- include/uapi/linux/nvme_ioctl.h | 23 +++++++ 2 files changed, 115 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index adff57ea149e..87ed437a46b8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -850,7 +850,7 @@ out: static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u32 *result, unsigned timeout) + u32 meta_seed, u64 *result, unsigned timeout) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -891,7 +891,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, else ret = nvme_req(req)->status; if (result) - *result = le32_to_cpu(nvme_req(req)->result.u32); + *result = le64_to_cpu(nvme_req(req)->result.u64); if (meta && !ret && !write) { if (copy_to_user(meta_buffer, meta, meta_len)) ret = -EFAULT; @@ -1338,6 +1338,54 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_command c; unsigned timeout = 0; u32 effects; + u64 result; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + effects = nvme_passthru_start(ctrl, ns, cmd.opcode); + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, + (void __user *)(uintptr_t)cmd.metadata, + cmd.metadata_len, 0, &result, timeout); + nvme_passthru_end(ctrl, effects); + + if (status >= 0) { + if (put_user(result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd64 __user *ucmd) +{ + struct nvme_passthru_cmd64 cmd; + struct nvme_command c; + unsigned timeout = 0; + u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1408,6 +1456,41 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) srcu_read_unlock(&head->srcu, idx); } +static bool is_ctrl_ioctl(unsigned int cmd) +{ + if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) + return true; + if (is_sed_ioctl(cmd)) + return true; + return false; +} + +static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp, + struct nvme_ns_head *head, + int srcu_idx) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + nvme_get_ctrl(ns->ctrl); + nvme_put_ns_from_disk(head, srcu_idx); + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + ret = nvme_user_cmd(ctrl, NULL, argp); + break; + case NVME_IOCTL_ADMIN64_CMD: + ret = nvme_user_cmd64(ctrl, NULL, argp); + break; + default: + ret = sed_ioctl(ctrl->opal_dev, cmd, argp); + break; + } + nvme_put_ctrl(ctrl); + return ret; +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1425,20 +1508,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, * seperately and drop the ns SRCU reference early. This avoids a * deadlock when deleting namespaces using the passthrough interface. */ - if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) { - struct nvme_ctrl *ctrl = ns->ctrl; - - nvme_get_ctrl(ns->ctrl); - nvme_put_ns_from_disk(head, srcu_idx); - - if (cmd == NVME_IOCTL_ADMIN_CMD) - ret = nvme_user_cmd(ctrl, NULL, argp); - else - ret = sed_ioctl(ctrl->opal_dev, cmd, argp); - - nvme_put_ctrl(ctrl); - return ret; - } + if (is_ctrl_ioctl(cmd)) + return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); switch (cmd) { case NVME_IOCTL_ID: @@ -1451,6 +1522,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, case NVME_IOCTL_SUBMIT_IO: ret = nvme_submit_io(ns, argp); break; + case NVME_IOCTL_IO64_CMD: + ret = nvme_user_cmd64(ns->ctrl, ns, argp); + break; default: if (ns->ndev) ret = nvme_nvm_ioctl(ns, cmd, arg); @@ -2852,6 +2926,8 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 1c215ea1798e..e168dc59e9a0 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -45,6 +45,27 @@ struct nvme_passthru_cmd { __u32 result; }; +struct nvme_passthru_cmd64 { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u64 result; +}; + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) @@ -54,5 +75,7 @@ struct nvme_passthru_cmd { #define NVME_IOCTL_RESET _IO('N', 0x44) #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) #define NVME_IOCTL_RESCAN _IO('N', 0x46) +#define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) +#define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ From 104c307147ad379617472dd91a5bcb368d72bd6d Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Tue, 24 Sep 2019 23:23:56 -0500 Subject: [PATCH 1223/2880] drm/amd/display: prevent memory leak In dcn*_create_resource_pool the allocated memory should be released if construct pool fails. Reviewed-by: Harry Wentland Signed-off-by: Navid Emamdoost Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c b/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c index afc61055eca1..1787b9bf800a 100644 --- a/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c @@ -1091,6 +1091,7 @@ struct resource_pool *dce100_create_resource_pool( if (construct(num_virtual_links, dc, pool)) return &pool->base; + kfree(pool); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c index c66fe170e1e8..318e9c2e2ca8 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c @@ -1462,6 +1462,7 @@ struct resource_pool *dce110_create_resource_pool( if (construct(num_virtual_links, dc, pool, asic_id)) return &pool->base; + kfree(pool); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c b/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c index 2b3a2917c168..83e1878161c9 100644 --- a/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c @@ -1342,6 +1342,7 @@ struct resource_pool *dce112_create_resource_pool( if (construct(num_virtual_links, dc, pool)) return &pool->base; + kfree(pool); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c b/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c index 236c4c0324b1..8b85e5274bba 100644 --- a/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c @@ -1208,6 +1208,7 @@ struct resource_pool *dce120_create_resource_pool( if (construct(num_virtual_links, dc, pool)) return &pool->base; + kfree(pool); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c index 5a89e462e7cc..59305e411a66 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c @@ -1570,6 +1570,7 @@ struct resource_pool *dcn10_create_resource_pool( if (construct(init_data->num_virtual_links, dc, pool)) return &pool->base; + kfree(pool); BREAK_TO_DEBUGGER(); return NULL; } From 2b1ff255d2d043aa4e6d62f50c478b283f9943ac Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 24 Sep 2019 14:22:08 -0700 Subject: [PATCH 1224/2880] nvme: Add ctrl attributes for queue_count and sqsize Current controller interrogation requires a lot of guesswork on how many io queues were created and what the io sq size is. The numbers are dependent upon core/fabric defaults, connect arguments, and target responses. Add sysfs attributes for queue_count and sqsize. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 87ed437a46b8..fd7dea36c3b6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3135,6 +3135,8 @@ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); nvme_show_int_function(cntlid); nvme_show_int_function(numa_node); +nvme_show_int_function(queue_count); +nvme_show_int_function(sqsize); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -3215,6 +3217,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_address.attr, &dev_attr_state.attr, &dev_attr_numa_node.attr, + &dev_attr_queue_count.attr, + &dev_attr_sqsize.attr, NULL }; From 8a03222f508bf09e03cf38f6bd77b34b450c1d60 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 23 Sep 2019 11:41:12 -0700 Subject: [PATCH 1225/2880] selftests/bpf: test_progs: fix client/server race in tcp_rtt This is the same problem I found earlier in test_sockopt_inherit: there is a race between server thread doing accept() and client thread doing connect(). Let's explicitly synchronize them via pthread conditional variable. v2: * don't exit from server_thread without signaling condvar, fixes possible issue where main() would wait forever (Andrii Nakryiko) Fixes: b55873984dab ("selftests/bpf: test BPF_SOCK_OPS_RTT_CB") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- .../selftests/bpf/prog_tests/tcp_rtt.c | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index fdc0b3614a9e..a82da555b1b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -203,14 +203,24 @@ static int start_server(void) return fd; } +static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; + static void *server_thread(void *arg) { struct sockaddr_storage addr; socklen_t len = sizeof(addr); int fd = *(int *)arg; int client_fd; + int err; - if (CHECK_FAIL(listen(fd, 1)) < 0) { + err = listen(fd, 1); + + pthread_mutex_lock(&server_started_mtx); + pthread_cond_signal(&server_started); + pthread_mutex_unlock(&server_started_mtx); + + if (CHECK_FAIL(err < 0)) { perror("Failed to listed on socket"); return NULL; } @@ -248,7 +258,14 @@ void test_tcp_rtt(void) if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; - pthread_create(&tid, NULL, server_thread, (void *)&server_fd); + if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread, + (void *)&server_fd))) + goto close_cgroup_fd; + + pthread_mutex_lock(&server_started_mtx); + pthread_cond_wait(&server_started, &server_started_mtx); + pthread_mutex_unlock(&server_started_mtx); + CHECK_FAIL(run_test(cgroup_fd, server_fd)); close(server_fd); close_cgroup_fd: From fcd30ae0665c778e283f73c1c885c7fd26d12ef2 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Tue, 24 Sep 2019 09:25:21 -0700 Subject: [PATCH 1226/2880] bpf/xskmap: Return ERR_PTR for failure case instead of NULL. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kzalloc() failed, NULL was returned to the caller, which tested the pointer with IS_ERR(), which didn't match, so the pointer was used later, resulting in a NULL dereference. Return ERR_PTR(-ENOMEM) instead of NULL. Reported-by: syzbot+491c1b7565ba9069ecae@syzkaller.appspotmail.com Fixes: 0402acd683c6 ("xsk: remove AF_XDP socket from map when the socket is released") Signed-off-by: Jonathan Lemon Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 942c662e2eed..82a1ffe15dfa 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -37,7 +37,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); if (!node) - return NULL; + return ERR_PTR(-ENOMEM); err = xsk_map_inc(map); if (err) { From aef70a1f44c0b570e6345c02c2d240471859f0a4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 25 Sep 2019 11:30:38 -0700 Subject: [PATCH 1227/2880] libbpf: fix false uninitialized variable warning Some compilers emit warning for potential uninitialized next_id usage. The code is correct, but control flow is too complicated for some compilers to figure this out. Re-initialize next_id to satisfy compiler. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/lib/bpf/btf_dump.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 715967762312..84b0661db7f3 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1167,6 +1167,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, return; } + next_id = decls->ids[decls->cnt - 1]; next_t = btf__type_by_id(d->btf, next_id); multidim = btf_is_array(next_t); /* we need space if we have named non-pointer */ From d778c30a056ac352d1c0c58b5850e0fcc5655a58 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 25 Sep 2019 11:36:14 -0700 Subject: [PATCH 1228/2880] selftests/bpf: delete unused variables in test_sysctl Remove no longer used variables and avoid compiler warnings. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_sysctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c index 4f8ec1f10a80..a320e3844b17 100644 --- a/tools/testing/selftests/bpf/test_sysctl.c +++ b/tools/testing/selftests/bpf/test_sysctl.c @@ -1385,7 +1385,6 @@ static int fixup_sysctl_value(const char *buf, size_t buf_len, uint8_t raw[sizeof(uint64_t)]; uint64_t num; } value = {}; - uint8_t c, i; if (buf_len > sizeof(value)) { log_err("Value is too big (%zd) to use in fixup", buf_len); From 4670d68b9254710fdeaf794cad54d8b2c9929e0a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 25 Sep 2019 11:52:05 -0700 Subject: [PATCH 1229/2880] selftests/bpf: adjust strobemeta loop to satisfy latest clang Some recent changes in latest Clang started causing the following warning when unrolling strobemeta test case main loop: progs/strobemeta.h:416:2: warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning] This patch simplifies loop's exit condition to depend only on constant max iteration number (STROBE_MAX_MAP_ENTRIES), while moving early termination logic inside the loop body. The changes are equivalent from program logic standpoint, but fixes the warning. It also appears to improve generated BPF code, as it fixes previously failing non-unrolled strobemeta test cases. Cc: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/progs/strobemeta.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index 8a399bdfd920..067eb625d01c 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -413,7 +413,10 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, #else #pragma unroll #endif - for (int i = 0; i < STROBE_MAX_MAP_ENTRIES && i < map.cnt; ++i) { + for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) { + if (i >= map.cnt) + break; + descr->key_lens[i] = 0; len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.entries[i].key); From e55d9d9bfb69405bd7615c0f8d229d8fafb3e9b8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 25 Sep 2019 16:45:53 -0700 Subject: [PATCH 1230/2880] memcg, kmem: do not fail __GFP_NOFAIL charges Thomas has noticed the following NULL ptr dereference when using cgroup v1 kmem limit: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 16923 Comm: gtk-update-icon Not tainted 4.19.51 #42 Hardware name: Gigabyte Technology Co., Ltd. Z97X-Gaming G1/Z97X-Gaming G1, BIOS F9 07/31/2015 RIP: 0010:create_empty_buffers+0x24/0x100 Code: cd 0f 1f 44 00 00 0f 1f 44 00 00 41 54 49 89 d4 ba 01 00 00 00 55 53 48 89 fb e8 97 fe ff ff 48 89 c5 48 89 c2 eb 03 48 89 ca <48> 8b 4a 08 4c 09 22 48 85 c9 75 f1 48 89 6a 08 48 8b 43 18 48 8d RSP: 0018:ffff927ac1b37bf8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: fffff2d4429fd740 RCX: 0000000100097149 RDX: 0000000000000000 RSI: 0000000000000082 RDI: ffff9075a99fbe00 RBP: 0000000000000000 R08: fffff2d440949cc8 R09: 00000000000960c0 R10: 0000000000000002 R11: 0000000000000000 R12: 0000000000000000 R13: ffff907601f18360 R14: 0000000000002000 R15: 0000000000001000 FS: 00007fb55b288bc0(0000) GS:ffff90761f8c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000007aebc002 CR4: 00000000001606e0 Call Trace: create_page_buffers+0x4d/0x60 __block_write_begin_int+0x8e/0x5a0 ? ext4_inode_attach_jinode.part.82+0xb0/0xb0 ? jbd2__journal_start+0xd7/0x1f0 ext4_da_write_begin+0x112/0x3d0 generic_perform_write+0xf1/0x1b0 ? file_update_time+0x70/0x140 __generic_file_write_iter+0x141/0x1a0 ext4_file_write_iter+0xef/0x3b0 __vfs_write+0x17e/0x1e0 vfs_write+0xa5/0x1a0 ksys_write+0x57/0xd0 do_syscall_64+0x55/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Tetsuo then noticed that this is because the __memcg_kmem_charge_memcg fails __GFP_NOFAIL charge when the kmem limit is reached. This is a wrong behavior because nofail allocations are not allowed to fail. Normal charge path simply forces the charge even if that means to cross the limit. Kmem accounting should be doing the same. Link: http://lkml.kernel.org/r/20190906125608.32129-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Thomas Lindroth Debugged-by: Tetsuo Handa Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Andrey Ryabinin Cc: Thomas Lindroth Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2156ef775d04..c313c49074ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2943,6 +2943,16 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { + + /* + * Enforce __GFP_NOFAIL allocation because callers are not + * prepared to see failures and likely do not have any failure + * handling code. + */ + if (gfp & __GFP_NOFAIL) { + page_counter_charge(&memcg->kmem, nr_pages); + return 0; + } cancel_charge(memcg, nr_pages); return -ENOMEM; } From 541be05095437a7e5e08e7d13a13e03ec0994ae7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 25 Sep 2019 16:45:56 -0700 Subject: [PATCH 1231/2880] linux/coff.h: add include guard Add a header include guard just in case. My motivation is to allow Kbuild to detect missing include guard: https://patchwork.kernel.org/patch/11063011/ Before I enable this checker I want to fix as many headers as possible. Link: http://lkml.kernel.org/r/20190728154728.11126-1-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/coff.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/uapi/linux/coff.h b/include/uapi/linux/coff.h index e4a79f80b9a0..ab5c7e847eed 100644 --- a/include/uapi/linux/coff.h +++ b/include/uapi/linux/coff.h @@ -11,6 +11,9 @@ more information about COFF, then O'Reilly has a very excellent book. */ +#ifndef _UAPI_LINUX_COFF_H +#define _UAPI_LINUX_COFF_H + #define E_SYMNMLEN 8 /* Number of characters in a symbol name */ #define E_FILNMLEN 14 /* Number of characters in a file name */ #define E_DIMNUM 4 /* Number of array dimensions in auxiliary entry */ @@ -350,3 +353,5 @@ struct COFF_reloc { /* For new sections we haven't heard of before */ #define COFF_DEF_SECTION_ALIGNMENT 4 + +#endif /* _UAPI_LINUX_COFF_H */ From 0f74914071ab7e7b78731ed62bf350e3a344e0a5 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Wed, 25 Sep 2019 16:45:59 -0700 Subject: [PATCH 1232/2880] kernel/elfcore.c: include proper prototypes When building with W=1, gcc properly complains that there's no prototypes: CC kernel/elfcore.o kernel/elfcore.c:7:17: warning: no previous prototype for 'elf_core_extra_phdrs' [-Wmissing-prototypes] 7 | Elf_Half __weak elf_core_extra_phdrs(void) | ^~~~~~~~~~~~~~~~~~~~ kernel/elfcore.c:12:12: warning: no previous prototype for 'elf_core_write_extra_phdrs' [-Wmissing-prototypes] 12 | int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/elfcore.c:17:12: warning: no previous prototype for 'elf_core_write_extra_data' [-Wmissing-prototypes] 17 | int __weak elf_core_write_extra_data(struct coredump_params *cprm) | ^~~~~~~~~~~~~~~~~~~~~~~~~ kernel/elfcore.c:22:15: warning: no previous prototype for 'elf_core_extra_data_size' [-Wmissing-prototypes] 22 | size_t __weak elf_core_extra_data_size(void) | ^~~~~~~~~~~~~~~~~~~~~~~~ Provide the include file so gcc is happy, and we don't have potential code drift Link: http://lkml.kernel.org/r/29875.1565224705@turing-police Signed-off-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/elfcore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/elfcore.c b/kernel/elfcore.c index fc482c8e0bd8..57fb4dcff434 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -3,6 +3,7 @@ #include #include #include +#include Elf_Half __weak elf_core_extra_phdrs(void) { From c7d4f7eeb6da9408e9ba7475fe2624bdb4d837d0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:02 -0700 Subject: [PATCH 1233/2880] rbtree: avoid generating code twice for the cached versions (tools copy) As was already noted in rbtree.h, the logic to cache rb_first (or rb_last) can easily be implemented externally to the core rbtree api. This commit takes the changes applied to the include/linux/ and lib/ rbtree files in 9f973cb38088 ("lib/rbtree: avoid generating code twice for the cached versions"), and applies these to the tools/include/linux/ and tools/lib/ files as well to keep them synchronized. Link: http://lkml.kernel.org/r/20190703034812.53002-1-walken@google.com Signed-off-by: Michel Lespinasse Cc: David Howells Cc: Davidlohr Bueso Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/linux/rbtree.h | 71 +++++++++++++++++--------- tools/include/linux/rbtree_augmented.h | 31 +++++------ tools/lib/rbtree.c | 37 ++------------ 3 files changed, 62 insertions(+), 77 deletions(-) diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index d83763a5327c..e03b1ea23e0e 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -31,25 +31,9 @@ struct rb_root { struct rb_node *rb_node; }; -/* - * Leftmost-cached rbtrees. - * - * We do not cache the rightmost node based on footprint - * size vs number of potential users that could benefit - * from O(1) rb_last(). Just not worth it, users that want - * this feature can always implement the logic explicitly. - * Furthermore, users that want to cache both pointers may - * find it a bit asymmetric, but that's ok. - */ -struct rb_root_cached { - struct rb_root rb_root; - struct rb_node *rb_leftmost; -}; - #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } -#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) @@ -71,12 +55,6 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); -extern void rb_insert_color_cached(struct rb_node *, - struct rb_root_cached *, bool); -extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); -/* Same as rb_first(), but O(1) */ -#define rb_first_cached(root) (root)->rb_leftmost - /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); @@ -84,8 +62,6 @@ extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); -extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, - struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) @@ -129,4 +105,51 @@ static inline void rb_erase_init(struct rb_node *n, struct rb_root *root) rb_erase(n, root); RB_CLEAR_NODE(n); } + +/* + * Leftmost-cached rbtrees. + * + * We do not cache the rightmost node based on footprint + * size vs number of potential users that could benefit + * from O(1) rb_last(). Just not worth it, users that want + * this feature can always implement the logic explicitly. + * Furthermore, users that want to cache both pointers may + * find it a bit asymmetric, but that's ok. + */ +struct rb_root_cached { + struct rb_root rb_root; + struct rb_node *rb_leftmost; +}; + +#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } + +/* Same as rb_first(), but O(1) */ +#define rb_first_cached(root) (root)->rb_leftmost + +static inline void rb_insert_color_cached(struct rb_node *node, + struct rb_root_cached *root, + bool leftmost) +{ + if (leftmost) + root->rb_leftmost = node; + rb_insert_color(node, &root->rb_root); +} + +static inline void rb_erase_cached(struct rb_node *node, + struct rb_root_cached *root) +{ + if (root->rb_leftmost == node) + root->rb_leftmost = rb_next(node); + rb_erase(node, &root->rb_root); +} + +static inline void rb_replace_node_cached(struct rb_node *victim, + struct rb_node *new, + struct rb_root_cached *root) +{ + if (root->rb_leftmost == victim) + root->rb_leftmost = new; + rb_replace_node(victim, new, &root->rb_root); +} + #endif /* __TOOLS_LINUX_PERF_RBTREE_H */ diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index ddd01006ece5..467a3eefe1d2 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -32,17 +32,16 @@ struct rb_augment_callbacks { void (*rotate)(struct rb_node *old, struct rb_node *new); }; -extern void __rb_insert_augmented(struct rb_node *node, - struct rb_root *root, - bool newleft, struct rb_node **leftmost, +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and - * rb_augment_inserted() instead of the usual rb_insert_color() call. - * If rb_augment_inserted() rebalances the rbtree, it will callback into + * rb_insert_augmented() instead of the usual rb_insert_color() call. + * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees. */ @@ -50,7 +49,7 @@ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, root, false, NULL, augment->rotate); + __rb_insert_augmented(node, root, augment->rotate); } static inline void @@ -58,8 +57,9 @@ rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, &root->rb_root, - newleft, &root->rb_leftmost, augment->rotate); + if (newleft) + root->rb_leftmost = node; + rb_insert_augmented(node, &root->rb_root, augment); } #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ @@ -139,7 +139,6 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, - struct rb_node **leftmost, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; @@ -147,9 +146,6 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, struct rb_node *parent, *rebalance; unsigned long pc; - if (leftmost && node == *leftmost) - *leftmost = rb_next(node); - if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) @@ -249,8 +245,7 @@ static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, root, - NULL, augment); + struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } @@ -259,11 +254,9 @@ static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root, - &root->rb_leftmost, - augment); - if (rebalance) - __rb_erase_color(rebalance, &root->rb_root, augment->rotate); + if (root->rb_leftmost == node) + root->rb_leftmost = rb_next(node); + rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _TOOLS_LINUX_RBTREE_AUGMENTED_H */ diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 804f145e3113..2548ff8c4d9c 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -83,14 +83,10 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, - bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; - if (newleft) - *leftmost = node; - while (true) { /* * Loop invariant: node is red. @@ -436,34 +432,17 @@ static const struct rb_augment_callbacks dummy_callbacks = { void rb_insert_color(struct rb_node *node, struct rb_root *root) { - __rb_insert(node, root, false, NULL, dummy_rotate); + __rb_insert(node, root, dummy_rotate); } void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, root, - NULL, &dummy_callbacks); + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } -void rb_insert_color_cached(struct rb_node *node, - struct rb_root_cached *root, bool leftmost) -{ - __rb_insert(node, &root->rb_root, leftmost, - &root->rb_leftmost, dummy_rotate); -} - -void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) -{ - struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, &root->rb_root, - &root->rb_leftmost, &dummy_callbacks); - if (rebalance) - ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); -} - /* * Augmented rbtree manipulation functions. * @@ -472,10 +451,9 @@ void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, - bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - __rb_insert(node, root, newleft, leftmost, augment_rotate); + __rb_insert(node, root, augment_rotate); } /* @@ -580,15 +558,6 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, __rb_change_child(victim, new, parent, root); } -void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, - struct rb_root_cached *root) -{ - rb_replace_node(victim, new, &root->rb_root); - - if (root->rb_leftmost == victim) - root->rb_leftmost = new; -} - static struct rb_node *rb_left_deepest_node(const struct rb_node *node) { for (;;) { From 444b8a83f1e01584ff2d53f5951d8e836c0070b5 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:04 -0700 Subject: [PATCH 1234/2880] augmented rbtree: add comments for RB_DECLARE_CALLBACKS macro Patch series "make RB_DECLARE_CALLBACKS more generic", v3. These changes are intended to make the RB_DECLARE_CALLBACKS macro more generic (allowing the aubmented subtree information to be a struct instead of a scalar). I have verified the compiled lib/interval_tree.o and mm/mmap.o files to check that they didn't change. This held as expected for interval_tree.o; mmap.o did have some changes which could be reverted by marking __vma_link_rb as noinline. I did not add such a change to the patchset; I felt it was reasonable enough to leave the inlining decision up to the compiler. This patch (of 3): Add a short comment summarizing the arguments to RB_DECLARE_CALLBACKS. The arguments are also now capitalized. This copies the style of the INTERVAL_TREE_DEFINE macro. No functional changes in this commit, only comments and capitalization. Link: http://lkml.kernel.org/r/20190703040156.56953-2-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 54 ++++++++++++++++---------- tools/include/linux/rbtree_augmented.h | 54 ++++++++++++++++---------- 2 files changed, 66 insertions(+), 42 deletions(-) diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 179faab29f52..979941600082 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -60,39 +60,51 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ +/* + * Template for declaring augmented rbtree callbacks + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data + */ + +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline void \ -rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if (node->rbaugmented == augmented) \ + RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ + RBTYPE augmented = RBCOMPUTE(node); \ + if (node->RBAUGMENTED == augmented) \ break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ + node->RBAUGMENTED = augmented; \ + rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ -rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ -rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ + old->RBAUGMENTED = RBCOMPUTE(old); \ } \ -rbstatic const struct rb_augment_callbacks rbname = { \ - .propagate = rbname ## _propagate, \ - .copy = rbname ## _copy, \ - .rotate = rbname ## _rotate \ +RBSTATIC const struct rb_augment_callbacks RBNAME = { \ + .propagate = RBNAME ## _propagate, \ + .copy = RBNAME ## _copy, \ + .rotate = RBNAME ## _rotate \ }; diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 467a3eefe1d2..de3a480204ba 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -62,39 +62,51 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ +/* + * Template for declaring augmented rbtree callbacks + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data + */ + +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline void \ -rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if (node->rbaugmented == augmented) \ + RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ + RBTYPE augmented = RBCOMPUTE(node); \ + if (node->RBAUGMENTED == augmented) \ break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ + node->RBAUGMENTED = augmented; \ + rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ -rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ -rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ + old->RBAUGMENTED = RBCOMPUTE(old); \ } \ -rbstatic const struct rb_augment_callbacks rbname = { \ - .propagate = rbname ## _propagate, \ - .copy = rbname ## _copy, \ - .rotate = rbname ## _rotate \ +RBSTATIC const struct rb_augment_callbacks RBNAME = { \ + .propagate = RBNAME ## _propagate, \ + .copy = RBNAME ## _copy, \ + .rotate = RBNAME ## _rotate \ }; From 315cc066b8ae8349a27887ad7a34e1916e9797fe Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:07 -0700 Subject: [PATCH 1235/2880] augmented rbtree: add new RB_DECLARE_CALLBACKS_MAX macro Add RB_DECLARE_CALLBACKS_MAX, which generates augmented rbtree callbacks for the case where the augmented value is a scalar whose definition follows a max(f(node)) pattern. This actually covers all present uses of RB_DECLARE_CALLBACKS, and saves some (source) code duplication in the various RBCOMPUTE function definitions. [walken@google.com: fix mm/vmalloc.c] Link: http://lkml.kernel.org/r/CANN689FXgK13wDYNh1zKxdipeTuALG4eKvKpsdZqKFJ-rvtGiQ@mail.gmail.com [walken@google.com: re-add check to check_augmented()] Link: http://lkml.kernel.org/r/20190727022027.GA86863@google.com Link: http://lkml.kernel.org/r/20190703040156.56953-3-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 19 +++---------- drivers/block/drbd/drbd_interval.c | 29 +++----------------- include/linux/interval_tree_generic.h | 22 ++------------- include/linux/rbtree_augmented.h | 36 ++++++++++++++++++++++++- lib/rbtree_test.c | 37 ++++++++++++-------------- mm/mmap.c | 29 ++++++++++++-------- mm/vmalloc.c | 5 ++-- tools/include/linux/rbtree_augmented.h | 36 ++++++++++++++++++++++++- 8 files changed, 115 insertions(+), 98 deletions(-) diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index fa16036fa592..65ebe4b88f7c 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -54,23 +54,10 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -static u64 compute_subtree_max_end(struct memtype *data) -{ - u64 max_end = data->end, child_max_end; +#define NODE_END(node) ((node)->end) - child_max_end = get_subtree_max_end(data->rb.rb_right); - if (child_max_end > max_end) - max_end = child_max_end; - - child_max_end = get_subtree_max_end(data->rb.rb_left); - if (child_max_end > max_end) - max_end = child_max_end; - - return max_end; -} - -RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, - u64, subtree_max_end, compute_subtree_max_end) +RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb, + struct memtype, rb, u64, subtree_max_end, NODE_END) /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index c58986556161..651bd0236a99 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -13,33 +13,10 @@ sector_t interval_end(struct rb_node *node) return this->end; } -/** - * compute_subtree_last - compute end of @node - * - * The end of an interval is the highest (start + (size >> 9)) value of this - * node and of its children. Called for @node and its parents whenever the end - * may have changed. - */ -static inline sector_t -compute_subtree_last(struct drbd_interval *node) -{ - sector_t max = node->sector + (node->size >> 9); +#define NODE_END(node) ((node)->sector + ((node)->size >> 9)) - if (node->rb.rb_left) { - sector_t left = interval_end(node->rb.rb_left); - if (left > max) - max = left; - } - if (node->rb.rb_right) { - sector_t right = interval_end(node->rb.rb_right); - if (right > max) - max = right; - } - return max; -} - -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb, - sector_t, end, compute_subtree_last); +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, + struct drbd_interval, rb, sector_t, end, NODE_END); /** * drbd_insert_interval - insert a new interval into a tree diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 855476145fe1..aaa8a0767aa3 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -30,26 +30,8 @@ \ /* Callbacks for augmented rbtree insert and remove */ \ \ -static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node) \ -{ \ - ITTYPE max = ITLAST(node), subtree_last; \ - if (node->ITRB.rb_left) { \ - subtree_last = rb_entry(node->ITRB.rb_left, \ - ITSTRUCT, ITRB)->ITSUBTREE; \ - if (max < subtree_last) \ - max = subtree_last; \ - } \ - if (node->ITRB.rb_right) { \ - subtree_last = rb_entry(node->ITRB.rb_right, \ - ITSTRUCT, ITRB)->ITSUBTREE; \ - if (max < subtree_last) \ - max = subtree_last; \ - } \ - return max; \ -} \ - \ -RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \ - ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last) \ +RB_DECLARE_CALLBACKS_MAX(static, ITPREFIX ## _augment, \ + ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, ITLAST) \ \ /* Insert / remove interval nodes from the tree */ \ \ diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 979941600082..e5937e387e02 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -61,7 +61,7 @@ rb_insert_augmented_cached(struct rb_node *node, } /* - * Template for declaring augmented rbtree callbacks + * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure @@ -107,6 +107,40 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .rotate = RBNAME ## _rotate \ }; +/* + * Template for declaring augmented rbtree callbacks, + * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar + */ + +#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +{ \ + RBSTRUCT *child; \ + RBTYPE max = RBCOMPUTE(node); \ + if (node->RBFIELD.rb_left) { \ + child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (node->RBFIELD.rb_right) { \ + child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + return max; \ +} \ +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) + #define RB_RED 0 #define RB_BLACK 1 diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 62b8ee92643d..41ae3c7570d3 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -77,26 +77,10 @@ static inline void erase_cached(struct test_node *node, struct rb_root_cached *r } -static inline u32 augment_recompute(struct test_node *node) -{ - u32 max = node->val, child_augmented; - if (node->rb.rb_left) { - child_augmented = rb_entry(node->rb.rb_left, struct test_node, - rb)->augmented; - if (max < child_augmented) - max = child_augmented; - } - if (node->rb.rb_right) { - child_augmented = rb_entry(node->rb.rb_right, struct test_node, - rb)->augmented; - if (max < child_augmented) - max = child_augmented; - } - return max; -} +#define NODE_VAL(node) ((node)->val) -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb, - u32, augmented, augment_recompute) +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, + struct test_node, rb, u32, augmented, NODE_VAL) static void insert_augmented(struct test_node *node, struct rb_root_cached *root) @@ -238,7 +222,20 @@ static void check_augmented(int nr_nodes) check(nr_nodes); for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) { struct test_node *node = rb_entry(rb, struct test_node, rb); - WARN_ON_ONCE(node->augmented != augment_recompute(node)); + u32 subtree, max = node->val; + if (node->rb.rb_left) { + subtree = rb_entry(node->rb.rb_left, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + if (node->rb.rb_right) { + subtree = rb_entry(node->rb.rb_right, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + WARN_ON_ONCE(node->augmented != max); } } diff --git a/mm/mmap.c b/mm/mmap.c index f1e8c7f93e04..14b7da317ec0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -289,9 +289,9 @@ out: return retval; } -static long vma_compute_subtree_gap(struct vm_area_struct *vma) +static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) { - unsigned long max, prev_end, subtree_gap; + unsigned long gap, prev_end; /* * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we @@ -299,14 +299,21 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) * an unmapped area; whereas when expanding we only require one. * That's a little inconsistent, but keeps the code here simpler. */ - max = vm_start_gap(vma); + gap = vm_start_gap(vma); if (vma->vm_prev) { prev_end = vm_end_gap(vma->vm_prev); - if (max > prev_end) - max -= prev_end; + if (gap > prev_end) + gap -= prev_end; else - max = 0; + gap = 0; } + return gap; +} + +#ifdef CONFIG_DEBUG_VM_RB +static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ + unsigned long max = vma_compute_gap(vma), subtree_gap; if (vma->vm_rb.rb_left) { subtree_gap = rb_entry(vma->vm_rb.rb_left, struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -322,7 +329,6 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) return max; } -#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct mm_struct *mm) { struct rb_root *root = &mm->mm_rb; @@ -428,8 +434,9 @@ static void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif -RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_subtree_gap) +RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, + struct vm_area_struct, vm_rb, + unsigned long, rb_subtree_gap, vma_compute_gap) /* * Update augmented rbtree rb_subtree_gap values after vma->vm_start or @@ -439,8 +446,8 @@ RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, static void vma_gap_update(struct vm_area_struct *vma) { /* - * As it turns out, RB_DECLARE_CALLBACKS() already created a callback - * function that does exactly what we want. + * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created + * a callback function that does exactly what we want. */ vma_gap_callbacks_propagate(&vma->vm_rb, NULL); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fcadd3e25c0c..a3c70e275f4e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -396,9 +396,8 @@ compute_subtree_max_size(struct vmap_area *va) get_subtree_max_size(va->rb_node.rb_right)); } -RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, - struct vmap_area, rb_node, unsigned long, subtree_max_size, - compute_subtree_max_size) +RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index de3a480204ba..4e8c4c76e9a2 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -63,7 +63,7 @@ rb_insert_augmented_cached(struct rb_node *node, } /* - * Template for declaring augmented rbtree callbacks + * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure @@ -109,6 +109,40 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .rotate = RBNAME ## _rotate \ }; +/* + * Template for declaring augmented rbtree callbacks, + * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar + */ + +#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +{ \ + RBSTRUCT *child; \ + RBTYPE max = RBCOMPUTE(node); \ + if (node->RBFIELD.rb_left) { \ + child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (node->RBFIELD.rb_right) { \ + child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + return max; \ +} \ +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) + #define RB_RED 0 #define RB_BLACK 1 From 6d2052d188d962ffb7ad3d413e6ffd5f276aec94 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:10 -0700 Subject: [PATCH 1236/2880] augmented rbtree: rework the RB_DECLARE_CALLBACKS macro definition Change the definition of the RBCOMPUTE function. The propagate callback repeatedly calls RBCOMPUTE as it moves from leaf to root. it wants to stop recomputing once the augmented subtree information doesn't change. This was previously checked using the == operator, but that only works when the augmented subtree information is a scalar field. This commit modifies the RBCOMPUTE function so that it now sets the augmented subtree information instead of returning it, and returns a boolean value indicating if the propagate callback should stop. The motivation for this change is that I want to introduce augmented rbtree uses where the augmented data for the subtree is a struct instead of a scalar. Link: http://lkml.kernel.org/r/20190703040156.56953-4-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 24 ++++++++++++------------ tools/include/linux/rbtree_augmented.h | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index e5937e387e02..fdd421b8d9ae 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -67,22 +67,19 @@ rb_insert_augmented_cached(struct rb_node *node, * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT - * RBTYPE: type of the RBAUGMENTED field - * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ -#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ - RBTYPE augmented = RBCOMPUTE(node); \ - if (node->RBAUGMENTED == augmented) \ + if (RBCOMPUTE(node, true)) \ break; \ - node->RBAUGMENTED = augmented; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ @@ -99,7 +96,7 @@ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ - old->RBAUGMENTED = RBCOMPUTE(old); \ + RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ @@ -122,7 +119,7 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ -static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ @@ -136,10 +133,13 @@ static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ - return max; \ + if (exit && node->RBAUGMENTED == max) \ + return true; \ + node->RBAUGMENTED = max; \ + return false; \ } \ -RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 4e8c4c76e9a2..381aa948610d 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -69,22 +69,19 @@ rb_insert_augmented_cached(struct rb_node *node, * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT - * RBTYPE: type of the RBAUGMENTED field - * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ -#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ - RBTYPE augmented = RBCOMPUTE(node); \ - if (node->RBAUGMENTED == augmented) \ + if (RBCOMPUTE(node, true)) \ break; \ - node->RBAUGMENTED = augmented; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ @@ -101,7 +98,7 @@ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ - old->RBAUGMENTED = RBCOMPUTE(old); \ + RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ @@ -124,7 +121,7 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ -static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ @@ -138,10 +135,13 @@ static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ - return max; \ + if (exit && node->RBAUGMENTED == max) \ + return true; \ + node->RBAUGMENTED = max; \ + return false; \ } \ -RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 From 917cda2790c4bd624c5191b8d9edd12121749e86 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:13 -0700 Subject: [PATCH 1237/2880] kernel-doc: core-api: include string.h into core-api core-api should show all the various string functions including the newly added stracpy and stracpy_pad. Miscellanea: o Update the Returns: value for strscpy o fix a defect with %NUL) [joe@perches.com: correct return of -E2BIG descriptions] Link: http://lkml.kernel.org/r/29f998b4c1a9d69fbeae70500ba0daa4b340c546.1563889130.git.joe@perches.com Link: http://lkml.kernel.org/r/224a6ebf39955f4107c0c376d66155d970e46733.1563841972.git.joe@perches.com Signed-off-by: Joe Perches Reviewed-by: Kees Cook Cc: Jonathan Corbet Cc: Stephen Kitt Cc: Nitin Gote Cc: Rasmus Villemoes Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/kernel-api.rst | 3 +++ include/linux/string.h | 5 +++-- lib/string.c | 10 ++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 08af5caf036d..f77de49b1d51 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -42,6 +42,9 @@ String Manipulation .. kernel-doc:: lib/string.c :export: +.. kernel-doc:: include/linux/string.h + :internal: + .. kernel-doc:: mm/util.c :functions: kstrdup kstrdup_const kstrndup kmemdup kmemdup_nul memdup_user vmemdup_user strndup_user memdup_user_nul diff --git a/include/linux/string.h b/include/linux/string.h index 4deb11f7976b..b2f9df7f0761 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -474,8 +474,9 @@ static inline void memcpy_and_pad(void *dest, size_t dest_len, * But this can lead to bugs due to typos, or if prefix is a pointer * and not a constant. Instead use str_has_prefix(). * - * Returns: 0 if @str does not start with @prefix - strlen(@prefix) if @str does start with @prefix + * Returns: + * * strlen(@prefix) if @str starts with @prefix + * * 0 if @str does not start with @prefix */ static __always_inline size_t str_has_prefix(const char *str, const char *prefix) { diff --git a/lib/string.c b/lib/string.c index 461fb620f85f..f7bc10da4259 100644 --- a/lib/string.c +++ b/lib/string.c @@ -173,8 +173,9 @@ EXPORT_SYMBOL(strlcpy); * doesn't unnecessarily force the tail of the destination buffer to be * zeroed. If zeroing is desired please use strscpy_pad(). * - * Return: The number of characters copied (not including the trailing - * %NUL) or -E2BIG if the destination buffer wasn't big enough. + * Returns: + * * The number of characters copied (not including the trailing %NUL) + * * -E2BIG if count is 0 or @src was truncated. */ ssize_t strscpy(char *dest, const char *src, size_t count) { @@ -253,8 +254,9 @@ EXPORT_SYMBOL(strscpy); * For full explanation of why you may want to consider using the * 'strscpy' functions please see the function docstring for strscpy(). * - * Return: The number of characters copied (not including the trailing - * %NUL) or -E2BIG if the destination buffer wasn't big enough. + * Returns: + * * The number of characters copied (not including the trailing %NUL) + * * -E2BIG if count is 0 or @src was truncated. */ ssize_t strscpy_pad(char *dest, const char *src, size_t count) { From d1a445d3b86c9341ce7a0954c23be0edb5c9bec5 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 25 Sep 2019 16:46:16 -0700 Subject: [PATCH 1238/2880] include/trace/events/writeback.h: fix -Wstringop-truncation warnings There are many of those warnings. In file included from ./arch/powerpc/include/asm/paca.h:15, from ./arch/powerpc/include/asm/current.h:13, from ./include/linux/thread_info.h:21, from ./include/asm-generic/preempt.h:5, from ./arch/powerpc/include/generated/asm/preempt.h:1, from ./include/linux/preempt.h:78, from ./include/linux/spinlock.h:51, from fs/fs-writeback.c:19: In function 'strncpy', inlined from 'perf_trace_writeback_page_template' at ./include/trace/events/writeback.h:56:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix it by using the new strscpy_pad() which was introduced in "lib/string: Add strscpy_pad() function" and will always be NUL-terminated instead of strncpy(). Also, change strlcpy() to use strscpy_pad() in this file for consistency. Link: http://lkml.kernel.org/r/1564075099-27750-1-git-send-email-cai@lca.pw Fixes: 455b2864686d ("writeback: Initial tracing support") Fixes: 028c2dd184c0 ("writeback: Add tracing to balance_dirty_pages") Fixes: e84d0a4f8e39 ("writeback: trace event writeback_queue_io") Fixes: b48c104d2211 ("writeback: trace event bdi_dirty_ratelimit") Fixes: cc1676d917f3 ("writeback: Move requeueing when I_SYNC set to writeback_sb_inodes()") Fixes: 9fb0a7da0c52 ("writeback: add more tracepoints") Signed-off-by: Qian Cai Reviewed-by: Jan Kara Cc: Tobin C. Harding Cc: Steven Rostedt (VMware) Cc: Ingo Molnar Cc: Tejun Heo Cc: Dave Chinner Cc: Fengguang Wu Cc: Jens Axboe Cc: Joe Perches Cc: Kees Cook Cc: Jann Horn Cc: Jonathan Corbet Cc: Nitin Gote Cc: Rasmus Villemoes Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/writeback.h | 38 +++++++++++++++++--------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3a27335fce2c..c2ce6480b4b1 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -66,8 +66,9 @@ DECLARE_EVENT_CLASS(writeback_page_template, ), TP_fast_assign( - strncpy(__entry->name, - mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", + 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), @@ -110,8 +111,8 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ - strncpy(__entry->name, - bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; @@ -316,8 +317,8 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); @@ -360,8 +361,9 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(unsigned int, cgroup_ino) ), TP_fast_assign( - strncpy(__entry->name, - wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + wb->bdi->dev ? dev_name(wb->bdi->dev) : + "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; @@ -414,7 +416,7 @@ DECLARE_EVENT_CLASS(writeback_class, __field(unsigned int, cgroup_ino) ), TP_fast_assign( - strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%u", @@ -436,7 +438,7 @@ TRACE_EVENT(writeback_bdi_register, __array(char, name, 32) ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(bdi->dev), 32); ), TP_printk("bdi %s", __entry->name @@ -461,7 +463,7 @@ DECLARE_EVENT_CLASS(wbc_class, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(bdi->dev), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; @@ -512,7 +514,7 @@ TRACE_EVENT(writeback_queue_io, ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; - strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->older = older_than_this ? *older_than_this : 0; __entry->age = older_than_this ? (jiffies - *older_than_this) * 1000 / HZ : -1; @@ -598,7 +600,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, ), TP_fast_assign( - strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); @@ -663,7 +665,7 @@ TRACE_EVENT(balance_dirty_pages, TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; - strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + @@ -723,8 +725,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; @@ -797,8 +799,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; From 9a156466147b61504f4cbe97ea503e67c21e117a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:46:20 -0700 Subject: [PATCH 1239/2880] strscpy: reject buffer sizes larger than INT_MAX As already done for snprintf(), add a check in strscpy() for giant (i.e. likely negative and/or miscalculated) copy sizes, WARN, and error out. Link: http://lkml.kernel.org/r/201907260928.23DE35406@keescook Signed-off-by: Kees Cook Cc: Joe Perches Cc: Rasmus Villemoes Cc: Yann Droneaud Cc: David Laight Cc: Jonathan Corbet Cc: Stephen Kitt Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/string.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/string.c b/lib/string.c index f7bc10da4259..cd7a10c19210 100644 --- a/lib/string.c +++ b/lib/string.c @@ -183,7 +183,7 @@ ssize_t strscpy(char *dest, const char *src, size_t count) size_t max = count; long res = 0; - if (count == 0) + if (count == 0 || WARN_ON_ONCE(count > INT_MAX)) return -E2BIG; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS From e3f4faa42095cacceeb33c68fda647a8e6d48a90 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Wed, 25 Sep 2019 16:46:23 -0700 Subject: [PATCH 1240/2880] lib/generic-radix-tree.c: make 2 functions static inline When building with W=1, we get some warnings: l CC lib/generic-radix-tree.o lib/generic-radix-tree.c:39:10: warning: no previous prototype for 'genradix_root_to_depth' [-Wmissing-prototypes] 39 | unsigned genradix_root_to_depth(struct genradix_root *r) | ^~~~~~~~~~~~~~~~~~~~~~ lib/generic-radix-tree.c:44:23: warning: no previous prototype for 'genradix_root_to_node' [-Wmissing-prototypes] 44 | struct genradix_node *genradix_root_to_node(struct genradix_root *r) | ^~~~~~~~~~~~~~~~~~~~~ They're not used anywhere else, so make them static inline. Link: http://lkml.kernel.org/r/46923.1565236485@turing-police Signed-off-by: Valdis Kletnieks Cc: Kent Overstreet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/generic-radix-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index a7bafc413730..ae25e2fa2187 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -36,12 +36,12 @@ static inline size_t genradix_depth_size(unsigned depth) #define GENRADIX_DEPTH_MASK \ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) -unsigned genradix_root_to_depth(struct genradix_root *r) +static inline unsigned genradix_root_to_depth(struct genradix_root *r) { return (unsigned long) r & GENRADIX_DEPTH_MASK; } -struct genradix_node *genradix_root_to_node(struct genradix_root *r) +static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) { return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } From 8e72a7a44df5534ae7664240c1fa75e71e11c64a Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Wed, 25 Sep 2019 16:46:26 -0700 Subject: [PATCH 1241/2880] lib/extable.c: add missing prototypes When building with W=1, a number of warnings are issued: CC lib/extable.o lib/extable.c:63:6: warning: no previous prototype for 'sort_extable' [-Wmissing-prototypes] 63 | void sort_extable(struct exception_table_entry *start, | ^~~~~~~~~~~~ lib/extable.c:75:6: warning: no previous prototype for 'trim_init_extable' [-Wmissing-prototypes] 75 | void trim_init_extable(struct module *m) | ^~~~~~~~~~~~~~~~~ lib/extable.c:115:1: warning: no previous prototype for 'search_extable' [-Wmissing-prototypes] 115 | search_extable(const struct exception_table_entry *base, | ^~~~~~~~~~~~~~ Add the missing #include for the prototypes. Link: http://lkml.kernel.org/r/45574.1565235784@turing-police Signed-off-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/extable.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/extable.c b/lib/extable.c index 25da4071122a..c3e59caf7ffa 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef ARCH_HAS_RELATIVE_EXTABLE #define ex_to_insn(x) ((x)->insn) From 091cb0994edd20d67521094ac9c6ec9804058d9a Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 25 Sep 2019 16:46:29 -0700 Subject: [PATCH 1242/2880] lib/hexdump: make print_hex_dump_bytes() a nop on !DEBUG builds I'm seeing a bunch of debug prints from a user of print_hex_dump_bytes() in my kernel logs, but I don't have CONFIG_DYNAMIC_DEBUG enabled nor do I have DEBUG defined in my build. The problem is that print_hex_dump_bytes() calls a wrapper function in lib/hexdump.c that calls print_hex_dump() with KERN_DEBUG level. There are three cases to consider here 1. CONFIG_DYNAMIC_DEBUG=y --> call dynamic_hex_dum() 2. CONFIG_DYNAMIC_DEBUG=n && DEBUG --> call print_hex_dump() 3. CONFIG_DYNAMIC_DEBUG=n && !DEBUG --> stub it out Right now, that last case isn't detected and we still call print_hex_dump() from the stub wrapper. Let's make print_hex_dump_bytes() only call print_hex_dump_debug() so that it works properly in all cases. Case #1, print_hex_dump_debug() calls dynamic_hex_dump() and we get same behavior. Case #2, print_hex_dump_debug() calls print_hex_dump() with KERN_DEBUG and we get the same behavior. Case #3, print_hex_dump_debug() is a nop, changing behavior to what we want, i.e. print nothing. Link: http://lkml.kernel.org/r/20190816235624.115280-1-swboyd@chromium.org Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 22 +++++++++++++++------- lib/hexdump.c | 21 --------------------- 2 files changed, 15 insertions(+), 28 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index cefd374c47b1..c09d67edda3a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -488,13 +488,6 @@ extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); -#if defined(CONFIG_DYNAMIC_DEBUG) -#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ - dynamic_hex_dump(prefix_str, prefix_type, 16, 1, buf, len, true) -#else -extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len); -#endif /* defined(CONFIG_DYNAMIC_DEBUG) */ #else static inline void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, @@ -526,4 +519,19 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, } #endif +/** + * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params + * @prefix_str: string to prefix each line with; + * caller supplies trailing spaces for alignment if desired + * @prefix_type: controls whether prefix of an offset, address, or none + * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) + * @buf: data blob to dump + * @len: number of bytes in the @buf + * + * Calls print_hex_dump(), with log level of KERN_DEBUG, + * rowsize of 16, groupsize of 1, and ASCII output included. + */ +#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ + print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) + #endif diff --git a/lib/hexdump.c b/lib/hexdump.c index b1d55b669ae2..147133f8eb2f 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -270,25 +270,4 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, } EXPORT_SYMBOL(print_hex_dump); -#if !defined(CONFIG_DYNAMIC_DEBUG) -/** - * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params - * @prefix_str: string to prefix each line with; - * caller supplies trailing spaces for alignment if desired - * @prefix_type: controls whether prefix of an offset, address, or none - * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) - * @buf: data blob to dump - * @len: number of bytes in the @buf - * - * Calls print_hex_dump(), with log level of KERN_DEBUG, - * rowsize of 16, groupsize of 1, and ASCII output included. - */ -void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len) -{ - print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1, - buf, len, true); -} -EXPORT_SYMBOL(print_hex_dump_bytes); -#endif /* !defined(CONFIG_DYNAMIC_DEBUG) */ #endif /* defined(CONFIG_PRINTK) */ From 634cffcc9478e954d121c3e27e53de4f0d917ac8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:32 -0700 Subject: [PATCH 1243/2880] checkpatch: don't interpret stack dumps as commit IDs Add more types of lines that appear to be stack dumps that also include hex lines that might otherwise be interpreted as commit IDs. Link: http://lkml.kernel.org/r/ff00208289224f0ca4eaf4ff7c9c6e087dad0a63.camel@perches.com Link: http://lkml.kernel.org/r/f7dc9727795db3802809a24162abe0b67e14123b.1563575364.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 93a7edfe0f05..3c0ee0dde850 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2725,8 +2725,10 @@ sub process { ($line =~ /^\s*(?:WARNING:|BUG:)/ || $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ || # timestamp - $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) { - # stack dump address + $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/) || + $line =~ /^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}/ || + $line =~ /^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+/) { + # stack dump address styles $commit_log_possible_stack_dump = 1; } From ffbce8974d90efb5ced95fabc61283467c80cb0d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:35 -0700 Subject: [PATCH 1244/2880] checkpatch: improve SPDX license checking Use perl's m@@ match and not // comparisons to avoid an error using c90's // comment style. Miscellanea: o Use normal tab indentation and alignment Link: http://lkml.kernel.org/r/5e4a8fa7901148fbcd77ab391e6dd0e6bf95777f.camel@perches.com Link: http://lkml.kernel.org/r/f08eb62458407a145cfedf959d1091af151cd665.1563575364.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3c0ee0dde850..6cb99ec62000 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3071,21 +3071,21 @@ sub process { # check SPDX comment style for .[chsS] files if ($realfile =~ /\.[chsS]$/ && $rawline =~ /SPDX-License-Identifier:/ && - $rawline !~ /^\+\s*\Q$comment\E\s*/) { + $rawline !~ m@^\+\s*\Q$comment\E\s*@) { WARN("SPDX_LICENSE_TAG", "Improper SPDX comment style for '$realfile', please use '$comment' instead\n" . $herecurr); } if ($comment !~ /^$/ && - $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) { - WARN("SPDX_LICENSE_TAG", - "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); + $rawline !~ m@^\+\Q$comment\E SPDX-License-Identifier: @) { + WARN("SPDX_LICENSE_TAG", + "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); } elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) { - my $spdx_license = $1; - if (!is_SPDX_License_valid($spdx_license)) { - WARN("SPDX_LICENSE_TAG", - "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); - } + my $spdx_license = $1; + if (!is_SPDX_License_valid($spdx_license)) { + WARN("SPDX_LICENSE_TAG", + "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); + } } } } From a8dd86bf746256fbf68f82bc13356244c5ad8efa Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Wed, 25 Sep 2019 16:46:38 -0700 Subject: [PATCH 1245/2880] checkpatch.pl: warn on invalid commit id It can happen that a commit message refers to an invalid commit id, because the referenced hash changed following a rebase, or simply by mistake. Add a check in checkpatch.pl which checks that an hash referenced by a Fixes tag, or just cited in the commit message, is a valid commit id. $ scripts/checkpatch.pl <<'EOF' Subject: [PATCH] test commit Sample test commit to test checkpatch.pl Commit 1da177e4c3f4 ("Linux-2.6.12-rc2") really exists, commit 0bba044c4ce7 ("tree") is valid but not a commit, while commit b4cc0b1c0cca ("unknown") is invalid. Fixes: f0cacc14cade ("unknown") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") EOF WARNING: Unknown commit id '0bba044c4ce7', maybe rebased or not pulled? #8: commit 0bba044c4ce7 ("tree") is valid but not a commit, WARNING: Unknown commit id 'b4cc0b1c0cca', maybe rebased or not pulled? #9: while commit b4cc0b1c0cca ("unknown") is invalid. WARNING: Unknown commit id 'f0cacc14cade', maybe rebased or not pulled? #11: Fixes: f0cacc14cade ("unknown") total: 0 errors, 3 warnings, 4 lines checked Link: http://lkml.kernel.org/r/20190711001640.13398-1-mcroce@redhat.com Signed-off-by: Matteo Croce Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 6cb99ec62000..0b1388078b69 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2900,6 +2900,17 @@ sub process { } } +# check for invalid commit id + if ($in_commit_log && $line =~ /(^fixes:|\bcommit)\s+([0-9a-f]{6,40})\b/i) { + my $id; + my $description; + ($id, $description) = git_commit_info($2, undef, undef); + if (!defined($id)) { + WARN("UNKNOWN_COMMIT_ID", + "Unknown commit id '$2', maybe rebased or not pulled?\n" . $herecurr); + } + } + # ignore non-hunk lines and lines being removed next if (!$hunk_line || $line =~ /^-/); From 6dba824e9ef7155f58f11b268b0f98ecec31b723 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 25 Sep 2019 16:46:41 -0700 Subject: [PATCH 1246/2880] checkpatch: exclude sizeof sub-expressions from MACRO_ARG_REUSE The arguments of sizeof are not evaluated so arguments are safe to re-use in that context. Excluding sizeof subexpressions means macros like ARRAY_SIZE can pass checkpatch. Link: http://lkml.kernel.org/r/20190806070833.24423-1-brendan.jackman@bluwireless.co.uk Signed-off-by: Brendan Jackman Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0b1388078b69..d7466879d505 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5204,7 +5204,7 @@ sub process { next if ($arg =~ /\.\.\./); next if ($arg =~ /^type$/i); my $tmp_stmt = $define_stmt; - $tmp_stmt =~ s/\b(typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; + $tmp_stmt =~ s/\b(sizeof|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; $tmp_stmt =~ s/\#+\s*$arg\b//g; $tmp_stmt =~ s/\b$arg\s*\#\#//g; my $use_cnt = () = $tmp_stmt =~ /\b$arg\b/g; From 462811d9d4007abe229c4cede0c124ed631da0cc Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:44 -0700 Subject: [PATCH 1247/2880] checkpatch: prefer __section over __attribute__((section(...))) Add another test for __attribute__((section("foo"))) uses that should be __section(foo) Link: http://lkml.kernel.org/r/2f374c3c27054b7f978115270d587c624d9962fc.camel@perches.com Suggested-by: Nick Desaulniers Signed-off-by: Joe Perches Tested-by: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d7466879d505..1f85a3abbd17 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5886,6 +5886,18 @@ sub process { "__aligned(size) is preferred over __attribute__((aligned(size)))\n" . $herecurr); } +# Check for __attribute__ section, prefer __section + if ($realfile !~ m@\binclude/uapi/@ && + $line =~ /\b__attribute__\s*\(\s*\(.*_*section_*\s*\(\s*("[^"]*")/) { + my $old = substr($rawline, $-[1], $+[1] - $-[1]); + my $new = substr($old, 1, -1); + if (WARN("PREFER_SECTION", + "__section($new) is preferred over __attribute__((section($old)))\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*_*section_*\s*\(\s*\Q$old\E\s*\)\s*\)\s*\)/__section($new)/; + } + } + # Check for __attribute__ format(printf, prefer __printf if ($realfile !~ m@\binclude/uapi/@ && $line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { From 94fb98450456da82a16a378816390d99b85edb55 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:47 -0700 Subject: [PATCH 1248/2880] checkpatch: allow consecutive close braces checkpatch allows consecutive open braces, so it should also allow consecutive close braces. Link: http://lkml.kernel.org/r/bfdb49ae2c3fa7b52fa168769e38b48f959880e2.camel@perches.com Signed-off-by: Joe Perches Acked-by: Jeff Kirsher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1f85a3abbd17..7603711503ea 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4673,7 +4673,7 @@ sub process { # closing brace should have a space following it when it has anything # on the line - if ($line =~ /}(?!(?:,|;|\)))\S/) { + if ($line =~ /}(?!(?:,|;|\)|\}))\S/) { if (ERROR("SPACING", "space required after that close brace '}'\n" . $herecurr) && $fix) { From 5a7f4455ad321400e1361ab94fd6858c5b2fe0cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 25 Sep 2019 16:46:49 -0700 Subject: [PATCH 1249/2880] checkpatch: remove obsolete period from "ambiguous SHA1" query Git dropped the period from its "ambiguous SHA1" error message in commit 0c99171ad2 ("get_short_sha1: mark ambiguity error for translation"), circa 2016. Drop the period from checkpatch's associated query so as to match both the old and new error messages. Link: http://lkml.kernel.org/r/20190830163103.15914-1-sean.j.christopherson@intel.com Signed-off-by: Sean Christopherson Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7603711503ea..4b25a94cfc36 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -962,7 +962,7 @@ sub git_commit_info { return ($id, $desc) if ($#lines < 0); - if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous\./) { + if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous/) { # Maybe one day convert this block of bash into something that returns # all matching commit ids, but it's very slow... # From dbbf869da3adeba4a6beae4ecc184e47a16d078d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:52 -0700 Subject: [PATCH 1250/2880] checkpatch: make git output use LANGUAGE=en_US.utf8 git output parsing depends on the language being en_US english. Make the backtick execution of all `git ` commands set the LANGUAGE of the process to en_US.utf8 before executing the actual command using `export LANGUAGE=en_US.utf8; git `. Because the command is executed in a child process, the parent LANGUAGE is unchanged. Link: http://lkml.kernel.org/r/bb9f29988f3258281956680ff39c3e19e37dc0b8.camel@perches.com Signed-off-by: Joe Perches Reported-by: Sean Christopherson Reviewed-by: Sean Christopherson Tested-by: Sean Christopherson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4b25a94cfc36..4eb355d8ae73 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -62,6 +62,8 @@ my $conststructsfile = "$D/const_structs.checkpatch"; my $typedefsfile = ""; my $color = "auto"; my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANCE +# git output parsing needs US English output, so first set backtick child process LANGUAGE +my $git_command ='export LANGUAGE=en_US.UTF-8; git'; sub help { my ($exitcode) = @_; @@ -904,7 +906,7 @@ sub seed_camelcase_includes { $camelcase_seeded = 1; if (-e ".git") { - my $git_last_include_commit = `git log --no-merges --pretty=format:"%h%n" -1 -- include`; + my $git_last_include_commit = `${git_command} log --no-merges --pretty=format:"%h%n" -1 -- include`; chomp $git_last_include_commit; $camelcase_cache = ".checkpatch-camelcase.git.$git_last_include_commit"; } else { @@ -932,7 +934,7 @@ sub seed_camelcase_includes { } if (-e ".git") { - $files = `git ls-files "include/*.h"`; + $files = `${git_command} ls-files "include/*.h"`; @include_files = split('\n', $files); } @@ -956,7 +958,7 @@ sub git_commit_info { return ($id, $desc) if ((which("git") eq "") || !(-e ".git")); - my $output = `git log --no-color --format='%H %s' -1 $commit 2>&1`; + my $output = `${git_command} log --no-color --format='%H %s' -1 $commit 2>&1`; $output =~ s/^\s*//gm; my @lines = split("\n", $output); @@ -1006,7 +1008,7 @@ if ($git) { } else { $git_range = "-1 $commit_expr"; } - my $lines = `git log --no-color --no-merges --pretty=format:'%H %s' $git_range`; + my $lines = `${git_command} log --no-color --no-merges --pretty=format:'%H %s' $git_range`; foreach my $line (split(/\n/, $lines)) { $line =~ /^([0-9a-fA-F]{40,40}) (.*)$/; next if (!defined($1) || !defined($2)); From d256085be12dc3b24e7b19c357e975a37dbff509 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 25 Sep 2019 16:46:55 -0700 Subject: [PATCH 1251/2880] fs: reiserfs: remove unnecessary check of bh in remove_from_transaction() On lines 3430-3434, bh has been assured to be non-null: cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); if (!cn || !cn->bh) { return ret; } bh = cn->bh; Thus, the check of bh on line 3447 is unnecessary and can be removed. Thank Andrew Morton for good advice. Link: http://lkml.kernel.org/r/20190727084019.11307-1-baijiaju1990@gmail.com Signed-off-by: Jia-Ju Bai Reviewed-by: Jan Kara Cc: Arnd Bergmann Cc: Hariprasad Kelam Cc: Bharath Vedartham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4517a1394c6f..11155b8513db 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -3444,9 +3444,8 @@ static int remove_from_transaction(struct super_block *sb, if (cn == journal->j_last) { journal->j_last = cn->prev; } - if (bh) - remove_journal_hash(sb, journal->j_hash_table, NULL, - bh->b_blocknr, 0); + remove_journal_hash(sb, journal->j_hash_table, NULL, + bh->b_blocknr, 0); clear_buffer_journaled(bh); /* don't log this one */ if (!already_cleaned) { From 6e9ca45f77bc944d44face28059ab4db02e280fa Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:46:58 -0700 Subject: [PATCH 1252/2880] fs/reiserfs/journal.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/journal.c: In function flush_older_commits: fs/reiserfs/journal.c:894:15: warning: variable first_trans_id set but not used [-Wunused-but-set-variable] fs/reiserfs/journal.c: In function flush_journal_list: fs/reiserfs/journal.c:1354:38: warning: variable last set but not used [-Wunused-but-set-variable] fs/reiserfs/journal.c: In function do_journal_release: fs/reiserfs/journal.c:1916:6: warning: variable flushed set but not used [-Wunused-but-set-variable] fs/reiserfs/journal.c: In function do_journal_end: fs/reiserfs/journal.c:3993:6: warning: variable old_start set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-2-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 11155b8513db..75d9d52d489f 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -891,7 +891,6 @@ static int flush_older_commits(struct super_block *s, struct list_head *entry; unsigned int trans_id = jl->j_trans_id; unsigned int other_trans_id; - unsigned int first_trans_id; find_first: /* @@ -914,8 +913,6 @@ find_first: return 0; } - first_trans_id = first_jl->j_trans_id; - entry = &first_jl->j_list; while (1) { other_jl = JOURNAL_LIST_ENTRY(entry); @@ -1351,7 +1348,7 @@ static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { struct reiserfs_journal_list *pjl; - struct reiserfs_journal_cnode *cn, *last; + struct reiserfs_journal_cnode *cn; int count; int was_jwait = 0; int was_dirty = 0; @@ -1509,7 +1506,6 @@ static int flush_journal_list(struct super_block *s, b_blocknr, __func__); } free_cnode: - last = cn; cn = cn->next; if (saved_bh) { /* @@ -1911,7 +1907,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb, int error) { struct reiserfs_transaction_handle myth; - int flushed = 0; struct reiserfs_journal *journal = SB_JOURNAL(sb); /* @@ -1933,7 +1928,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, 1); journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); do_journal_end(&myth, FLUSH_ALL); - flushed = 1; } } @@ -3987,7 +3981,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) struct buffer_head *c_bh; /* commit bh */ struct buffer_head *d_bh; /* desc bh */ int cur_write_start = 0; /* start index of current log write */ - int old_start; int i; int flush; int wait_on_commit; @@ -4244,7 +4237,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) journal->j_num_work_lists++; /* reset journal values for the next transaction */ - old_start = journal->j_start; journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(sb); From 66985cb9ee107f2596e9d721252b679249c41858 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:01 -0700 Subject: [PATCH 1253/2880] fs/reiserfs/stree.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/stree.c: In function search_by_key: fs/reiserfs/stree.c:596:6: warning: variable right_neighbor_of_leaf_node set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-3-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/stree.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 0037aea97d39..da9ebe33882b 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -593,7 +593,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, struct buffer_head *bh; struct path_element *last_element; int node_level, retval; - int right_neighbor_of_leaf_node; int fs_gen; struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; @@ -614,8 +613,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, pathrelse(search_path); - right_neighbor_of_leaf_node = 0; - /* * With each iteration of this loop we search through the items in the * current node, and calculate the next current node(next path element) @@ -701,7 +698,6 @@ io_error: */ block_number = SB_ROOT_BLOCK(sb); expected_level = -1; - right_neighbor_of_leaf_node = 0; /* repeat search from the root */ continue; From d4a1a857e31abe7acc9586450119d9c4bc6cc2cd Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:04 -0700 Subject: [PATCH 1254/2880] fs/reiserfs/lbalance.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/lbalance.c: In function leaf_paste_entries: fs/reiserfs/lbalance.c:1325:9: warning: variable old_entry_num set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-4-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/lbalance.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index f5cebd70d903..7f868569d4d0 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -1322,7 +1322,7 @@ void leaf_paste_entries(struct buffer_info *bi, char *item; struct reiserfs_de_head *deh; char *insert_point; - int i, old_entry_num; + int i; struct buffer_head *bh = bi->bi_bh; if (new_entry_count == 0) @@ -1362,7 +1362,6 @@ void leaf_paste_entries(struct buffer_info *bi, put_deh_location(&deh[i], deh_location(&deh[i]) + paste_size); - old_entry_num = ih_entry_count(ih); put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); /* prepare space for pasted records */ From 4a70aebb12689b2e7b51c4b10157c4c4e99c59e4 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:07 -0700 Subject: [PATCH 1255/2880] fs/reiserfs/objectid.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/objectid.c: In function reiserfs_convert_objectid_map_v1: fs/reiserfs/objectid.c:186:25: warning: variable new_objectid_map set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-5-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/objectid.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 415d66ca87d1..34baf5c0f265 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -183,13 +183,12 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s) int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; int old_max = sb_oid_maxsize(disk_sb); struct reiserfs_super_block_v1 *disk_sb_v1; - __le32 *objectid_map, *new_objectid_map; + __le32 *objectid_map; int i; disk_sb_v1 = (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); objectid_map = (__le32 *) (disk_sb_v1 + 1); - new_objectid_map = (__le32 *) (disk_sb + 1); if (cur_size > new_size) { /* From 73fbff5eea3c0684519c0d8b60177557b5ac8f9b Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:10 -0700 Subject: [PATCH 1256/2880] fs/reiserfs/prints.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/prints.c: In function check_internal_block_head: fs/reiserfs/prints.c:749:21: warning: variable blkh set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-6-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/prints.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 9fed1c05f1f4..500f2000eb41 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -746,9 +746,6 @@ static void check_leaf_block_head(struct buffer_head *bh) static void check_internal_block_head(struct buffer_head *bh) { - struct block_head *blkh; - - blkh = B_BLK_HEAD(bh); if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); From 4fadcd1c14d810ec6a695039cfc71e03ae742deb Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:13 -0700 Subject: [PATCH 1257/2880] fs/reiserfs/fix_node.c: remove set but not used variables fs/reiserfs/fix_node.c: In function get_num_ver: fs/reiserfs/fix_node.c:379:6: warning: variable cur_free set but not used [-Wunused-but-set-variable] fs/reiserfs/fix_node.c: In function dc_check_balance_internal: fs/reiserfs/fix_node.c:1737:6: warning: variable maxsize set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-7-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/fix_node.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 6b0ddb2a9091..117092224111 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -376,7 +376,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, int to, int to_bytes, short *snum012, int flow) { int i; - int cur_free; int units; struct virtual_node *vn = tb->tb_vn; int total_node_size, max_node_size, current_item_size; @@ -438,7 +437,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, /* leaf level */ needed_nodes = 1; total_node_size = 0; - cur_free = max_node_size; /* start from 'from'-th item */ start_item = from; @@ -1734,14 +1732,12 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) * and Fh is its father. */ struct buffer_head *Sh, *Fh; - int maxsize, ret; + int ret; int lfree, rfree /* free space in L and R */ ; Sh = PATH_H_PBUFFER(tb->tb_path, h); Fh = PATH_H_PPARENT(tb->tb_path, h); - maxsize = MAX_CHILD_SIZE(Sh); - /* * using tb->insert_size[h], which is negative in this case, * create_virtual_node calculates: From da5184c2ab10b57bf9b58f818405aa0054a2f829 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:16 -0700 Subject: [PATCH 1258/2880] fs/reiserfs/do_balan.c: remove set but not used variables fs/reiserfs/do_balan.c: In function balance_leaf_when_delete: fs/reiserfs/do_balan.c:245:20: warning: variable ih set but not used [-Wunused-but-set-variable] fs/reiserfs/do_balan.c: In function balance_leaf_insert_left: fs/reiserfs/do_balan.c:301:7: warning: variable version set but not used [-Wunused-but-set-variable] fs/reiserfs/do_balan.c: In function balance_leaf_insert_right: fs/reiserfs/do_balan.c:649:7: warning: variable version set but not used [-Wunused-but-set-variable] fs/reiserfs/do_balan.c: In function balance_leaf_new_nodes_insert: fs/reiserfs/do_balan.c:953:7: warning: variable version set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-8-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/do_balan.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 9c02d96d3a42..ffb6d7f0da94 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -239,10 +239,8 @@ static int balance_leaf_when_delete_left(struct tree_balance *tb) static int balance_leaf_when_delete(struct tree_balance *tb, int flag) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); struct buffer_info bi; int n; - struct item_head *ih; RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, "vs- 12000: level: wrong FR %z", tb->FR[0]); @@ -251,7 +249,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), "PAP-12010: tree can not be empty"); - ih = item_head(tbS0, item_pos); buffer_info_init_tbS0(tb, &bi); /* Delete or truncate the item */ @@ -298,7 +295,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* part of new item falls into L[0] */ int new_item_len, shift; - int version; ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); @@ -317,8 +313,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, min_t(int, tb->zeroes_num, ih_item_len(ih))); - version = ih_version(ih); - /* * Calculate key component, item length and body to * insert into S[0] @@ -646,13 +640,11 @@ static void balance_leaf_insert_right(struct tree_balance *tb, if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { loff_t old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version, shift; + int shift; loff_t offset; leaf_shift_right(tb, tb->rnum[0] - 1, -1); - version = ih_version(ih); - /* Remember key component and item length */ old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); @@ -950,14 +942,12 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb, if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { int old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version; /* Move snum[i]-1 items from S[0] to S_new[i] */ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, tb->S_new[i]); /* Remember key component and item length */ - version = ih_version(ih); old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); From 3e9fd5a48cb7b0ef93be097c2c1066738d37f5b7 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 25 Sep 2019 16:47:19 -0700 Subject: [PATCH 1259/2880] fs/reiserfs/journal.c: remove set but not used variable Fix the following gcc warning: fs/reiserfs/journal.c: In function flush_used_journal_lists: fs/reiserfs/journal.c:1791:6: warning: variable ret set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/20190827032932.46622-1-yanaijie@huawei.com Signed-off-by: Jason Yan Cc: zhengbin Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 75d9d52d489f..4b3e3e73b512 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1788,7 +1788,6 @@ static int flush_used_journal_lists(struct super_block *s, { unsigned long len = 0; unsigned long cur_len; - int ret; int i; int limit = 256; struct reiserfs_journal_list *tjl; @@ -1825,9 +1824,9 @@ static int flush_used_journal_lists(struct super_block *s, * transactions, but only bother if we've actually spanned * across multiple lists */ - if (flush_jl != jl) { - ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); - } + if (flush_jl != jl) + kupdate_transactions(s, jl, &tjl, &trans_id, len, i); + flush_journal_list(s, flush_jl, 1); put_journal_list(s, flush_jl); put_journal_list(s, jl); From b25bab17221ba366ffdff9dd62945932aac7dc98 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 25 Sep 2019 16:47:22 -0700 Subject: [PATCH 1260/2880] fs/reiserfs/do_balan.c: remove set but not used variable Fix the following gcc warning: fs/reiserfs/do_balan.c: In function balance_leaf_insert_right: fs/reiserfs/do_balan.c:629:6: warning: variable ret set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/20190827032932.46622-2-yanaijie@huawei.com Signed-off-by: Jason Yan Cc: zhengbin Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/do_balan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index ffb6d7f0da94..4075e41408b4 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -626,7 +626,6 @@ static void balance_leaf_insert_right(struct tree_balance *tb, struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); struct buffer_info bi; - int ret; /* new item or part of it doesn't fall into R[0] */ if (n - tb->rnum[0] >= tb->item_pos) { @@ -690,7 +689,7 @@ static void balance_leaf_insert_right(struct tree_balance *tb, /* whole new item falls into R[0] */ /* Shift rnum[0]-1 items to R[0] */ - ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); + leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); /* Insert new item into R[0] */ buffer_info_init_right(tb, &bi); From aadc4e01dbaaccd38abde03bc84d0332a6bd9eab Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 25 Sep 2019 16:47:24 -0700 Subject: [PATCH 1261/2880] fat: delete an unnecessary check before brelse() brelse() tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Link: http://lkml.kernel.org/r/cfff3b81-fb5d-af26-7b5e-724266509045@web.de Signed-off-by: Markus Elfring Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 814ad2c2ba80..054acd9fd033 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -88,9 +88,7 @@ static int fat__get_entry(struct inode *dir, loff_t *pos, int err, offset; next: - if (*bh) - brelse(*bh); - + brelse(*bh); *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); From 8495f7e6732ed248b648d36439795b42ec650b9e Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 25 Sep 2019 16:47:27 -0700 Subject: [PATCH 1262/2880] fork: improve error message for corrupted page tables When a user process exits, the kernel cleans up the mm_struct of the user process and during cleanup, check_mm() checks the page tables of the user process for corruption (E.g: unexpected page flags set/cleared). For corrupted page tables, the error message printed by check_mm() isn't very clear as it prints the loop index instead of page table type (E.g: Resident file mapping pages vs Resident shared memory pages). The loop index in check_mm() is used to index rss_stat[] which represents individual memory type stats. Hence, instead of printing index, print memory type, thereby improving error message. Without patch: -------------- [ 204.836425] mm/pgtable-generic.c:29: bad p4d 0000000089eb4e92(800000025f941467) [ 204.836544] BUG: Bad rss-counter state mm:00000000f75895ea idx:0 val:2 [ 204.836615] BUG: Bad rss-counter state mm:00000000f75895ea idx:1 val:5 [ 204.836685] BUG: non-zero pgtables_bytes on freeing mm: 20480 With patch: ----------- [ 69.815453] mm/pgtable-generic.c:29: bad p4d 0000000084653642(800000025ca37467) [ 69.815872] BUG: Bad rss-counter state mm:00000000014a6c03 type:MM_FILEPAGES val:2 [ 69.815962] BUG: Bad rss-counter state mm:00000000014a6c03 type:MM_ANONPAGES val:5 [ 69.816050] BUG: non-zero pgtables_bytes on freeing mm: 20480 Also, change print function (from printk(KERN_ALERT, ..) to pr_alert()) so that it matches the other print statement. Link: http://lkml.kernel.org/r/da75b5153f617f4c5739c08ee6ebeb3d19db0fbc.1565123758.git.sai.praneeth.prakhya@intel.com Signed-off-by: Sai Praneeth Prakhya Reviewed-by: Anshuman Khandual Suggested-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types_task.h | 4 ++++ kernel/fork.c | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index d7016dcb245e..c1bc6731125c 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -36,6 +36,10 @@ struct vmacache { struct vm_area_struct *vmas[VMACACHE_SIZE]; }; +/* + * When updating this, please also update struct resident_page_types[] in + * kernel/fork.c + */ enum { MM_FILEPAGES, /* Resident file mapping pages */ MM_ANONPAGES, /* Resident anonymous pages */ diff --git a/kernel/fork.c b/kernel/fork.c index 5a0fd518e04e..60763c043aa3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -125,6 +125,15 @@ int nr_threads; /* The idle threads do not count.. */ static int max_threads; /* tunable limit on nr_threads */ +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) + +static const char * const resident_page_types[] = { + NAMED_ARRAY_INDEX(MM_FILEPAGES), + NAMED_ARRAY_INDEX(MM_ANONPAGES), + NAMED_ARRAY_INDEX(MM_SWAPENTS), + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), +}; + DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ @@ -645,12 +654,15 @@ static void check_mm(struct mm_struct *mm) { int i; + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, + "Please make sure 'struct resident_page_types[]' is updated as well"); + for (i = 0; i < NR_MM_COUNTERS; i++) { long x = atomic_long_read(&mm->rss_stat.count[i]); if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", + mm, resident_page_types[i], x); } if (mm_pgtables_bytes(mm)) From 2a4a4082cd4438333b5ecffdd15d1a484e5a83c7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 25 Sep 2019 16:47:30 -0700 Subject: [PATCH 1263/2880] cpumask: nicer for_each_cpumask_and() signature Mask arguments can be swapped without changing anything. Make arguments names reflect that: #define for_each_cpu_and(cpu, mask1, mask2) Link: http://lkml.kernel.org/r/20190724183350.GA15041@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b5a5a1ed9efd..78a73eba64dd 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -200,8 +200,8 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node) for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) -#define for_each_cpu_and(cpu, mask, and) \ - for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and) +#define for_each_cpu_and(cpu, mask1, mask2) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) #else /** * cpumask_first - get the first cpu in a cpumask @@ -290,20 +290,20 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator - * @mask: the first cpumask pointer - * @and: the second cpumask pointer + * @mask1: the first cpumask pointer + * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; - * cpumask_and(&tmp, &mask, &and); + * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ -#define for_each_cpu_and(cpu, mask, and) \ +#define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = -1; \ - (cpu) = cpumask_next_and((cpu), (mask), (and)), \ + (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) #endif /* SMP */ From 7c3a6aedcd6aae0a32a527e68669f7dd667492d1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 25 Sep 2019 16:47:33 -0700 Subject: [PATCH 1264/2880] kexec: bail out upon SIGKILL when allocating memory. syzbot found that a thread can stall for minutes inside kexec_load() after that thread was killed by SIGKILL [1]. It turned out that the reproducer was trying to allocate 2408MB of memory using kimage_alloc_page() from kimage_load_normal_segment(). Let's check for SIGKILL before doing memory allocation. [1] https://syzkaller.appspot.com/bug?id=a0e3436829698d5824231251fad9d8e998f94f5e Link: http://lkml.kernel.org/r/993c9185-d324-2640-d061-bed2dd18b1f7@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reported-by: syzbot Cc: Eric Biederman Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d5870723b8ad..15d70a90b50d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -300,6 +300,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; + if (fatal_signal_pending(current)) + return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; From d5372c39132958679c480d0295dd328c741c7a41 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 25 Sep 2019 16:47:36 -0700 Subject: [PATCH 1265/2880] kexec: restore arch_kexec_kernel_image_probe declaration arch_kexec_kernel_image_probe function declaration has been removed by commit 9ec4ecef0af7 ("kexec_file,x86,powerpc: factor out kexec_file_ops functions"). Still this function is overridden by couple of architectures and proper prototype declaration is therefore important, so bring it back. This fixes the following sparse warning on s390: arch/s390/kernel/machine_kexec_file.c:333:5: warning: symbol 'arch_kexec_kernel_image_probe' was not declared. Should it be static? Link: http://lkml.kernel.org/r/patch.git-ff1c9045ebdc.your-ad-here.call-01564402297-ext-5690@work.hours Signed-off-by: Vasily Gorbik Acked-by: Dave Young Reviewed-by: Bhupesh Sharma Cc: Eric Biederman Cc: AKASHI Takahiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0b809258ed3..cc162f3e6461 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -183,6 +183,8 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len); void * __weak arch_kexec_kernel_image_load(struct kimage *image); int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, From 9dd819a15162f8f82a6001b090caa38c18297b39 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:39 -0700 Subject: [PATCH 1266/2880] uaccess: add missing __must_check attributes The usercopy implementation comments describe that callers of the copy_*_user() family of functions must always have their return values checked. This can be enforced at compile time with __must_check, so add it where needed. Link: http://lkml.kernel.org/r/201908251609.ADAD5CAAC1@keescook Signed-off-by: Kees Cook Cc: Alexander Viro Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 2 +- include/linux/uaccess.h | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 8d8821b3689a..659a4400517b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -134,7 +134,7 @@ static inline void copy_overflow(int size, unsigned long count) WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -static __always_inline bool +static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __compiletime_object_size(addr); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 34a038563d97..70bbdc38dc37 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -55,7 +55,7 @@ * as usual) and both source and destination can trigger faults. */ -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); @@ -63,7 +63,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) return raw_copy_from_user(to, from, n); } -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); @@ -85,7 +85,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); @@ -93,7 +93,7 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) return raw_copy_to_user(to, from, n); } -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); @@ -103,7 +103,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) } #ifdef INLINE_COPY_FROM_USER -static inline unsigned long +static inline __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; @@ -117,12 +117,12 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return res; } #else -extern unsigned long +extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif #ifdef INLINE_COPY_TO_USER -static inline unsigned long +static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); @@ -133,7 +133,7 @@ _copy_to_user(void __user *to, const void *from, unsigned long n) return n; } #else -extern unsigned long +extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif @@ -222,8 +222,9 @@ static inline bool pagefault_disabled(void) #ifndef ARCH_HAS_NOCACHE_UACCESS -static inline unsigned long __copy_from_user_inatomic_nocache(void *to, - const void __user *from, unsigned long n) +static inline __must_check unsigned long +__copy_from_user_inatomic_nocache(void *to, const void __user *from, + unsigned long n) { return __copy_from_user_inatomic(to, from, n); } From ac7c3e4ff401b304489a031938dbeaab585bfe0a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 25 Sep 2019 16:47:42 -0700 Subject: [PATCH 1267/2880] compiler: enable CONFIG_OPTIMIZE_INLINING forcibly Commit 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING") allowed all architectures to enable this option. A couple of build errors were reported by randconfig, but all of them have been ironed out. Towards the goal of removing CONFIG_OPTIMIZE_INLINING entirely (and it will simplify the 'inline' macro in compiler_types.h), this commit changes it to always-on option. Going forward, the compiler will always be allowed to not inline functions marked 'inline'. This is not a problem for x86 since it has been long used by arch/x86/configs/{x86_64,i386}_defconfig. I am keeping the config option just in case any problem crops up for other architectures. The code clean-up will be done after confirming this is solid. Link: http://lkml.kernel.org/r/20190830034304.24259-1-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Nick Desaulniers Cc: Ingo Molnar Cc: Borislav Petkov Cc: Miguel Ojeda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6b1b1703a646..93d97f9b0157 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -311,7 +311,7 @@ config HEADERS_CHECK relevant for userspace, say 'Y'. config OPTIMIZE_INLINING - bool "Allow compiler to uninline functions marked 'inline'" + def_bool y help This option determines if the kernel forces gcc to inline the functions developers have marked 'inline'. Doing so takes away freedom from gcc to @@ -322,8 +322,6 @@ config OPTIMIZE_INLINING decision will become the default in the future. Until then this option is there to test gcc for this. - If unsure, say N. - config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" help From 7d92bda271ddcbb2d1be2f82733dcb9bf8378010 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 16:47:45 -0700 Subject: [PATCH 1268/2880] kgdb: don't use a notifier to enter kgdb at panic; call directly Right now kgdb/kdb hooks up to debug panics by registering for the panic notifier. This works OK except that it means that kgdb/kdb gets called _after_ the CPUs in the system are taken offline. That means that if anything important was happening on those CPUs (like something that might have contributed to the panic) you can't debug them. Specifically I ran into a case where I got a panic because a task was "blocked for more than 120 seconds" which was detected on CPU 2. I nicely got shown stack traces in the kernel log for all CPUs including CPU 0, which was running 'PID: 111 Comm: kworker/0:1H' and was in the middle of __mmc_switch(). I then ended up at the kdb prompt where switched over to kgdb to try to look at local variables of the process on CPU 0. I found that I couldn't. Digging more, I found that I had no info on any tasks running on CPUs other than CPU 2 and that asking kdb for help showed me "Error: no saved data for this cpu". This was because all the CPUs were offline. Let's move the entry of kdb/kgdb to a direct call from panic() and stop using the generic notifier. Putting a direct call in allows us to order things more properly and it also doesn't seem like we're breaking any abstractions by calling into the debugger from the panic function. Daniel said: : This patch changes the way kdump and kgdb interact with each other. : However it would seem rather odd to have both tools simultaneously armed : and, even if they were, the user still has the option to use panic_timeout : to force a kdump to happen. Thus I think the change of order is : acceptable. Link: http://lkml.kernel.org/r/20190703170354.217312-1-dianders@chromium.org Signed-off-by: Douglas Anderson Reviewed-by: Daniel Thompson Cc: Jason Wessel Cc: Kees Cook Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Feng Tang Cc: YueHaibing Cc: Sergey Senozhatsky Cc: "Steven Rostedt (VMware)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kgdb.h | 2 ++ kernel/debug/debug_core.c | 33 ++++++++++++--------------------- kernel/panic.c | 8 ++++++++ 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index fbf144aaa749..b072aeb1fd78 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -326,8 +326,10 @@ extern atomic_t kgdb_active; (raw_smp_processor_id() == atomic_read(&kgdb_active)) extern bool dbg_is_early; extern void __init dbg_late_init(void); +extern void kgdb_panic(const char *msg); #else /* ! CONFIG_KGDB */ #define in_dbg_master() (0) #define dbg_late_init() +static inline void kgdb_panic(const char *msg) {} #endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 10f1187b3907..f76d6f77dd5e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -893,29 +893,24 @@ static struct sysrq_key_op sysrq_dbg_op = { }; #endif -static int kgdb_panic_event(struct notifier_block *self, - unsigned long val, - void *data) +void kgdb_panic(const char *msg) { + if (!kgdb_io_module_registered) + return; + /* - * Avoid entering the debugger if we were triggered due to a panic - * We don't want to get stuck waiting for input from user in such case. - * panic_timeout indicates the system should automatically + * We don't want to get stuck waiting for input from user if + * "panic_timeout" indicates the system should automatically * reboot on panic. */ if (panic_timeout) - return NOTIFY_DONE; + return; if (dbg_kdb_mode) - kdb_printf("PANIC: %s\n", (char *)data); - kgdb_breakpoint(); - return NOTIFY_DONE; -} + kdb_printf("PANIC: %s\n", msg); -static struct notifier_block kgdb_panic_event_nb = { - .notifier_call = kgdb_panic_event, - .priority = INT_MAX, -}; + kgdb_breakpoint(); +} void __weak kgdb_arch_late(void) { @@ -965,8 +960,6 @@ static void kgdb_register_callbacks(void) kgdb_arch_late(); register_module_notifier(&dbg_module_load_nb); register_reboot_notifier(&dbg_reboot_notifier); - atomic_notifier_chain_register(&panic_notifier_list, - &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ register_sysrq_key('g', &sysrq_dbg_op); #endif @@ -980,16 +973,14 @@ static void kgdb_register_callbacks(void) static void kgdb_unregister_callbacks(void) { /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any + * When this routine is called KGDB should unregister from + * handlers and clean up, making sure it is not handling any * break exceptions at the time. */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; unregister_reboot_notifier(&dbg_reboot_notifier); unregister_module_notifier(&dbg_module_load_nb); - atomic_notifier_chain_unregister(&panic_notifier_list, - &kgdb_panic_event_nb); kgdb_arch_exit(); #ifdef CONFIG_MAGIC_SYSRQ unregister_sysrq_key('g', &sysrq_dbg_op); diff --git a/kernel/panic.c b/kernel/panic.c index 057540b6eee9..d1ece4c363b9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -219,6 +220,13 @@ void panic(const char *fmt, ...) dump_stack(); #endif + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left + * running on them. + */ + kgdb_panic(buf); + /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. From da036ae147624b70f7d3784ff3a53bd4fda20d2a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 16:47:48 -0700 Subject: [PATCH 1269/2880] scripts/gdb: handle split debug Some systems (like Chrome OS) may use "split debug" for kernel modules. That means that the debug symbols are in a different file than the main elf file. Let's handle that by also searching for debug symbols that end in ".ko.debug". This is a packaging topic. You can take a normal elf file and split the debug out of it using objcopy. Try "man objcopy" and then take a look at the "--only-keep-debug" option. It'll give you a whole recipe for doing splitdebug. The suffix used for the debug symbols is arbitrary. If people have other another suffix besides ".ko.debug" then we could presumably support that too... For portage (which is the packaging system used by Chrome OS) split debug is supported by default (and the suffix is .ko.debug). ...and so in Chrome OS we always get the installed elf files stripped and then the symbols stashed away. At the moment we don't actually use the normal portage magic to do this for the kernel though since it affects our ability to get good stack dumps in the kernel. We instead pass a script as "strip" [1]. [1] https://chromium.googlesource.com/chromiumos/overlays/chromiumos-overlay/+/refs/heads/master/eclass/cros-kernel/strip_splitdebug Link: http://lkml.kernel.org/r/20190730234052.148744-1-dianders@chromium.org Signed-off-by: Douglas Anderson Reviewed-by: Stephen Boyd Reviewed-by: Jan Kiszka Cc: Kieran Bingham Cc: Jason Wessel Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/gdb/linux/symbols.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 2f5b95f09fa0..34e40e96dee2 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -77,12 +77,12 @@ lx-symbols command.""" gdb.write("scanning for modules in {0}\n".format(path)) for root, dirs, files in os.walk(path): for name in files: - if name.endswith(".ko"): + if name.endswith(".ko") or name.endswith(".ko.debug"): self.module_files.append(root + "/" + name) self.module_files_updated = True def _get_module_file(self, module_name): - module_pattern = ".*/{0}\.ko$".format( + module_pattern = ".*/{0}\.ko(?:.debug)?$".format( module_name.replace("_", r"[_\-]")) for name in self.module_files: if re.match(module_pattern, name) and os.path.exists(name): From ee8711336c51708382627ebcaee5f2122b77dfef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:52 -0700 Subject: [PATCH 1270/2880] bug: refactor away warn_slowpath_fmt_taint() Patch series "Clean up WARN() "cut here" handling", v2. Christophe Leroy noticed that the fix for missing "cut here" in the WARN() case was adding explicit printk() calls instead of teaching the exception handler to add it. This refactors the bug/warn infrastructure to pass this information as a new BUGFLAG. Longer details repeated from the last patch in the series: bug: move WARN_ON() "cut here" into exception handler The original cleanup of "cut here" missed the WARN_ON() case (that does not have a printk message), which was fixed recently by adding an explicit printk of "cut here". This had the downside of adding a printk() to every WARN_ON() caller, which reduces the utility of using an instruction exception to streamline the resulting code. By making this a new BUGFLAG, all of these can be removed and "cut here" can be handled by the exception handler. This was very pronounced on PowerPC, but the effect can be seen on x86 as well. The resulting text size of a defconfig build shows some small savings from this patch: text data bss dec hex filename 19691167 5134320 1646664 26472151 193eed7 vmlinux.before 19676362 5134260 1663048 26473670 193f4c6 vmlinux.after This change also opens the door for creating something like BUG_MSG(), where a custom printk() before issuing BUG(), without confusing the "cut here" line. This patch (of 7): There's no reason to have specialized helpers for passing the warn taint down to __warn(). Consolidate and refactor helper macros, removing __WARN_printf() and warn_slowpath_fmt_taint(). Link: http://lkml.kernel.org/r/20190819234111.9019-2-keescook@chromium.org Signed-off-by: Kees Cook Cc: Christophe Leroy Cc: Peter Zijlstra Cc: Christophe Leroy Cc: Drew Davenport Cc: Arnd Bergmann Cc: "Steven Rostedt (VMware)" Cc: Feng Tang Cc: Petr Mladek Cc: Mauro Carvalho Chehab Cc: Borislav Petkov Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 13 ++++--------- kernel/panic.c | 18 +++--------------- 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7357a3c942a0..c3a9c16a2b69 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -90,24 +90,19 @@ struct bug_entry { * Use the versions with printk format strings to provide better diagnostics. */ #ifndef __WARN_TAINT -extern __printf(3, 4) -void warn_slowpath_fmt(const char *file, const int line, - const char *fmt, ...); extern __printf(4, 5) -void warn_slowpath_fmt_taint(const char *file, const int line, unsigned taint, - const char *fmt, ...); +void warn_slowpath_fmt(const char *file, const int line, unsigned taint, + const char *fmt, ...); extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH #define __WARN() warn_slowpath_null(__FILE__, __LINE__) -#define __WARN_printf(arg...) warn_slowpath_fmt(__FILE__, __LINE__, arg) #define __WARN_printf_taint(taint, arg...) \ - warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg) + warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define __WARN() do { \ printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ } while (0) -#define __WARN_printf(arg...) __WARN_printf_taint(TAINT_WARN, arg) #define __WARN_printf_taint(taint, arg...) \ do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) #endif @@ -132,7 +127,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf(format); \ + __WARN_printf_taint(TAINT_WARN, format); \ unlikely(__ret_warn_on); \ }) #endif diff --git a/kernel/panic.c b/kernel/panic.c index d1ece4c363b9..1d89f5423426 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -600,20 +600,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, } #ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) -{ - struct warn_args args; - - args.fmt = fmt; - va_start(args.args, fmt); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, - &args); - va_end(args.args); -} -EXPORT_SYMBOL(warn_slowpath_fmt); - -void warn_slowpath_fmt_taint(const char *file, int line, - unsigned taint, const char *fmt, ...) +void warn_slowpath_fmt(const char *file, int line, unsigned taint, + const char *fmt, ...) { struct warn_args args; @@ -622,7 +610,7 @@ void warn_slowpath_fmt_taint(const char *file, int line, __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } -EXPORT_SYMBOL(warn_slowpath_fmt_taint); +EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_null(const char *file, int line) { From 89348fc31441f0270b040fbeb68b6d7d13504f36 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:55 -0700 Subject: [PATCH 1271/2880] bug: rename __WARN_printf_taint() to __WARN_printf() This just renames the helper to improve readability. Link: http://lkml.kernel.org/r/20190819234111.9019-3-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index c3a9c16a2b69..2d35bbf687d0 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -96,14 +96,14 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH #define __WARN() warn_slowpath_null(__FILE__, __LINE__) -#define __WARN_printf_taint(taint, arg...) \ +#define __WARN_printf(taint, arg...) \ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define __WARN() do { \ printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ } while (0) -#define __WARN_printf_taint(taint, arg...) \ +#define __WARN_printf(taint, arg...) \ do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) #endif @@ -127,7 +127,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf_taint(TAINT_WARN, format); \ + __WARN_printf(TAINT_WARN, format); \ unlikely(__ret_warn_on); \ }) #endif @@ -135,7 +135,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN_TAINT(condition, taint, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf_taint(taint, format); \ + __WARN_printf(taint, format); \ unlikely(__ret_warn_on); \ }) From f2f84b05e02b7710a201f0017b3272ad7ef703d1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:58 -0700 Subject: [PATCH 1272/2880] bug: consolidate warn_slowpath_fmt() usage Instead of having a separate helper for no printk output, just consolidate the logic into warn_slowpath_fmt(). Link: http://lkml.kernel.org/r/20190819234111.9019-4-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 3 +-- kernel/panic.c | 14 +++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 2d35bbf687d0..598d7072602f 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -93,9 +93,8 @@ struct bug_entry { extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); -extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH -#define __WARN() warn_slowpath_null(__FILE__, __LINE__) +#define __WARN() __WARN_printf(TAINT_WARN, NULL) #define __WARN_printf(taint, arg...) \ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else diff --git a/kernel/panic.c b/kernel/panic.c index 1d89f5423426..79c153951b59 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,19 +605,19 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, { struct warn_args args; + if (!fmt) { + pr_warn(CUT_HERE); + __warn(file, line, __builtin_return_address(0), taint, + NULL, NULL); + return; + } + args.fmt = fmt; va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); - -void warn_slowpath_null(const char *file, int line) -{ - pr_warn(CUT_HERE); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); -} -EXPORT_SYMBOL(warn_slowpath_null); #else void __warn_printk(const char *fmt, ...) { From d38aba49a9f72b862f1220739ca837c886fdc319 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:01 -0700 Subject: [PATCH 1273/2880] bug: lift "cut here" out of __warn() In preparation for cleaning up "cut here", move the "cut here" logic up out of __warn() and into callers that pass non-NULL args. For anyone looking closely, there are two callers that pass NULL args: one already explicitly prints "cut here". The remaining case is covered by how a WARN is built, which will be cleaned up in the next patch. Link: http://lkml.kernel.org/r/20190819234111.9019-5-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 79c153951b59..a643e5464296 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -559,9 +559,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - if (args) - pr_warn(CUT_HERE); - if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, @@ -605,8 +602,9 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, { struct warn_args args; + pr_warn(CUT_HERE); + if (!fmt) { - pr_warn(CUT_HERE); __warn(file, line, __builtin_return_address(0), taint, NULL, NULL); return; From d4bce140b4e739bceb4e239d4842cf8f346c1e0f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:04 -0700 Subject: [PATCH 1274/2880] bug: clean up helper macros to remove __WARN_TAINT() In preparation for cleaning up "cut here" even more, this removes the __WARN_*TAINT() helpers, as they limit the ability to add new BUGFLAG_* flags to call sites. They are removed by expanding them into full __WARN_FLAGS() calls. Link: http://lkml.kernel.org/r/20190819234111.9019-6-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 598d7072602f..4b18e09094cf 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -62,13 +62,11 @@ struct bug_entry { #endif #ifdef __WARN_FLAGS -#define __WARN_TAINT(taint) __WARN_FLAGS(BUGFLAG_TAINT(taint)) -#define __WARN_ONCE_TAINT(taint) __WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint)) - #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_ONCE_TAINT(TAINT_WARN); \ + __WARN_FLAGS(BUGFLAG_ONCE | \ + BUGFLAG_TAINT(TAINT_WARN)); \ unlikely(__ret_warn_on); \ }) #endif @@ -89,7 +87,7 @@ struct bug_entry { * * Use the versions with printk format strings to provide better diagnostics. */ -#ifndef __WARN_TAINT +#ifndef __WARN_FLAGS extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); @@ -99,11 +97,14 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#define __WARN() do { \ - printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ -} while (0) -#define __WARN_printf(taint, arg...) \ - do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) +#define __WARN() do { \ + printk(KERN_WARNING CUT_HERE); \ + __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)); \ + } while (0) +#define __WARN_printf(taint, arg...) do { \ + __warn_printk(arg); \ + __WARN_FLAGS(BUGFLAG_TAINT(taint)); \ + } while (0) #endif /* used internally by panic.c */ From 2da1ead4d5f7fa5f61e5805655de1e245d03a763 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:08 -0700 Subject: [PATCH 1275/2880] bug: consolidate __WARN_FLAGS usage Instead of having separate tests for __WARN_FLAGS, merge the two #ifdef blocks and replace the synonym WANT_WARN_ON_SLOWPATH macro. Link: http://lkml.kernel.org/r/20190819234111.9019-7-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 18 +++++++----------- kernel/panic.c | 2 +- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 4b18e09094cf..b4a2639130a0 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -61,16 +61,6 @@ struct bug_entry { #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) #endif -#ifdef __WARN_FLAGS -#define WARN_ON_ONCE(condition) ({ \ - int __ret_warn_on = !!(condition); \ - if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS(BUGFLAG_ONCE | \ - BUGFLAG_TAINT(TAINT_WARN)); \ - unlikely(__ret_warn_on); \ -}) -#endif - /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report * significant kernel issues that need prompt attention if they should ever @@ -91,7 +81,6 @@ struct bug_entry { extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); -#define WANT_WARN_ON_SLOWPATH #define __WARN() __WARN_printf(TAINT_WARN, NULL) #define __WARN_printf(taint, arg...) \ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) @@ -105,6 +94,13 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); __warn_printk(arg); \ __WARN_FLAGS(BUGFLAG_TAINT(taint)); \ } while (0) +#define WARN_ON_ONCE(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_FLAGS(BUGFLAG_ONCE | \ + BUGFLAG_TAINT(TAINT_WARN)); \ + unlikely(__ret_warn_on); \ +}) #endif /* used internally by panic.c */ diff --git a/kernel/panic.c b/kernel/panic.c index a643e5464296..47e8ebccc22b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -596,7 +596,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, add_taint(taint, LOCKDEP_STILL_OK); } -#ifdef WANT_WARN_ON_SLOWPATH +#ifndef __WARN_FLAGS void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { From a44f71a9ab99b509fec9d5a9f5c222debd89934f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:11 -0700 Subject: [PATCH 1276/2880] bug: move WARN_ON() "cut here" into exception handler The original clean up of "cut here" missed the WARN_ON() case (that does not have a printk message), which was fixed recently by adding an explicit printk of "cut here". This had the downside of adding a printk() to every WARN_ON() caller, which reduces the utility of using an instruction exception to streamline the resulting code. By making this a new BUGFLAG, all of these can be removed and "cut here" can be handled by the exception handler. This was very pronounced on PowerPC, but the effect can be seen on x86 as well. The resulting text size of a defconfig build shows some small savings from this patch: text data bss dec hex filename 19691167 5134320 1646664 26472151 193eed7 vmlinux.before 19676362 5134260 1663048 26473670 193f4c6 vmlinux.after This change also opens the door for creating something like BUG_MSG(), where a custom printk() before issuing BUG(), without confusing the "cut here" line. Link: http://lkml.kernel.org/r/201908200943.601DD59DCE@keescook Fixes: 6b15f678fb7d ("include/asm-generic/bug.h: fix "cut here" for WARN_ON for __WARN_TAINT architectures") Signed-off-by: Kees Cook Reported-by: Christophe Leroy Cc: Peter Zijlstra Cc: Christophe Leroy Cc: Drew Davenport Cc: Arnd Bergmann Cc: "Steven Rostedt (VMware)" Cc: Feng Tang Cc: Petr Mladek Cc: Mauro Carvalho Chehab Cc: Borislav Petkov Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 8 +++----- lib/bug.c | 11 +++++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index b4a2639130a0..384b5c835ced 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -10,6 +10,7 @@ #define BUGFLAG_WARNING (1 << 0) #define BUGFLAG_ONCE (1 << 1) #define BUGFLAG_DONE (1 << 2) +#define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */ #define BUGFLAG_TAINT(taint) ((taint) << 8) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif @@ -86,13 +87,10 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#define __WARN() do { \ - printk(KERN_WARNING CUT_HERE); \ - __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)); \ - } while (0) +#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)) #define __WARN_printf(taint, arg...) do { \ __warn_printk(arg); \ - __WARN_FLAGS(BUGFLAG_TAINT(taint)); \ + __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ } while (0) #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ diff --git a/lib/bug.c b/lib/bug.c index 1077366f496b..8c98af0bf585 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -181,6 +181,15 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) } } + /* + * BUG() and WARN_ON() families don't print a custom debug message + * before triggering the exception handler, so we must add the + * "cut here" line now. WARN() issues its own "cut here" before the + * extra debugging message it writes before triggering the handler. + */ + if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0) + printk(KERN_DEFAULT CUT_HERE); + if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ __warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs, @@ -188,8 +197,6 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_DEFAULT CUT_HERE); - if (file) pr_crit("kernel BUG at %s:%u!\n", file, line); else From 97b0b1ad58fab9f71b1a6bc056a09af6065ec3bc Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 25 Sep 2019 16:48:14 -0700 Subject: [PATCH 1277/2880] ipc/mqueue.c: delete an unnecessary check before the macro call dev_kfree_skb() dev_kfree_skb() input parameter validation, thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Link: http://lkml.kernel.org/r/07477187-63e5-cc80-34c1-32dd16b38e12@web.de Signed-off-by: Markus Elfring Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 7c15729d9d25..b02eb842b42e 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1333,7 +1333,7 @@ out_fput: out: if (sock) netlink_detachskb(sock, nc); - else if (nc) + else dev_kfree_skb(nc); return ret; From c231740dd95e854de5034cff8f49737d942bc098 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 25 Sep 2019 16:48:17 -0700 Subject: [PATCH 1278/2880] ipc/mqueue: improve exception handling in do_mq_notify() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Null pointers were assigned to local variables in a few cases as exception handling. The jump target “out” was used where no meaningful data processing actions should eventually be performed by branches of an if statement then. Use an additional jump target for calling dev_kfree_skb() directly. Return also directly after error conditions were detected when no extra clean-up is needed by this function implementation. Link: http://lkml.kernel.org/r/592ef10e-0b69-72d0-9789-fc48f638fdfd@web.de Signed-off-by: Markus Elfring Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index b02eb842b42e..3d920ff15c80 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1240,15 +1240,14 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) /* create the notify skb */ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); - if (!nc) { - ret = -ENOMEM; - goto out; - } + if (!nc) + return -ENOMEM; + if (copy_from_user(nc->data, notification->sigev_value.sival_ptr, NOTIFY_COOKIE_LEN)) { ret = -EFAULT; - goto out; + goto free_skb; } /* TODO: add a header? */ @@ -1264,8 +1263,7 @@ retry: fdput(f); if (IS_ERR(sock)) { ret = PTR_ERR(sock); - sock = NULL; - goto out; + goto free_skb; } timeo = MAX_SCHEDULE_TIMEOUT; @@ -1274,11 +1272,8 @@ retry: sock = NULL; goto retry; } - if (ret) { - sock = NULL; - nc = NULL; - goto out; - } + if (ret) + return ret; } } @@ -1334,6 +1329,7 @@ out: if (sock) netlink_detachskb(sock, nc); else +free_skb: dev_kfree_skb(nc); return ret; From 984035ad7b247ccc62b06e113eea3fc673f114cc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 25 Sep 2019 16:48:20 -0700 Subject: [PATCH 1279/2880] ipc/sem.c: convert to use built-in RCU list checking CONFIG_PROVE_RCU_LIST requires list_for_each_entry_rcu() to pass a lockdep expression if using srcu or locking for protection. It can only check regular RCU protection, all other protection needs to be passed as lockdep expression. Link: http://lkml.kernel.org/r/20190830231817.76862-2-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Cc: Arnd Bergmann Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: "Gustavo A. R. Silva" Cc: Jonathan Derrick Cc: Keith Busch Cc: Lorenzo Pieralisi Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ipc/sem.c b/ipc/sem.c index 7da4504bcc7c..ec97a7072413 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1852,7 +1852,8 @@ static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; - list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) { + list_for_each_entry_rcu(un, &ulp->list_proc, list_proc, + spin_is_locked(&ulp->lock)) { if (un->semid == semid) return un; } From 09b35b4192f6682dff96a093ab1930998cdb73b4 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 25 Sep 2019 16:48:24 -0700 Subject: [PATCH 1280/2880] lib/lzo/lzo1x_compress.c: fix alignment bug in lzo-rle Fix an unaligned access which breaks on platforms where this is not permitted (e.g., Sparc). Link: http://lkml.kernel.org/r/20190912145502.35229-1-dave.rodgman@arm.com Signed-off-by: Dave Rodgman Cc: Dave Rodgman Cc: Markus F.X.J. Oberhumer Cc: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/lzo/lzo1x_compress.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c index ba16c08e8cb9..717c940112f9 100644 --- a/lib/lzo/lzo1x_compress.c +++ b/lib/lzo/lzo1x_compress.c @@ -83,17 +83,19 @@ next: ALIGN((uintptr_t)ir, 4)) && (ir < limit) && (*ir == 0)) ir++; - for (; (ir + 4) <= limit; ir += 4) { - dv = *((u32 *)ir); - if (dv) { + if (IS_ALIGNED((uintptr_t)ir, 4)) { + for (; (ir + 4) <= limit; ir += 4) { + dv = *((u32 *)ir); + if (dv) { # if defined(__LITTLE_ENDIAN) - ir += __builtin_ctz(dv) >> 3; + ir += __builtin_ctz(dv) >> 3; # elif defined(__BIG_ENDIAN) - ir += __builtin_clz(dv) >> 3; + ir += __builtin_clz(dv) >> 3; # else # error "missing endian definition" # endif - break; + break; + } } } #endif From 903f433f8f7a33e292a319259483adece8cc6674 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:27 -0700 Subject: [PATCH 1281/2880] lib: untag user pointers in strn*_user Patch series "arm64: untag user pointers passed to the kernel", v19. === Overview arm64 has a feature called Top Byte Ignore, which allows to embed pointer tags into the top byte of each pointer. Userspace programs (such as HWASan, a memory debugging tool [1]) might use this feature and pass tagged user pointers to the kernel through syscalls or other interfaces. Right now the kernel is already able to handle user faults with tagged pointers, due to these patches: 1. 81cddd65 ("arm64: traps: fix userspace cache maintenance emulation on a tagged pointer") 2. 7dcd9dd8 ("arm64: hw_breakpoint: fix watchpoint matching for tagged pointers") 3. 276e9327 ("arm64: entry: improve data abort handling of tagged pointers") This patchset extends tagged pointer support to syscall arguments. As per the proposed ABI change [3], tagged pointers are only allowed to be passed to syscalls when they point to memory ranges obtained by anonymous mmap() or sbrk() (see the patchset [3] for more details). For non-memory syscalls this is done by untaging user pointers when the kernel performs pointer checking to find out whether the pointer comes from userspace (most notably in access_ok). The untagging is done only when the pointer is being checked, the tag is preserved as the pointer makes its way through the kernel and stays tagged when the kernel dereferences the pointer when perfoming user memory accesses. The mmap and mremap (only new_addr) syscalls do not currently accept tagged addresses. Architectures may interpret the tag as a background colour for the corresponding vma. Other memory syscalls (mprotect, etc.) don't do user memory accesses but rather deal with memory ranges, and untagged pointers are better suited to describe memory ranges internally. Thus for memory syscalls we untag pointers completely when they enter the kernel. === Other approaches One of the alternative approaches to untagging that was considered is to completely strip the pointer tag as the pointer enters the kernel with some kind of a syscall wrapper, but that won't work with the countless number of different ioctl calls. With this approach we would need a custom wrapper for each ioctl variation, which doesn't seem practical. An alternative approach to untagging pointers in memory syscalls prologues is to inspead allow tagged pointers to be passed to find_vma() (and other vma related functions) and untag them there. Unfortunately, a lot of find_vma() callers then compare or subtract the returned vma start and end fields against the pointer that was being searched. Thus this approach would still require changing all find_vma() callers. === Testing The following testing approaches has been taken to find potential issues with user pointer untagging: 1. Static testing (with sparse [2] and separately with a custom static analyzer based on Clang) to track casts of __user pointers to integer types to find places where untagging needs to be done. 2. Static testing with grep to find parts of the kernel that call find_vma() (and other similar functions) or directly compare against vm_start/vm_end fields of vma. 3. Static testing with grep to find parts of the kernel that compare user pointers with TASK_SIZE or other similar consts and macros. 4. Dynamic testing: adding BUG_ON(has_tag(addr)) to find_vma() and running a modified syzkaller version that passes tagged pointers to the kernel. Based on the results of the testing the requried patches have been added to the patchset. === Notes This patchset is meant to be merged together with "arm64 relaxed ABI" [3]. This patchset is a prerequisite for ARM's memory tagging hardware feature support [4]. This patchset has been merged into the Pixel 2 & 3 kernel trees and is now being used to enable testing of Pixel phones with HWASan. Thanks! [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html [2] https://github.com/lucvoo/sparse-dev/commit/5f960cb10f56ec2017c128ef9d16060e0145f292 [3] https://lkml.org/lkml/2019/6/12/745 [4] https://community.arm.com/processors/b/blog/posts/arm-a-profile-architecture-2018-developments-armv85a This patch (of 11) This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. strncpy_from_user and strnlen_user accept user addresses as arguments, and do not go through the same path as copy_from_user and others, so here we need to handle the case of tagged user addresses separately. Untag user pointers passed to these functions. Note, that this patch only temporarily untags the pointers to perform validity checks, but then uses them as is to perform user memory accesses. [andreyknvl@google.com: fix sparc4 build] Link: http://lkml.kernel.org/r/CAAeHK+yx4a-P0sDrXTUxMvO2H0CJZUFPffBrg_cU7oJOZyC7ew@mail.gmail.com Link: http://lkml.kernel.org/r/c5a78bcad3e94d6cda71fcaa60a423231ae71e4c.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Vincenzo Frascino Reviewed-by: Khalid Aziz Acked-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 5 +++-- lib/strncpy_from_user.c | 3 ++- lib/strnlen_user.c | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index b57f9c631eca..6ec514fe3bef 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1078,7 +1078,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, } #define io_remap_pfn_range io_remap_pfn_range -static inline unsigned long untagged_addr(unsigned long start) +static inline unsigned long __untagged_addr(unsigned long start) { if (adi_capable()) { long addr = start; @@ -1098,7 +1098,8 @@ static inline unsigned long untagged_addr(unsigned long start) return start; } -#define untagged_addr untagged_addr +#define untagged_addr(addr) \ + ((__typeof__(addr))(__untagged_addr((unsigned long)(addr))) static inline bool pte_access_permitted(pte_t pte, bool write) { diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 023ba9f3b99f..dccb95af6003 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -108,7 +109,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) return 0; max_addr = user_addr_max(); - src_addr = (unsigned long)src; + src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 7f2db3fe311f..28ff554a1be8 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -109,7 +110,7 @@ long strnlen_user(const char __user *str, long count) return 0; max_addr = user_addr_max(); - src_addr = (unsigned long)str; + src_addr = (unsigned long)untagged_addr(str); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; From 057d3389108eda8a20c7f496f011846932680d88 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:30 -0700 Subject: [PATCH 1282/2880] mm: untag user pointers passed to memory syscalls This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. This patch allows tagged pointers to be passed to the following memory syscalls: get_mempolicy, madvise, mbind, mincore, mlock, mlock2, mprotect, mremap, msync, munlock, move_pages. The mmap and mremap syscalls do not currently accept tagged addresses. Architectures may interpret the tag as a background colour for the corresponding vma. Link: http://lkml.kernel.org/r/aaf0c0969d46b2feb9017f3e1b3ef3970b633d91.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 2 ++ mm/mempolicy.c | 3 +++ mm/migrate.c | 2 +- mm/mincore.c | 2 ++ mm/mlock.c | 4 ++++ mm/mprotect.c | 2 ++ mm/mremap.c | 7 +++++++ mm/msync.c | 2 ++ 8 files changed, 23 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 68ab988ad433..1f8a6fdc6878 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -784,6 +784,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) size_t len; struct blk_plug plug; + start = untagged_addr(start); + if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 464406e8da91..de27d08b1ff8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1405,6 +1405,7 @@ static long kernel_mbind(unsigned long start, unsigned long len, int err; unsigned short mode_flags; + start = untagged_addr(start); mode_flags = mode & MPOL_MODE_FLAGS; mode &= ~MPOL_MODE_FLAGS; if (mode >= MPOL_MAX) @@ -1558,6 +1559,8 @@ static int kernel_get_mempolicy(int __user *policy, int uninitialized_var(pval); nodemask_t nodes; + addr = untagged_addr(addr); + if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; diff --git a/mm/migrate.c b/mm/migrate.c index 73d476d690b1..4fe45d1428c8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1612,7 +1612,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, goto out_flush; if (get_user(node, nodes + i)) goto out_flush; - addr = (unsigned long)p; + addr = (unsigned long)untagged_addr(p); err = -ENODEV; if (node < 0 || node >= MAX_NUMNODES) diff --git a/mm/mincore.c b/mm/mincore.c index f9a9dbe8cd33..49b6fa2f6aa1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -256,6 +256,8 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, unsigned long pages; unsigned char *tmp; + start = untagged_addr(start); + /* Check the start address: needs to be page-aligned.. */ if (start & ~PAGE_MASK) return -EINVAL; diff --git a/mm/mlock.c b/mm/mlock.c index a90099da4fb4..a72c1eeded77 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -674,6 +674,8 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla unsigned long lock_limit; int error = -ENOMEM; + start = untagged_addr(start); + if (!can_do_mlock()) return -EPERM; @@ -735,6 +737,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; + start = untagged_addr(start); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; diff --git a/mm/mprotect.c b/mm/mprotect.c index 675e5d34a507..7967825f6d33 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -459,6 +459,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + start = untagged_addr(start); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL; diff --git a/mm/mremap.c b/mm/mremap.c index fc241d23cd97..64c9a3b8be0a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -606,6 +606,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); + /* + * Architectures may interpret the tag passed to mmap as a background + * colour for the corresponding vma. For mremap we don't allow tagged + * new_addr to preserve similar behaviour to mmap. + */ + addr = untagged_addr(addr); + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; diff --git a/mm/msync.c b/mm/msync.c index ef30a429623a..c3bd3e75f687 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -37,6 +37,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) int unmapped_error = 0; int error = -EINVAL; + start = untagged_addr(start); + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; if (offset_in_page(start)) From f9652594195fca8c3d8b8ee392ad0ff9f701bb20 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:34 -0700 Subject: [PATCH 1283/2880] mm: untag user pointers in mm/gup.c This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. mm/gup.c provides a kernel interface that accepts user addresses and manipulates user pages directly (for example get_user_pages, that is used by the futex syscall). Since a user can provided tagged addresses, we need to handle this case. Add untagging to gup.c functions that use user addresses for vma lookups. Link: http://lkml.kernel.org/r/4731bddba3c938658c10ff4ed55cc01c60f4c8f8.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Vincenzo Frascino Reviewed-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index 60c3915c8ee6..23a9f9c9d377 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -788,6 +788,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!nr_pages) return 0; + start = untagged_addr(start); + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* @@ -950,6 +952,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vm_fault_t ret, major = 0; + address = untagged_addr(address); + if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY; From 5d65e7a7d8cd5c77baa1acf129a11b8b45ffee75 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:37 -0700 Subject: [PATCH 1284/2880] mm: untag user pointers in get_vaddr_frames This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. get_vaddr_frames uses provided user pointers for vma lookups, which can only by done with untagged pointers. Instead of locating and changing all callers of this function, perform untagging in it. Link: http://lkml.kernel.org/r/28f05e49c92b2a69c4703323d6c12208f3d881fe.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Vincenzo Frascino Acked-by: Catalin Marinas Reviewed-by: Kees Cook Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frame_vector.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c64dca6e27c2..c431ca81dad5 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -46,6 +46,8 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; + start = untagged_addr(start); + down_read(&mm->mmap_sem); locked = 1; vma = find_vma_intersection(mm, start, start + 1); From ed8a66b83269c27f7181c95b477da5d33fecfbc4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:40 -0700 Subject: [PATCH 1285/2880] fs/namespace: untag user pointers in copy_mount_options This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. In copy_mount_options a user address is being subtracted from TASK_SIZE. If the address is lower than TASK_SIZE, the size is calculated to not allow the exact_copy_from_user() call to cross TASK_SIZE boundary. However if the address is tagged, then the size will be calculated incorrectly. Untag the address before subtracting. Link: http://lkml.kernel.org/r/1de225e4a54204bfd7f25dac2635e31aa4aa1d90.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Vincenzo Frascino Reviewed-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 93c043245c46..abcdc5f44865 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3028,7 +3028,7 @@ void *copy_mount_options(const void __user * data) * the remainder of the page. */ /* copy_from_user cannot cross TASK_SIZE ! */ - size = TASK_SIZE - (unsigned long)data; + size = TASK_SIZE - (unsigned long)untagged_addr(data); if (size > PAGE_SIZE) size = PAGE_SIZE; From 7d0325749a6c77b075424ab9de76bcb73a118430 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:44 -0700 Subject: [PATCH 1286/2880] userfaultfd: untag user pointers This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. userfaultfd code use provided user pointers for vma lookups, which can only by done with untagged pointers. Untag user pointers in validate_range(). Link: http://lkml.kernel.org/r/cdc59ddd7011012ca2e689bc88c3b65b1ea7e413.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Mike Rapoport Reviewed-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Khalid Aziz Cc: Mauro Carvalho Chehab Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index fe6d804a38dc..f9fd18670e22 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1272,21 +1272,23 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, } static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) + __u64 *start, __u64 len) { __u64 task_size = mm->task_size; - if (start & ~PAGE_MASK) + *start = untagged_addr(*start); + + if (*start & ~PAGE_MASK) return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; - if (start < mmap_min_addr) + if (*start < mmap_min_addr) return -EINVAL; - if (start >= task_size) + if (*start >= task_size) return -EINVAL; - if (len > task_size - start) + if (len > task_size - *start) return -EINVAL; return 0; } @@ -1336,7 +1338,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; } - ret = validate_range(mm, uffdio_register.range.start, + ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); if (ret) goto out; @@ -1525,7 +1527,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) goto out; - ret = validate_range(mm, uffdio_unregister.start, + ret = validate_range(mm, &uffdio_unregister.start, uffdio_unregister.len); if (ret) goto out; @@ -1676,7 +1678,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) goto out; - ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len); if (ret) goto out; @@ -1716,7 +1718,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; /* @@ -1772,7 +1774,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, sizeof(uffdio_zeropage)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + ret = validate_range(ctx->mm, &uffdio_zeropage.range.start, uffdio_zeropage.range.len); if (ret) goto out; From 35f3fc87bebfb2fffc1a9cdaf661ee3f95c2a5f1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:47 -0700 Subject: [PATCH 1287/2880] drm/amdgpu: untag user pointers This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. In amdgpu_gem_userptr_ioctl() and amdgpu_amdkfd_gpuvm.c/init_user_pages() an MMU notifier is set up with a (tagged) userspace pointer. The untagged address should be used so that MMU notifiers for the untagged address get correctly matched up with the right BO. This patch untag user pointers in amdgpu_gem_userptr_ioctl() for the GEM case and in amdgpu_amdkfd_gpuvm_ alloc_memory_of_gpu() for the KFD case. This also makes sure that an untagged pointer is passed to amdgpu_ttm_tt_get_user_pages(), which uses it for vma lookups. Link: http://lkml.kernel.org/r/d684e1df08f2ecb6bc292e222b64fa9efbc26e69.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Kees Cook Suggested-by: Felix Kuehling Acked-by: Felix Kuehling Cc: Al Viro Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Auger Cc: Jens Wiklander Cc: Khalid Aziz Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 42b936b6bbf1..6d021ecc8d59 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1103,7 +1103,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = 0; if (!offset || !*offset) return -EINVAL; - user_addr = *offset; + user_addr = untagged_addr(*offset); } else if (flags & (ALLOC_MEM_FLAGS_DOORBELL | ALLOC_MEM_FLAGS_MMIO_REMAP)) { domain = AMDGPU_GEM_DOMAIN_GTT; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index b174bd5eb38e..8ceb44925947 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -291,6 +291,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data, uint32_t handle; int r; + args->addr = untagged_addr(args->addr); + if (offset_in_page(args->addr | args->size)) return -EINVAL; From 4fdfae8d8f855d79b7d83fcd590b6ac7ed0099cf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:51 -0700 Subject: [PATCH 1288/2880] drm/radeon: untag user pointers in radeon_gem_userptr_ioctl This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. In radeon_gem_userptr_ioctl() an MMU notifier is set up with a (tagged) userspace pointer. The untagged address should be used so that MMU notifiers for the untagged address get correctly matched up with the right BO. This funcation also calls radeon_ttm_tt_pin_userptr(), which uses provided user pointers for vma lookups, which can only by done with untagged pointers. This patch untags user pointers in radeon_gem_userptr_ioctl(). Link: http://lkml.kernel.org/r/c856babeb67195b35603b8d5ba386a2819cec5ff.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Kees Cook Suggested-by: Felix Kuehling Acked-by: Felix Kuehling Cc: Al Viro Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Auger Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/radeon/radeon_gem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 4cf58dbbe439..b2b076606f54 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -296,6 +296,8 @@ int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, uint32_t handle; int r; + args->addr = untagged_addr(args->addr); + if (offset_in_page(args->addr | args->size)) return -EINVAL; From e275faf367e3a3b9db06a71924b199f429d3d508 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:54 -0700 Subject: [PATCH 1289/2880] media/v4l2-core: untag user pointers in videobuf_dma_contig_user_get This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. videobuf_dma_contig_user_get() uses provided user pointers for vma lookups, which can only by done with untagged pointers. Untag the pointers in this function. Link: http://lkml.kernel.org/r/100436d5f8e4349a78f27b0bbb27e4801fcb946b.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Kees Cook Acked-by: Mauro Carvalho Chehab Cc: Al Viro Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mike Rapoport Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/media/v4l2-core/videobuf-dma-contig.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/media/v4l2-core/videobuf-dma-contig.c b/drivers/media/v4l2-core/videobuf-dma-contig.c index 76b4ac7b1678..aeb2f497c683 100644 --- a/drivers/media/v4l2-core/videobuf-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf-dma-contig.c @@ -157,6 +157,7 @@ static void videobuf_dma_contig_user_put(struct videobuf_dma_contig_memory *mem) static int videobuf_dma_contig_user_get(struct videobuf_dma_contig_memory *mem, struct videobuf_buffer *vb) { + unsigned long untagged_baddr = untagged_addr(vb->baddr); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long prev_pfn, this_pfn; @@ -164,22 +165,22 @@ static int videobuf_dma_contig_user_get(struct videobuf_dma_contig_memory *mem, unsigned int offset; int ret; - offset = vb->baddr & ~PAGE_MASK; + offset = untagged_baddr & ~PAGE_MASK; mem->size = PAGE_ALIGN(vb->size + offset); ret = -EINVAL; down_read(&mm->mmap_sem); - vma = find_vma(mm, vb->baddr); + vma = find_vma(mm, untagged_baddr); if (!vma) goto out_up; - if ((vb->baddr + mem->size) > vma->vm_end) + if ((untagged_baddr + mem->size) > vma->vm_end) goto out_up; pages_done = 0; prev_pfn = 0; /* kill warning */ - user_address = vb->baddr; + user_address = untagged_baddr; while (pages_done < (mem->size >> PAGE_SHIFT)) { ret = follow_pfn(vma, user_address, &this_pfn); From 78063a9dd9637c0450cf6eacc03f42eb1295917f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:58 -0700 Subject: [PATCH 1290/2880] tee/shm: untag user pointers in tee_shm_register This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. tee_shm_register()->optee_shm_unregister()->check_mem_type() uses provided user pointers for vma lookups (via __check_mem_type()), which can only by done with untagged pointers. Untag user pointers in this function. Link: http://lkml.kernel.org/r/4b993f33196b3566ac81285ff8453219e2079b45.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Kees Cook Acked-by: Jens Wiklander Cc: Al Viro Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Khalid Aziz Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tee/tee_shm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 2da026fd12c9..09ddcd06c715 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -254,6 +254,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, shm->teedev = teedev; shm->ctx = ctx; shm->id = -1; + addr = untagged_addr(addr); start = rounddown(addr, PAGE_SIZE); shm->offset = addr - start; shm->size = length; From 6cf5354c1c4b74fd2e5527db084f163e9d4dae4e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:49:01 -0700 Subject: [PATCH 1291/2880] vfio/type1: untag user pointers in vaddr_get_pfn This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. vaddr_get_pfn() uses provided user pointers for vma lookups, which can only by done with untagged pointers. Untag user pointers in this function. Link: http://lkml.kernel.org/r/87422b4d72116a975896f2b19b00f38acbd28f33.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Eric Auger Reviewed-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Cc: Dave Hansen Cc: Will Deacon Cc: Al Viro Cc: Felix Kuehling Cc: Jens Wiklander Cc: Khalid Aziz Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/vfio/vfio_iommu_type1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9a50b0558fa9..96fddc1dafc3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -375,6 +375,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); + vaddr = untagged_addr(vaddr); + vma = find_vma_intersection(mm, vaddr, vaddr + 1); if (vma && vma->vm_flags & VM_PFNMAP) { From ce18d171cb7368557e6498a3ce111d7d3dc03e4d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Sep 2019 16:49:04 -0700 Subject: [PATCH 1292/2880] mm: untag user pointers in mmap/munmap/mremap/brk There isn't a good reason to differentiate between the user address space layout modification syscalls and the other memory permission/attributes ones (e.g. mprotect, madvise) w.r.t. the tagged address ABI. Untag the user addresses on entry to these functions. Link: http://lkml.kernel.org/r/20190821164730.47450-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Acked-by: Will Deacon Acked-by: Andrey Konovalov Cc: Vincenzo Frascino Cc: Szabolcs Nagy Cc: Kevin Brodsky Cc: Dave P Martin Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 +++++ mm/mremap.c | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 14b7da317ec0..a7d8c84d19b7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -201,6 +201,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool downgraded = false; LIST_HEAD(uf); + brk = untagged_addr(brk); + if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -1587,6 +1589,8 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval; + addr = untagged_addr(addr); + if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); @@ -2885,6 +2889,7 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { + addr = untagged_addr(addr); profile_munmap(addr); return __vm_munmap(addr, len, true); } diff --git a/mm/mremap.c b/mm/mremap.c index 64c9a3b8be0a..1fc8a29fbe3f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -606,12 +606,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); - /* - * Architectures may interpret the tag passed to mmap as a background - * colour for the corresponding vma. For mremap we don't allow tagged - * new_addr to preserve similar behaviour to mmap. - */ addr = untagged_addr(addr); + new_addr = untagged_addr(new_addr); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; From 9c276cc65a58faf98be8e56962745ec99ab87636 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:08 -0700 Subject: [PATCH 1293/2880] mm: introduce MADV_COLD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Introduce MADV_COLD and MADV_PAGEOUT", v7. - Background The Android terminology used for forking a new process and starting an app from scratch is a cold start, while resuming an existing app is a hot start. While we continually try to improve the performance of cold starts, hot starts will always be significantly less power hungry as well as faster so we are trying to make hot start more likely than cold start. To increase hot start, Android userspace manages the order that apps should be killed in a process called ActivityManagerService. ActivityManagerService tracks every Android app or service that the user could be interacting with at any time and translates that into a ranked list for lmkd(low memory killer daemon). They are likely to be killed by lmkd if the system has to reclaim memory. In that sense they are similar to entries in any other cache. Those apps are kept alive for opportunistic performance improvements but those performance improvements will vary based on the memory requirements of individual workloads. - Problem Naturally, cached apps were dominant consumers of memory on the system. However, they were not significant consumers of swap even though they are good candidate for swap. Under investigation, swapping out only begins once the low zone watermark is hit and kswapd wakes up, but the overall allocation rate in the system might trip lmkd thresholds and cause a cached process to be killed(we measured performance swapping out vs. zapping the memory by killing a process. Unsurprisingly, zapping is 10x times faster even though we use zram which is much faster than real storage) so kill from lmkd will often satisfy the high zone watermark, resulting in very few pages actually being moved to swap. - Approach The approach we chose was to use a new interface to allow userspace to proactively reclaim entire processes by leveraging platform information. This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages that are known to be cold from userspace and to avoid races with lmkd by reclaiming apps as soon as they entered the cached state. Additionally, it could provide many chances for platform to use much information to optimize memory efficiency. To achieve the goal, the patchset introduce two new options for madvise. One is MADV_COLD which will deactivate activated pages and the other is MADV_PAGEOUT which will reclaim private pages instantly. These new options complement MADV_DONTNEED and MADV_FREE by adding non-destructive ways to gain some free memory space. MADV_PAGEOUT is similar to MADV_DONTNEED in a way that it hints the kernel that memory region is not currently needed and should be reclaimed immediately; MADV_COLD is similar to MADV_FREE in a way that it hints the kernel that memory region is not currently needed and should be reclaimed when memory pressure rises. This patch (of 5): When a process expects no accesses to a certain memory range, it could give a hint to kernel that the pages can be reclaimed when memory pressure happens but data should be preserved for future use. This could reduce workingset eviction so it ends up increasing performance. This patch introduces the new MADV_COLD hint to madvise(2) syscall. MADV_COLD can be used by a process to mark a memory range as not expected to be used in the near future. The hint can help kernel in deciding which pages to evict early during memory pressure. It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves active file page -> inactive file LRU active anon page -> inacdtive anon LRU Unlike MADV_FREE, it doesn't move active anonymous pages to inactive file LRU's head because MADV_COLD is a little bit different symantic. MADV_FREE means it's okay to discard when the memory pressure because the content of the page is *garbage* so freeing such pages is almost zero overhead since we don't need to swap out and access afterward causes just minor fault. Thus, it would make sense to put those freeable pages in inactive file LRU to compete other used-once pages. It makes sense for implmentaion point of view, too because it's not swapbacked memory any longer until it would be re-dirtied. Even, it could give a bonus to make them be reclaimed on swapless system. However, MADV_COLD doesn't mean garbage so reclaiming them requires swap-out/in in the end so it's bigger cost. Since we have designed VM LRU aging based on cost-model, anonymous cold pages would be better to position inactive anon's LRU list, not file LRU. Furthermore, it would help to avoid unnecessary scanning if system doesn't have a swap device. Let's start simpler way without adding complexity at this moment. However, keep in mind, too that it's a caveat that workloads with a lot of pages cache are likely to ignore MADV_COLD on anonymous memory because we rarely age anonymous LRU lists. * man-page material MADV_COLD (since Linux x.x) Pages in the specified regions will be treated as less-recently-accessed compared to pages in the system with similar access frequencies. In contrast to MADV_FREE, the contents of the region are preserved regardless of subsequent writes to pages. MADV_COLD cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP pages. [akpm@linux-foundation.org: resolve conflicts with hmm.git] Link: http://lkml.kernel.org/r/20190726023435.214162-2-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: kbuild test robot Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: James E.J. Bottomley Cc: Richard Henderson Cc: Ralf Baechle Cc: Chris Zankel Cc: Johannes Weiner Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: Joel Fernandes (Google) Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 2 + arch/mips/include/uapi/asm/mman.h | 2 + arch/parisc/include/uapi/asm/mman.h | 2 + arch/xtensa/include/uapi/asm/mman.h | 2 + include/linux/swap.h | 1 + include/uapi/asm-generic/mman-common.h | 2 + mm/internal.h | 2 +- mm/madvise.c | 179 ++++++++++++++++++++++++- mm/oom_kill.c | 2 +- mm/swap.c | 42 ++++++ 10 files changed, 232 insertions(+), 4 deletions(-) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index ac23379b7a87..f3258fbf03d0 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -68,6 +68,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index c2b40969eb1f..00ad09fc5eb1 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -95,6 +95,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index c98162f494db..eb14e3a7b8f3 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -48,6 +48,8 @@ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ +#define MADV_COLD 20 /* deactivate these pages */ + #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index ebbb48842190..f926b00ff11f 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -103,6 +103,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/linux/swap.h b/include/linux/swap.h index de2c67a33b7e..0ce997edb8bb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); +extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 63b1f506ea67..23431faf0eb6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -67,6 +67,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/internal.h b/mm/internal.h index e32390802fd3..0d5f720c75ab 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma) +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); } diff --git a/mm/madvise.c b/mm/madvise.c index 1f8a6fdc6878..e1aee62967c3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_COLD: case MADV_FREE: return 0; default: @@ -289,6 +291,176 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + struct page *page; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t orig_pmd; + unsigned long next = pmd_addr_end(addr, end); + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto huge_unlock; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto huge_unlock; + } + + page = pmd_page(orig_pmd); + if (next - addr != HPAGE_PMD_SIZE) { + int err; + + if (page_mapcount(page) != 1) + goto huge_unlock; + + get_page(page); + spin_unlock(ptl); + lock_page(page); + err = split_huge_page(page); + unlock_page(page); + put_page(page); + if (!err) + goto regular_page; + return 0; + } + + if (pmd_young(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + + test_and_clear_page_young(page); + deactivate_page(page); +huge_unlock: + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +regular_page: +#endif + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr < end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * Creating a THP page is expensive so split it only if we + * are sure it's worth. Split it if we are only owner. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + break; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + break; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + break; + } + unlock_page(page); + put_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (pte_young(ptent)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + + /* + * We are deactivating a page for accelerating reclaiming. + * VM couldn't reclaim the page unless we clear PG_young. + * As a side effect, it makes confuse idle-page tracking + * because they will miss recent referenced history. + */ + test_and_clear_page_young(page); + deactivate_page(page); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); + + return 0; +} + +static const struct mm_walk_ops cold_walk_ops = { + .pmd_entry = madvise_cold_pte_range, +}; + +static void madvise_cold_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + tlb_start_vma(tlb, vma); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + tlb_end_vma(tlb, vma); +} + +static long madvise_cold(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + *prev = vma; + if (!can_madv_lru_vma(vma)) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + madvise_cold_page_range(&tlb, vma, start_addr, end_addr); + tlb_finish_mmu(&tlb, start_addr, end_addr); + + return 0; +} + static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -493,7 +665,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, int behavior) { *prev = vma; - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) return -EINVAL; if (!userfaultfd_remove(vma, start, end)) { @@ -515,7 +687,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ return -ENOMEM; } - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) return -EINVAL; if (end > vma->vm_end) { /* @@ -669,6 +841,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_COLD: + return madvise_cold(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); @@ -690,6 +864,7 @@ madvise_behavior_valid(int behavior) case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: + case MADV_COLD: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c1d9496b4c43..71e3acea7817 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) set_bit(MMF_UNSTABLE, &mm->flags); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) continue; /* diff --git a/mm/swap.c b/mm/swap.c index 784dc1620620..38c3fa4308e2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -47,6 +47,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); @@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, update_page_reclaim_stat(lruvec, file, 0); } +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, file, 0); + } +} static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) @@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); @@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page) } } +/* + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page. + */ +void deactivate_page(struct page *page) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + get_page(page); + if (!pagevec_add(pvec, page) || PageCompound(page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } +} + /** * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate @@ -687,6 +728,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); From 8940b34a4e082ae11498ddae8432f2ac07685d1c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:11 -0700 Subject: [PATCH 1294/2880] mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM The local variable references in shrink_page_list is PAGEREF_RECLAIM_CLEAN as default. It is for preventing to reclaim dirty pages when CMA try to migrate pages. Strictly speaking, we don't need it because CMA didn't allow to write out by .may_writepage = 0 in reclaim_clean_pages_from_list. Moreover, it has a problem to prevent anonymous pages's swap out even though force_reclaim = true in shrink_page_list on upcoming patch. So this patch makes references's default value to PAGEREF_RECLAIM and rename force_reclaim with ignore_references to make it more clear. This is a preparatory work for next patch. Link: http://lkml.kernel.org/r/20190726023435.214162-3-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Chris Zankel Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: James E.J. Bottomley Cc: Joel Fernandes (Google) Cc: kbuild test robot Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Ralf Baechle Cc: Richard Henderson Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4911754c93b7..d8bbaf068c35 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1123,7 +1123,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct scan_control *sc, enum ttu_flags ttu_flags, struct reclaim_stat *stat, - bool force_reclaim) + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -1137,7 +1137,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct address_space *mapping; struct page *page; int may_enter_fs; - enum page_references references = PAGEREF_RECLAIM_CLEAN; + enum page_references references = PAGEREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; @@ -1268,7 +1268,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!force_reclaim) + if (!ignore_references) references = page_check_references(page, sc); switch (references) { From 1a4e58cce84ee88129d5d49c064bd2852b481357 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:15 -0700 Subject: [PATCH 1295/2880] mm: introduce MADV_PAGEOUT When a process expects no accesses to a certain memory range for a long time, it could hint kernel that the pages can be reclaimed instantly but data should be preserved for future use. This could reduce workingset eviction so it ends up increasing performance. This patch introduces the new MADV_PAGEOUT hint to madvise(2) syscall. MADV_PAGEOUT can be used by a process to mark a memory range as not expected to be used for a long time so that kernel reclaims *any LRU* pages instantly. The hint can help kernel in deciding which pages to evict proactively. A note: It doesn't apply SWAP_CLUSTER_MAX LRU page isolation limit intentionally because it's automatically bounded by PMD size. If PMD size(e.g., 256) makes some trouble, we could fix it later by limit it to SWAP_CLUSTER_MAX[1]. - man-page material MADV_PAGEOUT (since Linux x.x) Do not expect access in the near future so pages in the specified regions could be reclaimed instantly regardless of memory pressure. Thus, access in the range after successful operation could cause major page fault but never lose the up-to-date contents unlike MADV_DONTNEED. Pages belonging to a shared mapping are only processed if a write access is allowed for the calling process. MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP pages. [1] https://lore.kernel.org/lkml/20190710194719.GS29695@dhcp22.suse.cz/ [minchan@kernel.org: clear PG_active on MADV_PAGEOUT] Link: http://lkml.kernel.org/r/20190802200643.GA181880@google.com [akpm@linux-foundation.org: resolve conflicts with hmm.git] Link: http://lkml.kernel.org/r/20190726023435.214162-5-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: kbuild test robot Acked-by: Michal Hocko Cc: James E.J. Bottomley Cc: Richard Henderson Cc: Ralf Baechle Cc: Chris Zankel Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: Joel Fernandes (Google) Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 1 + arch/mips/include/uapi/asm/mman.h | 1 + arch/parisc/include/uapi/asm/mman.h | 1 + arch/xtensa/include/uapi/asm/mman.h | 1 + include/linux/swap.h | 1 + include/uapi/asm-generic/mman-common.h | 1 + mm/madvise.c | 189 +++++++++++++++++++++++++ mm/vmscan.c | 56 ++++++++ 8 files changed, 251 insertions(+) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index f3258fbf03d0..a18ec7f63888 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -69,6 +69,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 00ad09fc5eb1..57dc2ac4f8bd 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -96,6 +96,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index eb14e3a7b8f3..6fd8871e4081 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -49,6 +49,7 @@ #define MADV_DOFORK 11 /* do inherit across fork */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index f926b00ff11f..e5e643752947 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -104,6 +104,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ce997edb8bb..063c0c1e112b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -365,6 +365,7 @@ extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); extern unsigned long vm_total_pages; +extern unsigned long reclaim_pages(struct list_head *page_list); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 23431faf0eb6..c160a5354eb6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -68,6 +68,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/madvise.c b/mm/madvise.c index e1aee62967c3..54c5639774b6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -44,6 +44,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_WILLNEED: case MADV_DONTNEED: case MADV_COLD: + case MADV_PAGEOUT: case MADV_FREE: return 0; default: @@ -461,6 +462,191 @@ static long madvise_cold(struct vm_area_struct *vma, return 0; } +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + LIST_HEAD(page_list); + struct page *page; + + if (fatal_signal_pending(current)) + return -EINTR; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t orig_pmd; + unsigned long next = pmd_addr_end(addr, end); + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto huge_unlock; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto huge_unlock; + } + + page = pmd_page(orig_pmd); + if (next - addr != HPAGE_PMD_SIZE) { + int err; + + if (page_mapcount(page) != 1) + goto huge_unlock; + get_page(page); + spin_unlock(ptl); + lock_page(page); + err = split_huge_page(page); + unlock_page(page); + put_page(page); + if (!err) + goto regular_page; + return 0; + } + + if (pmd_young(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_tlb_entry(tlb, pmd, addr); + } + + ClearPageReferenced(page); + test_and_clear_page_young(page); + + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); +huge_unlock: + spin_unlock(ptl); + reclaim_pages(&page_list); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +regular_page: +#endif + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr < end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * creating a THP page is expensive so split it only if we + * are sure it's worth. Split it if we are only owner. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + break; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + break; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + break; + } + unlock_page(page); + put_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (pte_young(ptent)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + ClearPageReferenced(page); + test_and_clear_page_young(page); + + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + reclaim_pages(&page_list); + cond_resched(); + + return 0; +} + +static void madvise_pageout_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + tlb_start_vma(tlb, vma); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + tlb_end_vma(tlb, vma); +} + +static inline bool can_do_pageout(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * paging out pagecache only for non-anonymous mappings that correspond + * to the files the calling process could (if tried) open for writing; + * otherwise we'd be including shared non-exclusive mappings, which + * opens a side channel. + */ + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + +static long madvise_pageout(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + *prev = vma; + if (!can_madv_lru_vma(vma)) + return -EINVAL; + + if (!can_do_pageout(vma)) + return 0; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); + tlb_finish_mmu(&tlb, start_addr, end_addr); + + return 0; +} + static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -843,6 +1029,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_willneed(vma, prev, start, end); case MADV_COLD: return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); @@ -865,6 +1053,7 @@ madvise_behavior_valid(int behavior) case MADV_DONTNEED: case MADV_FREE: case MADV_COLD: + case MADV_PAGEOUT: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/vmscan.c b/mm/vmscan.c index d8bbaf068c35..e5d52d6a24af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2145,6 +2145,62 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } +unsigned long reclaim_pages(struct list_head *page_list) +{ + int nid = -1; + unsigned long nr_reclaimed = 0; + LIST_HEAD(node_page_list); + struct reclaim_stat dummy_stat; + struct page *page; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + if (nid == -1) { + nid = page_to_nid(page); + INIT_LIST_HEAD(&node_page_list); + } + + if (nid == page_to_nid(page)) { + ClearPageActive(page); + list_move(&page->lru, &node_page_list); + continue; + } + + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + nid = -1; + } + + if (!list_empty(&node_page_list)) { + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + } + + return nr_reclaimed; +} + /* * The inactive anon list should be small enough that the VM never has * to do too much work. From d616d5126503967bf365db0711ee3c78b356efe9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:19 -0700 Subject: [PATCH 1296/2880] mm: factor out common parts between MADV_COLD and MADV_PAGEOUT There are many common parts between MADV_COLD and MADV_PAGEOUT. This patch factor them out to save code duplication. Link: http://lkml.kernel.org/r/20190726023435.214162-6-minchan@kernel.org Signed-off-by: Minchan Kim Suggested-by: Johannes Weiner Acked-by: Michal Hocko Cc: Chris Zankel Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: James E.J. Bottomley Cc: Joel Fernandes (Google) Cc: kbuild test robot Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Ralf Baechle Cc: Richard Henderson Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 192 ++++++++++++--------------------------------------- 1 file changed, 45 insertions(+), 147 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 54c5639774b6..2be9f3fdb05e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -32,6 +32,11 @@ #include "internal.h" +struct madvise_walk_private { + struct mmu_gather *tlb; + bool pageout; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_sem for writing. Others, which simply traverse vmas, need @@ -292,15 +297,22 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } -static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) +static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) { - struct mmu_gather *tlb = walk->private; + struct madvise_walk_private *private = walk->private; + struct mmu_gather *tlb = private->tlb; + bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *orig_pte, *pte, ptent; spinlock_t *ptl; - struct page *page; + struct page *page = NULL; + LIST_HEAD(page_list); + + if (fatal_signal_pending(current)) + return -EINTR; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { @@ -348,10 +360,17 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + ClearPageReferenced(page); test_and_clear_page_young(page); - deactivate_page(page); + if (pageout) { + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); + } else + deactivate_page(page); huge_unlock: spin_unlock(ptl); + if (pageout) + reclaim_pages(&page_list); return 0; } @@ -419,27 +438,39 @@ regular_page: * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history. */ + ClearPageReferenced(page); test_and_clear_page_young(page); - deactivate_page(page); + if (pageout) { + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); + } else + deactivate_page(page); } arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_pte, ptl); + if (pageout) + reclaim_pages(&page_list); cond_resched(); return 0; } static const struct mm_walk_ops cold_walk_ops = { - .pmd_entry = madvise_cold_pte_range, + .pmd_entry = madvise_cold_or_pageout_pte_range, }; static void madvise_cold_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + struct madvise_walk_private walk_private = { + .pageout = false, + .tlb = tlb, + }; + tlb_start_vma(tlb, vma); - walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } @@ -462,150 +493,17 @@ static long madvise_cold(struct vm_area_struct *vma, return 0; } -static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct mmu_gather *tlb = walk->private; - struct mm_struct *mm = tlb->mm; - struct vm_area_struct *vma = walk->vma; - pte_t *orig_pte, *pte, ptent; - spinlock_t *ptl; - LIST_HEAD(page_list); - struct page *page; - - if (fatal_signal_pending(current)) - return -EINTR; - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge(*pmd)) { - pmd_t orig_pmd; - unsigned long next = pmd_addr_end(addr, end); - - tlb_change_page_size(tlb, HPAGE_PMD_SIZE); - ptl = pmd_trans_huge_lock(pmd, vma); - if (!ptl) - return 0; - - orig_pmd = *pmd; - if (is_huge_zero_pmd(orig_pmd)) - goto huge_unlock; - - if (unlikely(!pmd_present(orig_pmd))) { - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); - goto huge_unlock; - } - - page = pmd_page(orig_pmd); - if (next - addr != HPAGE_PMD_SIZE) { - int err; - - if (page_mapcount(page) != 1) - goto huge_unlock; - get_page(page); - spin_unlock(ptl); - lock_page(page); - err = split_huge_page(page); - unlock_page(page); - put_page(page); - if (!err) - goto regular_page; - return 0; - } - - if (pmd_young(orig_pmd)) { - pmdp_invalidate(vma, addr, pmd); - orig_pmd = pmd_mkold(orig_pmd); - - set_pmd_at(mm, addr, pmd, orig_pmd); - tlb_remove_tlb_entry(tlb, pmd, addr); - } - - ClearPageReferenced(page); - test_and_clear_page_young(page); - - if (!isolate_lru_page(page)) - list_add(&page->lru, &page_list); -huge_unlock: - spin_unlock(ptl); - reclaim_pages(&page_list); - return 0; - } - - if (pmd_trans_unstable(pmd)) - return 0; -regular_page: -#endif - tlb_change_page_size(tlb, PAGE_SIZE); - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); - for (; addr < end; pte++, addr += PAGE_SIZE) { - ptent = *pte; - if (!pte_present(ptent)) - continue; - - page = vm_normal_page(vma, addr, ptent); - if (!page) - continue; - - /* - * creating a THP page is expensive so split it only if we - * are sure it's worth. Split it if we are only owner. - */ - if (PageTransCompound(page)) { - if (page_mapcount(page) != 1) - break; - get_page(page); - if (!trylock_page(page)) { - put_page(page); - break; - } - pte_unmap_unlock(orig_pte, ptl); - if (split_huge_page(page)) { - unlock_page(page); - put_page(page); - pte_offset_map_lock(mm, pmd, addr, &ptl); - break; - } - unlock_page(page); - put_page(page); - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - pte--; - addr -= PAGE_SIZE; - continue; - } - - VM_BUG_ON_PAGE(PageTransCompound(page), page); - - if (pte_young(ptent)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - ptent = pte_mkold(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - } - ClearPageReferenced(page); - test_and_clear_page_young(page); - - if (!isolate_lru_page(page)) - list_add(&page->lru, &page_list); - } - - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(orig_pte, ptl); - reclaim_pages(&page_list); - cond_resched(); - - return 0; -} - static void madvise_pageout_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + struct madvise_walk_private walk_private = { + .pageout = true, + .tlb = tlb, + }; + tlb_start_vma(tlb, vma); - walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } From c7cc8d77316b4386622b2dbd29de800df7b05099 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 25 Sep 2019 16:49:22 -0700 Subject: [PATCH 1297/2880] hexagon: drop empty and unused free_initrd_mem hexagon never reserves or initializes initrd and the only mention of it is the empty free_initrd_mem() function. As we have a generic implementation of free_initrd_mem(), there is no need to define an empty stub for the hexagon implementation and it can be dropped. Link: http://lkml.kernel.org/r/1565858133-25852-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Christoph Hellwig Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/mm/init.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index f1f6ebd537b7..c961773a6fff 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -71,19 +71,6 @@ void __init mem_init(void) init_mm.context.ptbase = __pa(init_mm.pgd); } -/* - * free_initrd_mem - frees... initrd memory. - * @start - start of init memory - * @end - end of init memory - * - * Apparently has to be passed the address of the initrd memory. - * - * Wrapped by #ifdef CONFIG_BLKDEV_INITRD - */ -void free_initrd_mem(unsigned long start, unsigned long end) -{ -} - void sync_icache_dcache(pte_t pte) { unsigned long addr; From de3f186f87cf15bed8d13fedafb5bcad0167fc6d Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:25 -0700 Subject: [PATCH 1298/2880] checkpatch: check for nested (un)?likely() calls IS_ERR(), IS_ERR_OR_NULL(), IS_ERR_VALUE() and WARN*() already contain unlikely() optimization internally. Thus, there is no point in calling these functions and defines under likely()/unlikely(). This check is based on the coccinelle rule developed by Enrico Weigelt https://lore.kernel.org/lkml/1559767582-11081-1-git-send-email-info@metux.net/ Link: http://lkml.kernel.org/r/20190829165025.15750-1-efremov@linux.com Signed-off-by: Denis Efremov Cc: Joe Perches Cc: Alexander Viro Cc: Anton Altaparmakov Cc: Boris Ostrovsky Cc: Boris Pismenny Cc: Darrick J. Wong Cc: "David S. Miller" Cc: Denis Efremov Cc: Dennis Dalessandro Cc: Inaky Perez-Gonzalez Cc: Juergen Gross Cc: Leon Romanovsky Cc: Mike Marciniszyn Cc: Rob Clark Cc: Saeed Mahameed Cc: Sean Paul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4eb355d8ae73..6fcc66afb088 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6507,6 +6507,12 @@ sub process { "Using $1 should generally have parentheses around the comparison\n" . $herecurr); } +# nested likely/unlikely calls + if ($line =~ /\b(?:(?:un)?likely)\s*\(\s*!?\s*(IS_ERR(?:_OR_NULL|_VALUE)?|WARN)/) { + WARN("LIKELY_MISUSE", + "nested (un)?likely() calls, $1 already uses unlikely() internally\n" . $herecurr); + } + # whine mightly about in_atomic if ($line =~ /\bin_atomic\s*\(/) { if ($realfile =~ m@^drivers/@) { From 284b94be1925dbe035ce5218d8b5c197321262c7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 26 Sep 2019 06:23:54 +0800 Subject: [PATCH 1299/2880] blk-mq: move lockdep_assert_held() into elevator_exit Commit c48dac137a62 ("block: don't hold q->sysfs_lock in elevator_init_mq") removes q->sysfs_lock from elevator_init_mq(), but forgot to deal with lockdep_assert_held() called in blk_mq_sched_free_requests() which is run in failure path of elevator_init_mq(). blk_mq_sched_free_requests() is called in the following 3 functions: elevator_init_mq() elevator_exit() blk_cleanup_queue() In blk_cleanup_queue(), blk_mq_sched_free_requests() is followed exactly by 'mutex_lock(&q->sysfs_lock)'. So moving the lockdep_assert_held() from blk_mq_sched_free_requests() into elevator_exit() for fixing the report by syzbot. Reported-by: syzbot+da3b7677bb913dc1b737@syzkaller.appspotmail.com Fixed: c48dac137a62 ("block: don't hold q->sysfs_lock in elevator_init_mq") Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 2 -- block/blk.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c9d183d6c499..ca22afd47b3d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -555,8 +555,6 @@ void blk_mq_sched_free_requests(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - lockdep_assert_held(&q->sysfs_lock); - queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); diff --git a/block/blk.h b/block/blk.h index ed347f7a97b1..25773d668ec0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -194,6 +194,8 @@ void elv_unregister_queue(struct request_queue *q); static inline void elevator_exit(struct request_queue *q, struct elevator_queue *e) { + lockdep_assert_held(&q->sysfs_lock); + blk_mq_sched_free_requests(q); __elevator_exit(q, e); } From b89f625e28d44552083f43752f62d8621ded0a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 23 Sep 2019 23:12:09 +0800 Subject: [PATCH 1300/2880] block: don't release queue's sysfs lock during switching elevator cecf5d87ff20 ("block: split .sysfs_lock into two locks") starts to release & acquire sysfs_lock before registering/un-registering elevator queue during switching elevator for avoiding potential deadlock from showing & storing 'queue/iosched' attributes and removing elevator's kobject. Turns out there isn't such deadlock because 'q->sysfs_lock' isn't required in .show & .store of queue/iosched's attributes, and just elevator's sysfs lock is acquired in elv_iosched_store() and elv_iosched_show(). So it is safe to hold queue's sysfs lock when registering/un-registering elevator queue. The biggest issue is that commit cecf5d87ff20 assumes that concurrent write on 'queue/scheduler' can't happen. However, this assumption isn't true, because kernfs_fop_write() only guarantees that concurrent write aren't called on the same open file, but the write could be from different open on the file. So we can't release & re-acquire queue's sysfs lock during switching elevator, otherwise use-after-free on elevator could be triggered. Fixes the issue by not releasing queue's sysfs lock during switching elevator. Fixes: cecf5d87ff20 ("block: split .sysfs_lock into two locks") Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Greg KH Cc: Mike Snitzer Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 13 ++++--------- block/elevator.c | 31 +------------------------------ 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b82736c781c5..962fc0c44381 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -989,13 +989,11 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); } - /* - * The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator - * switch won't happen at all. - */ + mutex_lock(&q->sysfs_lock); if (q->elevator) { ret = elv_register_queue(q, false); if (ret) { + mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); @@ -1005,7 +1003,6 @@ int blk_register_queue(struct gendisk *disk) has_elevator = true; } - mutex_lock(&q->sysfs_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); @@ -1062,12 +1059,10 @@ void blk_unregister_queue(struct gendisk *disk) kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); - /* - * q->kobj has been removed, so it is safe to check if elevator - * exists without holding q->sysfs_lock. - */ + mutex_lock(&q->sysfs_lock); if (q->elevator) elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); kobject_put(&disk_to_dev(disk)->kobj); diff --git a/block/elevator.c b/block/elevator.c index bba10e83478a..5437059c9261 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -503,9 +503,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); - mutex_lock(&q->sysfs_lock); e->registered = 1; - mutex_unlock(&q->sysfs_lock); } return error; } @@ -523,11 +521,9 @@ void elv_unregister_queue(struct request_queue *q) kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); - mutex_lock(&q->sysfs_lock); e->registered = 0; /* Re-enable throttling in case elevator disabled it */ wbt_enable_default(q); - mutex_unlock(&q->sysfs_lock); } } @@ -590,32 +586,11 @@ int elevator_switch_mq(struct request_queue *q, lockdep_assert_held(&q->sysfs_lock); if (q->elevator) { - if (q->elevator->registered) { - mutex_unlock(&q->sysfs_lock); - - /* - * Concurrent elevator switch can't happen becasue - * sysfs write is always exclusively on same file. - * - * Also the elevator queue won't be freed after - * sysfs_lock is released becasue kobject_del() in - * blk_unregister_queue() waits for completion of - * .store & .show on its attributes. - */ + if (q->elevator->registered) elv_unregister_queue(q); - mutex_lock(&q->sysfs_lock); - } ioc_clear_queue(q); elevator_exit(q, q->elevator); - - /* - * sysfs_lock may be dropped, so re-check if queue is - * unregistered. If yes, don't switch to new elevator - * any more - */ - if (!blk_queue_registered(q)) - return 0; } ret = blk_mq_init_sched(q, new_e); @@ -623,11 +598,7 @@ int elevator_switch_mq(struct request_queue *q, goto out; if (new_e) { - mutex_unlock(&q->sysfs_lock); - ret = elv_register_queue(q, true); - - mutex_lock(&q->sysfs_lock); if (ret) { elevator_exit(q, q->elevator); goto out; From 34b7bb2995b839729a2e793ea1dccc987b6dbab6 Mon Sep 17 00:00:00 2001 From: Rain River Date: Mon, 23 Sep 2019 22:37:46 +0800 Subject: [PATCH 1301/2880] MAINTAINERS: add Yanjun to FORCEDETH maintainers list Yanjun has been spending quite a lot of time fixing bugs in FORCEDETH source code. I'd like to add Yanjun to maintainers list. Signed-off-by: Rain River Acked-by: Zhu Yanjun Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b2326dece28e..d61071f187c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -643,6 +643,7 @@ F: drivers/net/ethernet/alacritech/* FORCEDETH GIGABIT ETHERNET DRIVER M: Rain River +M: Zhu Yanjun L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/nvidia/* From bf69abad27d8fe1daca9558441fd0205fb2d7bc9 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 23 Sep 2019 17:52:42 +0200 Subject: [PATCH 1302/2880] net: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Acked-by: Sven Eckelmann Signed-off-by: David S. Miller --- net/batman-adv/Kconfig | 10 +-- net/ife/Kconfig | 2 +- net/ipv4/Kconfig | 4 +- net/ipv6/netfilter/Kconfig | 16 ++--- net/netfilter/Kconfig | 2 +- net/netfilter/ipvs/Kconfig | 6 +- net/rds/Kconfig | 4 +- net/sched/Kconfig | 144 ++++++++++++++++++------------------- 8 files changed, 94 insertions(+), 94 deletions(-) diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index a3d188dfbe75..d5028af750d5 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -12,11 +12,11 @@ config BATMAN_ADV depends on NET select LIBCRC32C help - B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is - a routing protocol for multi-hop ad-hoc mesh networks. The - networks may be wired or wireless. See - https://www.open-mesh.org/ for more information and user space - tools. + B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is + a routing protocol for multi-hop ad-hoc mesh networks. The + networks may be wired or wireless. See + https://www.open-mesh.org/ for more information and user space + tools. config BATMAN_ADV_BATMAN_V bool "B.A.T.M.A.N. V protocol" diff --git a/net/ife/Kconfig b/net/ife/Kconfig index 6cd1f6d18f30..bcf650564db4 100644 --- a/net/ife/Kconfig +++ b/net/ife/Kconfig @@ -5,7 +5,7 @@ menuconfig NET_IFE depends on NET - tristate "Inter-FE based on IETF ForCES InterFE LFB" + tristate "Inter-FE based on IETF ForCES InterFE LFB" default n help Say Y here to add support of IFE encapsulation protocol diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 974de4d20f25..03381f3e12ba 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -492,8 +492,8 @@ config TCP_CONG_WESTWOOD wired networks and throughput over wireless links. config TCP_CONG_HTCP - tristate "H-TCP" - default m + tristate "H-TCP" + default m ---help--- H-TCP is a send-side only modifications of the TCP Reno protocol stack that optimizes the performance of TCP diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6120a7800975..69443e9a3aa5 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -170,13 +170,13 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_SRH - tristate '"srh" Segment Routing header match support' - depends on NETFILTER_ADVANCED - help - srh matching allows you to match packets based on the segment + tristate '"srh" Segment Routing header match support' + depends on NETFILTER_ADVANCED + help + srh matching allows you to match packets based on the segment routing header of the packet. - To compile it as a module, choose M here. If unsure, say N. + To compile it as a module, choose M here. If unsure, say N. # The targets config IP6_NF_TARGET_HL @@ -249,10 +249,10 @@ config IP6_NF_SECURITY depends on SECURITY depends on NETFILTER_ADVANCED help - This option adds a `security' table to iptables, for use - with Mandatory Access Control (MAC) policy. + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. - If unsure, say N. + If unsure, say N. config IP6_NF_NAT tristate "ip6tables NAT support" diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 34ec7afec116..91efae88e8c2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -697,7 +697,7 @@ config NF_FLOW_TABLE_INET tristate "Netfilter flow table mixed IPv4/IPv6 module" depends on NF_FLOW_TABLE help - This option adds the flow table mixed IPv4/IPv6 support. + This option adds the flow table mixed IPv4/IPv6 support. To compile it as a module, choose M here. diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index f6f1a0d5c47d..5b672e05d758 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -135,7 +135,7 @@ config IP_VS_WRR module, choose M here. If unsure, say N. config IP_VS_LC - tristate "least-connection scheduling" + tristate "least-connection scheduling" ---help--- The least-connection scheduling algorithm directs network connections to the server with the least number of active @@ -145,7 +145,7 @@ config IP_VS_LC module, choose M here. If unsure, say N. config IP_VS_WLC - tristate "weighted least-connection scheduling" + tristate "weighted least-connection scheduling" ---help--- The weighted least-connection scheduling algorithm directs network connections to the server with the least active connections @@ -333,7 +333,7 @@ config IP_VS_NFCT config IP_VS_PE_SIP tristate "SIP persistence engine" - depends on IP_VS_PROTO_UDP + depends on IP_VS_PROTO_UDP depends on NF_CONNTRACK_SIP ---help--- Allow persistence based on the SIP Call-ID diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 38ea7f0f2699..c64e154bc18f 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -23,6 +23,6 @@ config RDS_TCP This transport does not support RDMA operations. config RDS_DEBUG - bool "RDS debugging messages" + bool "RDS debugging messages" depends on RDS - default n + default n diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b3faafeafab9..5b044ae6dc1e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -324,7 +324,7 @@ config NET_SCH_CAKE tristate "Common Applications Kept Enhanced (CAKE)" help Say Y here if you want to use the Common Applications Kept Enhanced - (CAKE) queue management algorithm. + (CAKE) queue management algorithm. To compile this driver as a module, choose M here: the module will be called sch_cake. @@ -730,8 +730,8 @@ config NET_CLS_ACT config NET_ACT_POLICE tristate "Traffic Policing" - depends on NET_CLS_ACT - ---help--- + depends on NET_CLS_ACT + ---help--- Say Y here if you want to do traffic policing, i.e. strict bandwidth limiting. This action replaces the existing policing module. @@ -740,9 +740,9 @@ config NET_ACT_POLICE module will be called act_police. config NET_ACT_GACT - tristate "Generic actions" - depends on NET_CLS_ACT - ---help--- + tristate "Generic actions" + depends on NET_CLS_ACT + ---help--- Say Y here to take generic actions such as dropping and accepting packets. @@ -750,15 +750,15 @@ config NET_ACT_GACT module will be called act_gact. config GACT_PROB - bool "Probability support" - depends on NET_ACT_GACT - ---help--- + bool "Probability support" + depends on NET_ACT_GACT + ---help--- Say Y here to use the generic action randomly or deterministically. config NET_ACT_MIRRED - tristate "Redirecting and Mirroring" - depends on NET_CLS_ACT - ---help--- + tristate "Redirecting and Mirroring" + depends on NET_CLS_ACT + ---help--- Say Y here to allow packets to be mirrored or redirected to other devices. @@ -766,10 +766,10 @@ config NET_ACT_MIRRED module will be called act_mirred. config NET_ACT_SAMPLE - tristate "Traffic Sampling" - depends on NET_CLS_ACT - select PSAMPLE - ---help--- + tristate "Traffic Sampling" + depends on NET_CLS_ACT + select PSAMPLE + ---help--- Say Y here to allow packet sampling tc action. The packet sample action consists of statistically choosing packets and sampling them using the psample module. @@ -778,9 +778,9 @@ config NET_ACT_SAMPLE module will be called act_sample. config NET_ACT_IPT - tristate "IPtables targets" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - ---help--- + tristate "IPtables targets" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + ---help--- Say Y here to be able to invoke iptables targets after successful classification. @@ -788,9 +788,9 @@ config NET_ACT_IPT module will be called act_ipt. config NET_ACT_NAT - tristate "Stateless NAT" - depends on NET_CLS_ACT - ---help--- + tristate "Stateless NAT" + depends on NET_CLS_ACT + ---help--- Say Y here to do stateless NAT on IPv4 packets. You should use netfilter for NAT unless you know what you are doing. @@ -798,18 +798,18 @@ config NET_ACT_NAT module will be called act_nat. config NET_ACT_PEDIT - tristate "Packet Editing" - depends on NET_CLS_ACT - ---help--- + tristate "Packet Editing" + depends on NET_CLS_ACT + ---help--- Say Y here if you want to mangle the content of packets. To compile this code as a module, choose M here: the module will be called act_pedit. config NET_ACT_SIMP - tristate "Simple Example (Debug)" - depends on NET_CLS_ACT - ---help--- + tristate "Simple Example (Debug)" + depends on NET_CLS_ACT + ---help--- Say Y here to add a simple action for demonstration purposes. It is meant as an example and for debugging purposes. It will print a configured policy string followed by the packet count @@ -821,9 +821,9 @@ config NET_ACT_SIMP module will be called act_simple. config NET_ACT_SKBEDIT - tristate "SKB Editing" - depends on NET_CLS_ACT - ---help--- + tristate "SKB Editing" + depends on NET_CLS_ACT + ---help--- Say Y here to change skb priority or queue_mapping settings. If unsure, say N. @@ -832,10 +832,10 @@ config NET_ACT_SKBEDIT module will be called act_skbedit. config NET_ACT_CSUM - tristate "Checksum Updating" - depends on NET_CLS_ACT && INET - select LIBCRC32C - ---help--- + tristate "Checksum Updating" + depends on NET_CLS_ACT && INET + select LIBCRC32C + ---help--- Say Y here to update some common checksum after some direct packet alterations. @@ -854,9 +854,9 @@ config NET_ACT_MPLS module will be called act_mpls. config NET_ACT_VLAN - tristate "Vlan manipulation" - depends on NET_CLS_ACT - ---help--- + tristate "Vlan manipulation" + depends on NET_CLS_ACT + ---help--- Say Y here to push or pop vlan headers. If unsure, say N. @@ -865,9 +865,9 @@ config NET_ACT_VLAN module will be called act_vlan. config NET_ACT_BPF - tristate "BPF based action" - depends on NET_CLS_ACT - ---help--- + tristate "BPF based action" + depends on NET_CLS_ACT + ---help--- Say Y here to execute BPF code on packets. The BPF code will decide if the packet should be dropped or not. @@ -877,10 +877,10 @@ config NET_ACT_BPF module will be called act_bpf. config NET_ACT_CONNMARK - tristate "Netfilter Connection Mark Retriever" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - depends on NF_CONNTRACK && NF_CONNTRACK_MARK - ---help--- + tristate "Netfilter Connection Mark Retriever" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + ---help--- Say Y here to allow retrieving of conn mark If unsure, say N. @@ -889,10 +889,10 @@ config NET_ACT_CONNMARK module will be called act_connmark. config NET_ACT_CTINFO - tristate "Netfilter Connection Mark Actions" - depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - depends on NF_CONNTRACK && NF_CONNTRACK_MARK - help + tristate "Netfilter Connection Mark Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + help Say Y here to allow transfer of a connmark stored information. Current actions transfer connmark stored DSCP into ipv4/v6 diffserv and/or to transfer connmark to packet @@ -906,21 +906,21 @@ config NET_ACT_CTINFO module will be called act_ctinfo. config NET_ACT_SKBMOD - tristate "skb data modification action" - depends on NET_CLS_ACT - ---help--- - Say Y here to allow modification of skb data + tristate "skb data modification action" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow modification of skb data - If unsure, say N. + If unsure, say N. - To compile this code as a module, choose M here: the - module will be called act_skbmod. + To compile this code as a module, choose M here: the + module will be called act_skbmod. config NET_ACT_IFE - tristate "Inter-FE action based on IETF ForCES InterFE LFB" - depends on NET_CLS_ACT - select NET_IFE - ---help--- + tristate "Inter-FE action based on IETF ForCES InterFE LFB" + depends on NET_CLS_ACT + select NET_IFE + ---help--- Say Y here to allow for sourcing and terminating metadata For details refer to netdev01 paper: "Distributing Linux Traffic Control Classifier-Action Subsystem" @@ -930,9 +930,9 @@ config NET_ACT_IFE module will be called act_ife. config NET_ACT_TUNNEL_KEY - tristate "IP tunnel metadata manipulation" - depends on NET_CLS_ACT - ---help--- + tristate "IP tunnel metadata manipulation" + depends on NET_CLS_ACT + ---help--- Say Y here to set/release ip tunnel metadata. If unsure, say N. @@ -941,9 +941,9 @@ config NET_ACT_TUNNEL_KEY module will be called act_tunnel_key. config NET_ACT_CT - tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT - help + tristate "connection tracking tc action" + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT + help Say Y here to allow sending the packets to conntrack module. If unsure, say N. @@ -952,16 +952,16 @@ config NET_ACT_CT module will be called act_ct. config NET_IFE_SKBMARK - tristate "Support to encoding decoding skb mark on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb mark on IFE action" + depends on NET_ACT_IFE config NET_IFE_SKBPRIO - tristate "Support to encoding decoding skb prio on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb prio on IFE action" + depends on NET_ACT_IFE config NET_IFE_SKBTCINDEX - tristate "Support to encoding decoding skb tcindex on IFE action" - depends on NET_ACT_IFE + tristate "Support to encoding decoding skb tcindex on IFE action" + depends on NET_ACT_IFE config NET_TC_SKB_EXT bool "TC recirculation support" From 02bc5eb990597796d8e8383d1b98e540af963bf1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 23 Sep 2019 17:52:43 +0200 Subject: [PATCH 1303/2880] drivers: net: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Acked-by: Kalle Valo Reviewed-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/Kconfig | 2 +- drivers/net/arcnet/Kconfig | 22 ++-- drivers/net/can/usb/Kconfig | 8 +- drivers/net/ethernet/allwinner/Kconfig | 10 +- drivers/net/ethernet/emulex/benet/Kconfig | 2 +- .../net/ethernet/mellanox/mlx5/core/Kconfig | 36 +++--- drivers/net/ethernet/nxp/Kconfig | 8 +- drivers/net/ethernet/pensando/Kconfig | 4 +- drivers/net/phy/Kconfig | 6 +- drivers/net/wireless/ath/Kconfig | 2 +- drivers/net/wireless/ath/ar5523/Kconfig | 4 +- drivers/net/wireless/ath/ath6kl/Kconfig | 2 +- drivers/net/wireless/ath/ath9k/Kconfig | 2 +- drivers/net/wireless/ath/carl9170/Kconfig | 6 +- drivers/net/wireless/atmel/Kconfig | 32 ++--- drivers/net/wireless/intel/ipw2x00/Kconfig | 116 +++++++++--------- drivers/net/wireless/intel/iwlegacy/Kconfig | 6 +- drivers/net/wireless/intel/iwlwifi/Kconfig | 6 +- drivers/net/wireless/ralink/rt2x00/Kconfig | 24 ++-- 19 files changed, 149 insertions(+), 149 deletions(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 48e209e55843..df1c7989e13d 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -487,7 +487,7 @@ config FUJITSU_ES depends on ACPI help This driver provides support for Extended Socket network device - on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series. + on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series. config THUNDERBOLT_NET tristate "Networking over Thunderbolt cable" diff --git a/drivers/net/arcnet/Kconfig b/drivers/net/arcnet/Kconfig index faeb4419b205..27551bf3d7e4 100644 --- a/drivers/net/arcnet/Kconfig +++ b/drivers/net/arcnet/Kconfig @@ -56,19 +56,19 @@ config ARCNET_CAP tristate "Enable CAP mode packet interface" help ARCnet "cap mode" packet encapsulation. Used to get the hardware - acknowledge back to userspace. After the initial protocol byte every - packet is stuffed with an extra 4 byte "cookie" which doesn't - actually appear on the network. After transmit the driver will send - back a packet with protocol byte 0 containing the status of the - transmission: - 0=no hardware acknowledge - 1=excessive nak - 2=transmission accepted by the receiver hardware + acknowledge back to userspace. After the initial protocol byte every + packet is stuffed with an extra 4 byte "cookie" which doesn't + actually appear on the network. After transmit the driver will send + back a packet with protocol byte 0 containing the status of the + transmission: + 0=no hardware acknowledge + 1=excessive nak + 2=transmission accepted by the receiver hardware - Received packets are also stuffed with the extra 4 bytes but it will - be random data. + Received packets are also stuffed with the extra 4 bytes but it will + be random data. - Cap only listens to protocol 1-8. + Cap only listens to protocol 1-8. config ARCNET_COM90xx tristate "ARCnet COM90xx (normal) chipset driver" diff --git a/drivers/net/can/usb/Kconfig b/drivers/net/can/usb/Kconfig index 4b3d0ddcda79..b412f7ba4f89 100644 --- a/drivers/net/can/usb/Kconfig +++ b/drivers/net/can/usb/Kconfig @@ -15,10 +15,10 @@ config CAN_EMS_USB from EMS Dr. Thomas Wuensche (http://www.ems-wuensche.de). config CAN_ESD_USB2 - tristate "ESD USB/2 CAN/USB interface" - ---help--- - This driver supports the CAN-USB/2 interface - from esd electronic system design gmbh (http://www.esd.eu). + tristate "ESD USB/2 CAN/USB interface" + ---help--- + This driver supports the CAN-USB/2 interface + from esd electronic system design gmbh (http://www.esd.eu). config CAN_GS_USB tristate "Geschwister Schneider UG interfaces" diff --git a/drivers/net/ethernet/allwinner/Kconfig b/drivers/net/ethernet/allwinner/Kconfig index a5e2bcbf2722..264a482ec31d 100644 --- a/drivers/net/ethernet/allwinner/Kconfig +++ b/drivers/net/ethernet/allwinner/Kconfig @@ -21,17 +21,17 @@ config NET_VENDOR_ALLWINNER if NET_VENDOR_ALLWINNER config SUN4I_EMAC - tristate "Allwinner A10 EMAC support" + tristate "Allwinner A10 EMAC support" depends on ARCH_SUNXI depends on OF select CRC32 select MII select PHYLIB select MDIO_SUN4I - ---help--- - Support for Allwinner A10 EMAC ethernet driver. + ---help--- + Support for Allwinner A10 EMAC ethernet driver. - To compile this driver as a module, choose M here. The module - will be called sun4i-emac. + To compile this driver as a module, choose M here. The module + will be called sun4i-emac. endif # NET_VENDOR_ALLWINNER diff --git a/drivers/net/ethernet/emulex/benet/Kconfig b/drivers/net/ethernet/emulex/benet/Kconfig index e8c7eb842dbe..17d300ea9955 100644 --- a/drivers/net/ethernet/emulex/benet/Kconfig +++ b/drivers/net/ethernet/emulex/benet/Kconfig @@ -48,5 +48,5 @@ config BE2NET_SKYHAWK chipsets. (e.g. OneConnect OCe14xxx) comment "WARNING: be2net is useless without any enabled chip" - depends on BE2NET_BE2=n && BE2NET_BE3=n && BE2NET_LANCER=n && \ + depends on BE2NET_BE2=n && BE2NET_BE3=n && BE2NET_LANCER=n && \ BE2NET_SKYHAWK=n && BE2NET diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 0dba272a5b2f..a1f20b205299 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -20,15 +20,15 @@ config MLX5_ACCEL bool config MLX5_FPGA - bool "Mellanox Technologies Innova support" - depends on MLX5_CORE + bool "Mellanox Technologies Innova support" + depends on MLX5_CORE select MLX5_ACCEL - ---help--- - Build support for the Innova family of network cards by Mellanox - Technologies. Innova network cards are comprised of a ConnectX chip - and an FPGA chip on one board. If you select this option, the - mlx5_core driver will include the Innova FPGA core and allow building - sandbox-specific client drivers. + ---help--- + Build support for the Innova family of network cards by Mellanox + Technologies. Innova network cards are comprised of a ConnectX chip + and an FPGA chip on one board. If you select this option, the + mlx5_core driver will include the Innova FPGA core and allow building + sandbox-specific client drivers. config MLX5_CORE_EN bool "Mellanox 5th generation network adapters (ConnectX series) Ethernet support" @@ -58,14 +58,14 @@ config MLX5_EN_RXNFC API. config MLX5_MPFS - bool "Mellanox Technologies MLX5 MPFS support" - depends on MLX5_CORE_EN + bool "Mellanox Technologies MLX5 MPFS support" + depends on MLX5_CORE_EN default y - ---help--- + ---help--- Mellanox Technologies Ethernet Multi-Physical Function Switch (MPFS) - support in ConnectX NIC. MPFs is required for when multi-PF configuration - is enabled to allow passing user configured unicast MAC addresses to the - requesting PF. + support in ConnectX NIC. MPFs is required for when multi-PF configuration + is enabled to allow passing user configured unicast MAC addresses to the + requesting PF. config MLX5_ESWITCH bool "Mellanox Technologies MLX5 SRIOV E-Switch support" @@ -73,10 +73,10 @@ config MLX5_ESWITCH default y ---help--- Mellanox Technologies Ethernet SRIOV E-Switch support in ConnectX NIC. - E-Switch provides internal SRIOV packet steering and switching for the - enabled VFs and PF in two available modes: - Legacy SRIOV mode (L2 mac vlan steering based). - Switchdev mode (eswitch offloads). + E-Switch provides internal SRIOV packet steering and switching for the + enabled VFs and PF in two available modes: + Legacy SRIOV mode (L2 mac vlan steering based). + Switchdev mode (eswitch offloads). config MLX5_CORE_EN_DCB bool "Data Center Bridging (DCB) Support" diff --git a/drivers/net/ethernet/nxp/Kconfig b/drivers/net/ethernet/nxp/Kconfig index 418afb84c84b..ee83a71c2509 100644 --- a/drivers/net/ethernet/nxp/Kconfig +++ b/drivers/net/ethernet/nxp/Kconfig @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only config LPC_ENET - tristate "NXP ethernet MAC on LPC devices" - depends on ARCH_LPC32XX || COMPILE_TEST - select PHYLIB - help + tristate "NXP ethernet MAC on LPC devices" + depends on ARCH_LPC32XX || COMPILE_TEST + select PHYLIB + help Say Y or M here if you want to use the NXP ethernet MAC included on some NXP LPC devices. You can safely enable this option for LPC32xx SoC. Also available as a module. diff --git a/drivers/net/ethernet/pensando/Kconfig b/drivers/net/ethernet/pensando/Kconfig index 5ea570be8379..bd0583e409df 100644 --- a/drivers/net/ethernet/pensando/Kconfig +++ b/drivers/net/ethernet/pensando/Kconfig @@ -26,7 +26,7 @@ config IONIC found in . - To compile this driver as a module, choose M here. The module - will be called ionic. + To compile this driver as a module, choose M here. The module + will be called ionic. endif # NET_VENDOR_PENSANDO diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 03be30cde552..fe602648b99f 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -460,9 +460,9 @@ config RENESAS_PHY Supports the Renesas PHYs uPD60620 and uPD60620A. config ROCKCHIP_PHY - tristate "Driver for Rockchip Ethernet PHYs" - ---help--- - Currently supports the integrated Ethernet PHY. + tristate "Driver for Rockchip Ethernet PHYs" + ---help--- + Currently supports the integrated Ethernet PHY. config SMSC_PHY tristate "SMSC PHYs" diff --git a/drivers/net/wireless/ath/Kconfig b/drivers/net/wireless/ath/Kconfig index d98d6ac90f3d..56616d988c96 100644 --- a/drivers/net/wireless/ath/Kconfig +++ b/drivers/net/wireless/ath/Kconfig @@ -34,7 +34,7 @@ config ATH_TRACEPOINTS depends on ATH_DEBUG depends on EVENT_TRACING ---help--- - This option enables tracepoints for atheros wireless drivers. + This option enables tracepoints for atheros wireless drivers. Currently, ath9k makes use of this facility. config ATH_REG_DYNAMIC_USER_REG_HINTS diff --git a/drivers/net/wireless/ath/ar5523/Kconfig b/drivers/net/wireless/ath/ar5523/Kconfig index 41d3c9a48b08..65b39c7d035d 100644 --- a/drivers/net/wireless/ath/ar5523/Kconfig +++ b/drivers/net/wireless/ath/ar5523/Kconfig @@ -5,5 +5,5 @@ config AR5523 select ATH_COMMON select FW_LOADER ---help--- - This module add support for AR5523 based USB dongles such as D-Link - DWL-G132, Netgear WPN111 and many more. + This module add support for AR5523 based USB dongles such as D-Link + DWL-G132, Netgear WPN111 and many more. diff --git a/drivers/net/wireless/ath/ath6kl/Kconfig b/drivers/net/wireless/ath/ath6kl/Kconfig index dcf8ca0dcc52..62c22fdcca38 100644 --- a/drivers/net/wireless/ath/ath6kl/Kconfig +++ b/drivers/net/wireless/ath/ath6kl/Kconfig @@ -2,7 +2,7 @@ config ATH6KL tristate "Atheros mobile chipsets support" depends on CFG80211 - ---help--- + ---help--- This module adds core support for wireless adapters based on Atheros AR6003 and AR6004 chipsets. You still need separate bus drivers for USB and SDIO to be able to use real devices. diff --git a/drivers/net/wireless/ath/ath9k/Kconfig b/drivers/net/wireless/ath/ath9k/Kconfig index 2d1247f61297..c99f42284465 100644 --- a/drivers/net/wireless/ath/ath9k/Kconfig +++ b/drivers/net/wireless/ath/ath9k/Kconfig @@ -148,7 +148,7 @@ config ATH9K_CHANNEL_CONTEXT depends on ATH9K default n ---help--- - This option enables channel context support in ath9k, which is needed + This option enables channel context support in ath9k, which is needed for multi-channel concurrency. Enable this if P2P PowerSave support is required. diff --git a/drivers/net/wireless/ath/carl9170/Kconfig b/drivers/net/wireless/ath/carl9170/Kconfig index 757eb765e17c..b1bce7aad399 100644 --- a/drivers/net/wireless/ath/carl9170/Kconfig +++ b/drivers/net/wireless/ath/carl9170/Kconfig @@ -41,9 +41,9 @@ config CARL9170_WPC default y config CARL9170_HWRNG - bool "Random number generator" - depends on CARL9170 && (HW_RANDOM = y || HW_RANDOM = CARL9170) - default n + bool "Random number generator" + depends on CARL9170 && (HW_RANDOM = y || HW_RANDOM = CARL9170) + default n help Provides a hardware random number generator to the kernel. diff --git a/drivers/net/wireless/atmel/Kconfig b/drivers/net/wireless/atmel/Kconfig index 809bdf331848..4c0556b3a5ba 100644 --- a/drivers/net/wireless/atmel/Kconfig +++ b/drivers/net/wireless/atmel/Kconfig @@ -20,22 +20,22 @@ config ATMEL select FW_LOADER select CRC32 ---help--- - A driver 802.11b wireless cards based on the Atmel fast-vnet - chips. This driver supports standard Linux wireless extensions. + A driver 802.11b wireless cards based on the Atmel fast-vnet + chips. This driver supports standard Linux wireless extensions. - Many cards based on this chipset do not have flash memory - and need their firmware loaded at start-up. If yours is - one of these, you will need to provide a firmware image - to be loaded into the card by the driver. The Atmel - firmware package can be downloaded from - + Many cards based on this chipset do not have flash memory + and need their firmware loaded at start-up. If yours is + one of these, you will need to provide a firmware image + to be loaded into the card by the driver. The Atmel + firmware package can be downloaded from + config PCI_ATMEL tristate "Atmel at76c506 PCI cards" depends on ATMEL && PCI ---help--- - Enable support for PCI and mini-PCI cards containing the - Atmel at76c506 chip. + Enable support for PCI and mini-PCI cards containing the + Atmel at76c506 chip. config PCMCIA_ATMEL tristate "Atmel at76c502/at76c504 PCMCIA cards" @@ -48,11 +48,11 @@ config PCMCIA_ATMEL Atmel at76c502 and at76c504 chips. config AT76C50X_USB - tristate "Atmel at76c503/at76c505/at76c505a USB cards" - depends on MAC80211 && USB - select FW_LOADER - ---help--- - Enable support for USB Wireless devices using Atmel at76c503, - at76c505 or at76c505a chips. + tristate "Atmel at76c503/at76c505/at76c505a USB cards" + depends on MAC80211 && USB + select FW_LOADER + ---help--- + Enable support for USB Wireless devices using Atmel at76c503, + at76c505 or at76c505a chips. endif # WLAN_VENDOR_ATMEL diff --git a/drivers/net/wireless/intel/ipw2x00/Kconfig b/drivers/net/wireless/intel/ipw2x00/Kconfig index 5d2878a73732..ab17903ba9f8 100644 --- a/drivers/net/wireless/intel/ipw2x00/Kconfig +++ b/drivers/net/wireless/intel/ipw2x00/Kconfig @@ -13,37 +13,37 @@ config IPW2100 select LIB80211 select LIBIPW ---help--- - A driver for the Intel PRO/Wireless 2100 Network + A driver for the Intel PRO/Wireless 2100 Network Connection 802.11b wireless network adapter. - See + See for information on the capabilities currently enabled in this driver and for tips for debugging issues and problems. In order to use this driver, you will need a firmware image for it. - You can obtain the firmware from - . Once you have the firmware image, you + You can obtain the firmware from + . Once you have the firmware image, you will need to place it in /lib/firmware. - You will also very likely need the Wireless Tools in order to - configure your card: + You will also very likely need the Wireless Tools in order to + configure your card: - . + . + + It is recommended that you compile this driver as a module (M) + rather than built-in (Y). This driver requires firmware at device + initialization time, and when built-in this typically happens + before the filesystem is accessible (hence firmware will be + unavailable and initialization will fail). If you do choose to build + this driver into your kernel image, you can avoid this problem by + including the firmware and a firmware loader in an initramfs. - It is recommended that you compile this driver as a module (M) - rather than built-in (Y). This driver requires firmware at device - initialization time, and when built-in this typically happens - before the filesystem is accessible (hence firmware will be - unavailable and initialization will fail). If you do choose to build - this driver into your kernel image, you can avoid this problem by - including the firmware and a firmware loader in an initramfs. - config IPW2100_MONITOR - bool "Enable promiscuous mode" - depends on IPW2100 - ---help--- + bool "Enable promiscuous mode" + depends on IPW2100 + ---help--- Enables promiscuous/monitor mode support for the ipw2100 driver. - With this feature compiled into the driver, you can switch to + With this feature compiled into the driver, you can switch to promiscuous mode via the Wireless Tool's Monitor mode. While in this mode, no packets can be sent. @@ -51,17 +51,17 @@ config IPW2100_DEBUG bool "Enable full debugging output in IPW2100 module." depends on IPW2100 ---help--- - This option will enable debug tracing output for the IPW2100. + This option will enable debug tracing output for the IPW2100. - This will result in the kernel module being ~60k larger. You can - control which debug output is sent to the kernel log by setting the - value in + This will result in the kernel module being ~60k larger. You can + control which debug output is sent to the kernel log by setting the + value in /sys/bus/pci/drivers/ipw2100/debug_level This entry will only exist if this option is enabled. - If you are not trying to debug or develop the IPW2100 driver, you + If you are not trying to debug or develop the IPW2100 driver, you most likely want to say N here. config IPW2200 @@ -75,37 +75,37 @@ config IPW2200 select LIB80211 select LIBIPW ---help--- - A driver for the Intel PRO/Wireless 2200BG and 2915ABG Network - Connection adapters. + A driver for the Intel PRO/Wireless 2200BG and 2915ABG Network + Connection adapters. - See + See for information on the capabilities currently enabled in this driver and for tips for debugging issues and problems. In order to use this driver, you will need a firmware image for it. - You can obtain the firmware from - . See the above referenced README.ipw2200 + You can obtain the firmware from + . See the above referenced README.ipw2200 for information on where to install the firmware images. - You will also very likely need the Wireless Tools in order to - configure your card: + You will also very likely need the Wireless Tools in order to + configure your card: - . + . - It is recommended that you compile this driver as a module (M) - rather than built-in (Y). This driver requires firmware at device - initialization time, and when built-in this typically happens - before the filesystem is accessible (hence firmware will be - unavailable and initialization will fail). If you do choose to build - this driver into your kernel image, you can avoid this problem by - including the firmware and a firmware loader in an initramfs. + It is recommended that you compile this driver as a module (M) + rather than built-in (Y). This driver requires firmware at device + initialization time, and when built-in this typically happens + before the filesystem is accessible (hence firmware will be + unavailable and initialization will fail). If you do choose to build + this driver into your kernel image, you can avoid this problem by + including the firmware and a firmware loader in an initramfs. config IPW2200_MONITOR - bool "Enable promiscuous mode" - depends on IPW2200 - ---help--- + bool "Enable promiscuous mode" + depends on IPW2200 + ---help--- Enables promiscuous/monitor mode support for the ipw2200 driver. - With this feature compiled into the driver, you can switch to + With this feature compiled into the driver, you can switch to promiscuous mode via the Wireless Tool's Monitor mode. While in this mode, no packets can be sent. @@ -118,28 +118,28 @@ config IPW2200_PROMISCUOUS depends on IPW2200_MONITOR select IPW2200_RADIOTAP ---help--- - Enables the creation of a second interface prefixed 'rtap'. - This second interface will provide every received in radiotap + Enables the creation of a second interface prefixed 'rtap'. + This second interface will provide every received in radiotap format. - This is useful for performing wireless network analysis while - maintaining an active association. + This is useful for performing wireless network analysis while + maintaining an active association. - Example usage: + Example usage: - % modprobe ipw2200 rtap_iface=1 - % ifconfig rtap0 up - % tethereal -i rtap0 + % modprobe ipw2200 rtap_iface=1 + % ifconfig rtap0 up + % tethereal -i rtap0 - If you do not specify 'rtap_iface=1' as a module parameter then - the rtap interface will not be created and you will need to turn - it on via sysfs: - - % echo 1 > /sys/bus/pci/drivers/ipw2200/*/rtap_iface + If you do not specify 'rtap_iface=1' as a module parameter then + the rtap interface will not be created and you will need to turn + it on via sysfs: + + % echo 1 > /sys/bus/pci/drivers/ipw2200/*/rtap_iface config IPW2200_QOS - bool "Enable QoS support" - depends on IPW2200 + bool "Enable QoS support" + depends on IPW2200 config IPW2200_DEBUG bool "Enable full debugging output in IPW2200 module." diff --git a/drivers/net/wireless/intel/iwlegacy/Kconfig b/drivers/net/wireless/intel/iwlegacy/Kconfig index e329fd7b09c0..100f55858b13 100644 --- a/drivers/net/wireless/intel/iwlegacy/Kconfig +++ b/drivers/net/wireless/intel/iwlegacy/Kconfig @@ -91,9 +91,9 @@ config IWLEGACY_DEBUG any problems you may encounter. config IWLEGACY_DEBUGFS - bool "iwlegacy (iwl 3945/4965) debugfs support" - depends on IWLEGACY && MAC80211_DEBUGFS - ---help--- + bool "iwlegacy (iwl 3945/4965) debugfs support" + depends on IWLEGACY && MAC80211_DEBUGFS + ---help--- Enable creation of debugfs files for the iwlegacy drivers. This is a low-impact option that allows getting insight into the driver's state at runtime. diff --git a/drivers/net/wireless/intel/iwlwifi/Kconfig b/drivers/net/wireless/intel/iwlwifi/Kconfig index 7dbc0d38bb3b..091d621ad25f 100644 --- a/drivers/net/wireless/intel/iwlwifi/Kconfig +++ b/drivers/net/wireless/intel/iwlwifi/Kconfig @@ -119,9 +119,9 @@ config IWLWIFI_DEBUG any problems you may encounter. config IWLWIFI_DEBUGFS - bool "iwlwifi debugfs support" - depends on MAC80211_DEBUGFS - ---help--- + bool "iwlwifi debugfs support" + depends on MAC80211_DEBUGFS + ---help--- Enable creation of debugfs files for the iwlwifi drivers. This is a low-impact option that allows getting insight into the driver's state at runtime. diff --git a/drivers/net/wireless/ralink/rt2x00/Kconfig b/drivers/net/wireless/ralink/rt2x00/Kconfig index 858f8aa3e616..f8a9244ce012 100644 --- a/drivers/net/wireless/ralink/rt2x00/Kconfig +++ b/drivers/net/wireless/ralink/rt2x00/Kconfig @@ -98,17 +98,17 @@ config RT2800PCI_RT53XX bool "rt2800pci - Include support for rt53xx devices (EXPERIMENTAL)" default y ---help--- - This adds support for rt53xx wireless chipset family to the - rt2800pci driver. - Supported chips: RT5390 + This adds support for rt53xx wireless chipset family to the + rt2800pci driver. + Supported chips: RT5390 config RT2800PCI_RT3290 bool "rt2800pci - Include support for rt3290 devices (EXPERIMENTAL)" default y ---help--- - This adds support for rt3290 wireless chipset family to the - rt2800pci driver. - Supported chips: RT3290 + This adds support for rt3290 wireless chipset family to the + rt2800pci driver. + Supported chips: RT3290 endif config RT2500USB @@ -176,16 +176,16 @@ config RT2800USB_RT3573 config RT2800USB_RT53XX bool "rt2800usb - Include support for rt53xx devices (EXPERIMENTAL)" ---help--- - This adds support for rt53xx wireless chipset family to the - rt2800usb driver. - Supported chips: RT5370 + This adds support for rt53xx wireless chipset family to the + rt2800usb driver. + Supported chips: RT5370 config RT2800USB_RT55XX bool "rt2800usb - Include support for rt55xx devices (EXPERIMENTAL)" ---help--- - This adds support for rt55xx wireless chipset family to the - rt2800usb driver. - Supported chips: RT5572 + This adds support for rt55xx wireless chipset family to the + rt2800usb driver. + Supported chips: RT5572 config RT2800USB_UNKNOWN bool "rt2800usb - Include support for unknown (USB) devices" From 3e8b9bfa110896f95d602d8c98d5f9d67e41d78c Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 23 Sep 2019 22:04:58 -0700 Subject: [PATCH 1304/2880] net/sched: cbs: Fix not adding cbs instance to list When removing a cbs instance when offloading is enabled, the crash below can be observed. The problem happens because that when offloading is enabled, the cbs instance is not added to the list. Also, the current code doesn't handle correctly the case when offload is disabled without removing the qdisc: if the link speed changes the credit calculations will be wrong. When we create the cbs instance with offloading enabled, it's not added to the notification list, when later we disable offloading, it's not in the list, so link speed changes will not affect it. The solution for both issues is the same, add the cbs instance being created unconditionally to the global list, even if the link state notification isn't useful "right now". Crash log: [518758.189866] BUG: kernel NULL pointer dereference, address: 0000000000000000 [518758.189870] #PF: supervisor read access in kernel mode [518758.189871] #PF: error_code(0x0000) - not-present page [518758.189872] PGD 0 P4D 0 [518758.189874] Oops: 0000 [#1] SMP PTI [518758.189876] CPU: 3 PID: 4825 Comm: tc Not tainted 5.2.9 #1 [518758.189877] Hardware name: Gigabyte Technology Co., Ltd. Z390 AORUS ULTRA/Z390 AORUS ULTRA-CF, BIOS F7 03/14/2019 [518758.189881] RIP: 0010:__list_del_entry_valid+0x29/0xa0 [518758.189883] Code: 90 48 b8 00 01 00 00 00 00 ad de 55 48 8b 17 4c 8b 47 08 48 89 e5 48 39 c2 74 27 48 b8 00 02 00 00 00 00 ad de 49 39 c0 74 2d <49> 8b 30 48 39 fe 75 3d 48 8b 52 08 48 39 f2 75 4c b8 01 00 00 00 [518758.189885] RSP: 0018:ffffa27e43903990 EFLAGS: 00010207 [518758.189887] RAX: dead000000000200 RBX: ffff8bce69f0f000 RCX: 0000000000000000 [518758.189888] RDX: 0000000000000000 RSI: ffff8bce69f0f064 RDI: ffff8bce69f0f1e0 [518758.189890] RBP: ffffa27e43903990 R08: 0000000000000000 R09: ffff8bce69e788c0 [518758.189891] R10: ffff8bce62acd400 R11: 00000000000003cb R12: ffff8bce69e78000 [518758.189892] R13: ffff8bce69f0f140 R14: 0000000000000000 R15: 0000000000000000 [518758.189894] FS: 00007fa1572c8f80(0000) GS:ffff8bce6e0c0000(0000) knlGS:0000000000000000 [518758.189895] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [518758.189896] CR2: 0000000000000000 CR3: 000000040a398006 CR4: 00000000003606e0 [518758.189898] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [518758.189899] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [518758.189900] Call Trace: [518758.189904] cbs_destroy+0x32/0xa0 [sch_cbs] [518758.189906] qdisc_destroy+0x45/0x120 [518758.189907] qdisc_put+0x25/0x30 [518758.189908] qdisc_graft+0x2c1/0x450 [518758.189910] tc_get_qdisc+0x1c8/0x310 [518758.189912] ? get_page_from_freelist+0x91a/0xcb0 [518758.189914] rtnetlink_rcv_msg+0x293/0x360 [518758.189916] ? kmem_cache_alloc_node_trace+0x178/0x260 [518758.189918] ? __kmalloc_node_track_caller+0x38/0x50 [518758.189920] ? rtnl_calcit.isra.0+0xf0/0xf0 [518758.189922] netlink_rcv_skb+0x48/0x110 [518758.189923] rtnetlink_rcv+0x10/0x20 [518758.189925] netlink_unicast+0x15b/0x1d0 [518758.189926] netlink_sendmsg+0x1ea/0x380 [518758.189929] sock_sendmsg+0x2f/0x40 [518758.189930] ___sys_sendmsg+0x295/0x2f0 [518758.189932] ? ___sys_recvmsg+0x151/0x1e0 [518758.189933] ? do_wp_page+0x7e/0x450 [518758.189935] __sys_sendmsg+0x48/0x80 [518758.189937] __x64_sys_sendmsg+0x1a/0x20 [518758.189939] do_syscall_64+0x53/0x1f0 [518758.189941] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [518758.189942] RIP: 0033:0x7fa15755169a [518758.189944] Code: 48 c7 c0 ff ff ff ff eb be 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 18 b8 2e 00 00 00 c5 fc 77 0f 05 <48> 3d 00 f0 ff ff 77 5e c3 0f 1f 44 00 00 48 83 ec 28 89 54 24 1c [518758.189946] RSP: 002b:00007ffda58b60b8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [518758.189948] RAX: ffffffffffffffda RBX: 000055e4b836d9a0 RCX: 00007fa15755169a [518758.189949] RDX: 0000000000000000 RSI: 00007ffda58b6128 RDI: 0000000000000003 [518758.189951] RBP: 00007ffda58b6190 R08: 0000000000000001 R09: 000055e4b9d848a0 [518758.189952] R10: 0000000000000000 R11: 0000000000000246 R12: 000000005d654b49 [518758.189953] R13: 0000000000000000 R14: 00007ffda58b6230 R15: 00007ffda58b6210 [518758.189955] Modules linked in: sch_cbs sch_etf sch_mqprio netlink_diag unix_diag e1000e igb intel_pch_thermal thermal video backlight pcc_cpufreq [518758.189960] CR2: 0000000000000000 [518758.189961] ---[ end trace 6a13f7aaf5376019 ]--- [518758.189963] RIP: 0010:__list_del_entry_valid+0x29/0xa0 [518758.189964] Code: 90 48 b8 00 01 00 00 00 00 ad de 55 48 8b 17 4c 8b 47 08 48 89 e5 48 39 c2 74 27 48 b8 00 02 00 00 00 00 ad de 49 39 c0 74 2d <49> 8b 30 48 39 fe 75 3d 48 8b 52 08 48 39 f2 75 4c b8 01 00 00 00 [518758.189967] RSP: 0018:ffffa27e43903990 EFLAGS: 00010207 [518758.189968] RAX: dead000000000200 RBX: ffff8bce69f0f000 RCX: 0000000000000000 [518758.189969] RDX: 0000000000000000 RSI: ffff8bce69f0f064 RDI: ffff8bce69f0f1e0 [518758.189971] RBP: ffffa27e43903990 R08: 0000000000000000 R09: ffff8bce69e788c0 [518758.189972] R10: ffff8bce62acd400 R11: 00000000000003cb R12: ffff8bce69e78000 [518758.189973] R13: ffff8bce69f0f140 R14: 0000000000000000 R15: 0000000000000000 [518758.189975] FS: 00007fa1572c8f80(0000) GS:ffff8bce6e0c0000(0000) knlGS:0000000000000000 [518758.189976] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [518758.189977] CR2: 0000000000000000 CR3: 000000040a398006 CR4: 00000000003606e0 [518758.189979] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [518758.189980] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: e0a7683d30e9 ("net/sched: cbs: fix port_rate miscalculation") Signed-off-by: Vinicius Costa Gomes Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_cbs.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 93b58fde99b7..1bef152c5721 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -392,7 +392,6 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory"); @@ -404,6 +403,10 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, if (!q->qdisc) return -ENOMEM; + spin_lock(&cbs_list_lock); + list_add(&q->cbs_list, &cbs_list); + spin_unlock(&cbs_list_lock); + qdisc_hash_add(q->qdisc, false); q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); @@ -413,17 +416,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); - err = cbs_change(sch, opt, extack); - if (err) - return err; - - if (!q->offload) { - spin_lock(&cbs_list_lock); - list_add(&q->cbs_list, &cbs_list); - spin_unlock(&cbs_list_lock); - } - - return 0; + return cbs_change(sch, opt, extack); } static void cbs_destroy(struct Qdisc *sch) @@ -431,15 +424,18 @@ static void cbs_destroy(struct Qdisc *sch) struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - spin_lock(&cbs_list_lock); - list_del(&q->cbs_list); - spin_unlock(&cbs_list_lock); + /* Nothing to do if we couldn't create the underlying qdisc */ + if (!q->qdisc) + return; qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); - if (q->qdisc) - qdisc_put(q->qdisc); + spin_lock(&cbs_list_lock); + list_del(&q->cbs_list); + spin_unlock(&cbs_list_lock); + + qdisc_put(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) From adecda5bee0a05c11f1a4a4b16b01d11a832fd43 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 24 Sep 2019 11:09:37 +0200 Subject: [PATCH 1305/2880] net: print proper warning on dst underflow Proper warnings with stack traces make it much easier to figure out what's doing the double free and create more meaningful bug reports from users. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/core/dst.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dst.c b/net/core/dst.c index 1325316d9eab..193af526e908 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -172,7 +172,7 @@ void dst_release(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - if (unlikely(newrefcnt < 0)) + if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt) @@ -187,7 +187,7 @@ void dst_release_immediate(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - if (unlikely(newrefcnt < 0)) + if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt) From c3ca78e2174413c136d62ebdf8039580fe72b504 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 25 Sep 2019 00:32:13 -0500 Subject: [PATCH 1306/2880] smb3: pass mode bits into create calls We need to populate an ACL (security descriptor open context) on file and directory correct. This patch passes in the mode. Followon patch will build the open context and the security descriptor (from the mode) that goes in the open context. Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/cifsglob.h | 6 ++++-- fs/cifs/cifsproto.h | 3 ++- fs/cifs/cifssmb.c | 3 ++- fs/cifs/inode.c | 3 ++- fs/cifs/smb2inode.c | 34 +++++++++++++++++++--------------- fs/cifs/smb2pdu.c | 20 ++++++++++++++++++++ fs/cifs/smb2proto.h | 3 ++- 7 files changed, 51 insertions(+), 21 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 54e204589cb9..2e960e1049db 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -331,8 +331,9 @@ struct smb_version_operations { umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); - int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, - struct cifs_sb_info *); + int (*mkdir)(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *sb); /* set info on created directory */ void (*mkdir_setinfo)(struct inode *, const char *, struct cifs_sb_info *, struct cifs_tcon *, @@ -1209,6 +1210,7 @@ struct cifs_search_info { bool smallBuf:1; /* so we know which buf_release function to call */ }; +#define ACL_NO_MODE -1 struct cifs_open_parms { struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 99b1b1ef558c..e53e9f62b87b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -372,7 +372,8 @@ extern int CIFSSMBUnixSetPathInfo(const unsigned int xid, const struct nls_table *nls_codepage, int remap); -extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, +extern int CIFSSMBMkDir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index dbee2132e419..4f554f019a98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1078,7 +1078,8 @@ RmDirRetry: } int -CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +CIFSSMBMkDir(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { int rc = 0; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 26cdfbf1e164..3bae2e53f0b8 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1622,13 +1622,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) } /* BB add setting the equivalent of mode via CreateX w/ACLs */ - rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb); + rc = server->ops->mkdir(xid, inode, mode, tcon, full_path, cifs_sb); if (rc) { cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc); d_drop(direntry); goto mkdir_out; } + /* TODO: skip this for smb2/smb3 */ rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon, xid); mkdir_out: diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index d2a3fb7e5c8d..4121ac1163ca 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -51,7 +51,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, - __u32 create_options, void *ptr, int command, + __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile) { int rc; @@ -103,6 +103,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; oparms.fid = &fid; oparms.reconnect = false; + oparms.mode = mode; memset(&open_iov, 0, sizeof(open_iov)); rqst[num_rqst].rq_iov = open_iov; @@ -478,7 +479,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - smb2_data, SMB2_OP_QUERY_INFO, cfile); + ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { *symlink = true; create_options |= OPEN_REPARSE_POINT; @@ -486,8 +487,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* Failed on a symbolic link - query a reparse point info */ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, smb2_data, - SMB2_OP_QUERY_INFO, NULL); + create_options, ACL_NO_MODE, + smb2_data, SMB2_OP_QUERY_INFO, NULL); } if (rc) goto out; @@ -499,12 +500,14 @@ out: } int -smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR, NULL); + CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, + NULL); } void @@ -525,8 +528,8 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_get_writable_path(tcon, name, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO, - cfile); + CREATE_NOT_FILE, ACL_NO_MODE, + &data, SMB2_OP_SET_INFO, cfile); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -536,7 +539,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE, + CREATE_NOT_FILE, ACL_NO_MODE, NULL, SMB2_OP_RMDIR, NULL); } @@ -546,7 +549,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - NULL, SMB2_OP_DELETE, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL); } static int @@ -564,7 +567,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, goto smb2_rename_path; } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, smb2_to_name, command, cfile); + FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, + command, cfile); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -601,8 +605,8 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, __le64 eof = cpu_to_le64(size); return smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, 0, &eof, - SMB2_OP_SET_EOF, NULL); + FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, + &eof, SMB2_OP_SET_EOF, NULL); } int @@ -623,8 +627,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, return PTR_ERR(tlink); rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path, - FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, - SMB2_OP_SET_INFO, NULL); + FILE_WRITE_ATTRIBUTES, FILE_OPEN, + 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ea08e6159481..85f9d614d968 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -751,6 +751,8 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) unsigned int num = *num_iovec; iov[num].iov_base = create_posix_buf(mode); + if (mode == -1) + cifs_dbg(VFS, "illegal mode\n"); /* BB REMOVEME */ if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -2417,6 +2419,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, /* File attributes ignored on open (used in create though) */ req->FileAttributes = cpu_to_le32(file_attributes); req->ShareAccess = FILE_SHARE_ALL_LE; + req->CreateDisposition = cpu_to_le32(oparms->disposition); req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK); req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); @@ -2518,6 +2521,23 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, return rc; } + /* TODO: add handling for the mode on create */ + if (oparms->disposition == FILE_CREATE) + cifs_dbg(VFS, "mode is 0x%x\n", oparms->mode); /* BB REMOVEME */ + + if ((oparms->disposition == FILE_CREATE) && (oparms->mode != -1)) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + /* rc = add_sd_context(iov, &n_iov, oparms->mode); */ + if (rc) + return rc; + } + if (n_iov > 2) { struct create_context *ccontext = (struct create_context *)iov[n_iov-1].iov_base; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 67a91b11fd59..da3a6d580808 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -84,7 +84,8 @@ extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); -extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, +extern int smb2_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, struct cifs_sb_info *cifs_sb, From 25d41e4aadb0788b4fae8a8fca90f437b9ebd727 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2019 16:02:07 -0700 Subject: [PATCH 1307/2880] iocost: better trace vrate changes vrate_adj tracepoint traces vrate changes; however, it does so only when busy_level is non-zero. busy_level turning to zero can sometimes be as interesting an event. This patch also enables vrate_adj tracepoint on other vrate related events - busy_level changes and non-zero nr_lagging. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 3b39deb8b9f8..32d4d6d86a99 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1343,7 +1343,7 @@ static void ioc_timer_fn(struct timer_list *timer) u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; - int i; + int prev_busy_level, i; /* how were the latencies during the period? */ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); @@ -1531,6 +1531,7 @@ skip_surplus_transfers: * and experiencing shortages but not surpluses, we're too stingy * and should increase vtime rate. */ + prev_busy_level = ioc->busy_level; if (rq_wait_pct > RQ_WAIT_BUSY_PCT || missed_ppm[READ] > ppm_rthr || missed_ppm[WRITE] > ppm_wthr) { @@ -1592,6 +1593,10 @@ skip_surplus_transfers: atomic64_set(&ioc->vtime_rate, vrate); ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); + } else if (ioc->busy_level != prev_busy_level || nr_lagging) { + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + &missed_ppm, rq_wait_pct, nr_lagging, + nr_shortages, nr_surpluses); } ioc_refresh_params(ioc, false); From 7cd806a9a953f234b9865c30028f47fd738ce375 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2019 16:03:09 -0700 Subject: [PATCH 1308/2880] iocost: improve nr_lagging handling Some IOs may span multiple periods. As latencies are collected on completion, the inbetween periods won't register them and may incorrectly decide to increase vrate. nr_lagging tracks these IOs to avoid those situations. Currently, whenever there are IOs which are spanning from the previous period, busy_level is reset to 0 if negative thus suppressing vrate increase. This has the following two problems. * When latency target percentiles aren't set, vrate adjustment should only be governed by queue depth depletion; however, the current code keeps nr_lagging active which pulls in latency results and can keep down vrate unexpectedly. * When lagging condition is detected, it resets the entire negative busy_level. This turned out to be way too aggressive on some devices which sometimes experience extended latencies on a small subset of commands. In addition, a lagging IO will be accounted as latency target miss on completion anyway and resetting busy_level amplifies its impact unnecessarily. This patch fixes the above two problems by disabling nr_lagging counting when latency target percentiles aren't set and blocking vrate increases when there are lagging IOs while leaving busy_level as-is. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 32d4d6d86a99..10160de62e3b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1407,7 +1407,8 @@ static void ioc_timer_fn(struct timer_list *timer) * comparing vdone against period start. If lagging behind * IOs from past periods, don't increase vrate. */ - if (!atomic_read(&iocg_to_blkg(iocg)->use_delay) && + if ((ppm_rthr != MILLION || ppm_wthr != MILLION) && + !atomic_read(&iocg_to_blkg(iocg)->use_delay) && time_after64(vtime, vdone) && time_after64(vtime, now.vnow - MAX_LAGGING_PERIODS * period_vtime) && @@ -1537,21 +1538,23 @@ skip_surplus_transfers: missed_ppm[WRITE] > ppm_wthr) { ioc->busy_level = max(ioc->busy_level, 0); ioc->busy_level++; - } else if (nr_lagging) { - ioc->busy_level = max(ioc->busy_level, 0); - } else if (nr_shortages && !nr_surpluses && - rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && + } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { - ioc->busy_level = min(ioc->busy_level, 0); - ioc->busy_level--; + /* take action iff there is contention */ + if (nr_shortages && !nr_lagging) { + ioc->busy_level = min(ioc->busy_level, 0); + /* redistribute surpluses first */ + if (!nr_surpluses) + ioc->busy_level--; + } } else { ioc->busy_level = 0; } ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); - if (ioc->busy_level) { + if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { u64 vrate = atomic64_read(&ioc->vtime_rate); u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; From 7afcccafa59fb63b58f863a6c5e603a34625955b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2019 16:03:35 -0700 Subject: [PATCH 1309/2880] iocost: bump up default latency targets for hard disks The default hard disk param sets latency targets at 50ms. As the default target percentiles are zero, these don't directly regulate vrate; however, they're still used to calculate the period length - 100ms in this case. This is excessively low. A SATA drive with QD32 saturated with random IOs can easily reach avg completion latency of several hundred msecs. A period duration which is substantially lower than avg completion latency can lead to wildly fluctuating vrate. Let's bump up the default latency targets to 250ms so that the period duration is sufficiently long. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 10160de62e3b..2a3db80c1dce 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -529,8 +529,8 @@ struct iocg_wake_ctx { static const struct ioc_params autop[] = { [AUTOP_HDD] = { .qos = { - [QOS_RLAT] = 50000, /* 50ms */ - [QOS_WLAT] = 50000, + [QOS_RLAT] = 250000, /* 250ms */ + [QOS_WLAT] = 250000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, From ba56d8ce38c8252fff5b745db3899cf092578ede Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 23 Sep 2019 17:02:46 +0800 Subject: [PATCH 1310/2880] macsec: drop skb sk before calling gro_cells_receive Fei Liu reported a crash when doing netperf on a topo of macsec dev over veth: [ 448.919128] refcount_t: underflow; use-after-free. [ 449.090460] Call trace: [ 449.092895] refcount_sub_and_test+0xb4/0xc0 [ 449.097155] tcp_wfree+0x2c/0x150 [ 449.100460] ip_rcv+0x1d4/0x3a8 [ 449.103591] __netif_receive_skb_core+0x554/0xae0 [ 449.108282] __netif_receive_skb+0x28/0x78 [ 449.112366] netif_receive_skb_internal+0x54/0x100 [ 449.117144] napi_gro_complete+0x70/0xc0 [ 449.121054] napi_gro_flush+0x6c/0x90 [ 449.124703] napi_complete_done+0x50/0x130 [ 449.128788] gro_cell_poll+0x8c/0xa8 [ 449.132351] net_rx_action+0x16c/0x3f8 [ 449.136088] __do_softirq+0x128/0x320 The issue was caused by skb's true_size changed without its sk's sk_wmem_alloc increased in tcp/skb_gro_receive(). Later when the skb is being freed and the skb's truesize is subtracted from its sk's sk_wmem_alloc in tcp_wfree(), underflow occurs. macsec is calling gro_cells_receive() to receive a packet, which actually requires skb->sk to be NULL. However when macsec dev is over veth, it's possible the skb->sk is still set if the skb was not unshared or expanded from the peer veth. ip_rcv() is calling skb_orphan() to drop the skb's sk for tproxy, but it is too late for macsec's calling gro_cells_receive(). So fix it by dropping the skb's sk earlier on rx path of macsec. Fixes: 5491e7c6b1a9 ("macsec: enable GRO and RPS on macsec devices") Reported-by: Xiumei Mu Reported-by: Fei Liu Signed-off-by: Xin Long Signed-off-by: David S. Miller --- drivers/net/macsec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 8f46aa1ddec0..cb7637364b40 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1235,6 +1235,7 @@ deliver: macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); + skb_orphan(skb); ret = gro_cells_receive(&macsec->gro_cells, skb); if (ret == NET_RX_SUCCESS) count_rx(dev, skb->len); From 4f28bd956e081fc018fe9b41ffa31573f17bfb61 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 23 Sep 2019 11:59:15 +0200 Subject: [PATCH 1311/2880] net: stmmac: Fix page pool size The size of individual pages in the page pool in given by an order. The order is the binary logarithm of the number of pages that make up one of the pages in the pool. However, the driver currently passes the number of pages rather than the order, so it ends up wasting quite a bit of memory. Fix this by taking the binary logarithm and passing that in the order field. Fixes: 2af6106ae949 ("net: stmmac: Introducing support for Page Pool") Signed-off-by: Thierry Reding Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a6cb2aa60e64..d3232738fb25 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1557,13 +1557,15 @@ static int alloc_dma_rx_desc_resources(struct stmmac_priv *priv) for (queue = 0; queue < rx_count; queue++) { struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; struct page_pool_params pp_params = { 0 }; + unsigned int num_pages; rx_q->queue_index = queue; rx_q->priv_data = priv; pp_params.flags = PP_FLAG_DMA_MAP; pp_params.pool_size = DMA_RX_SIZE; - pp_params.order = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE); + num_pages = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE); + pp_params.order = ilog2(num_pages); pp_params.nid = dev_to_node(priv->device); pp_params.dev = priv->device; pp_params.dma_dir = DMA_FROM_DEVICE; From c1d419d00494487b8401b2ac38d7e8e2a4977d9e Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 23 Sep 2019 14:32:46 +0100 Subject: [PATCH 1312/2880] dt-bindings: net: ravb: Add support for r8a774b1 SoC Document RZ/G2N (R8A774B1) SoC bindings. Signed-off-by: Biju Das Reviewed-by: Sergei Shtylyov Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/renesas,ravb.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt index 7ad36213093e..5df4aa7f6811 100644 --- a/Documentation/devicetree/bindings/net/renesas,ravb.txt +++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt @@ -18,6 +18,7 @@ Required properties: R-Car Gen2 and RZ/G1 devices. - "renesas,etheravb-r8a774a1" for the R8A774A1 SoC. + - "renesas,etheravb-r8a774b1" for the R8A774B1 SoC. - "renesas,etheravb-r8a774c0" for the R8A774C0 SoC. - "renesas,etheravb-r8a7795" for the R8A7795 SoC. - "renesas,etheravb-r8a7796" for the R8A7796 SoC. From ea8564c865299815095bebeb4b25bef474218e4c Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 24 Sep 2019 19:11:52 +0800 Subject: [PATCH 1313/2880] openvswitch: change type of UPCALL_PID attribute to NLA_UNSPEC userspace openvswitch patch "(dpif-linux: Implement the API functions to allow multiple handler threads read upcall)" changes its type from U32 to UNSPEC, but leave the kernel unchanged and after kernel 6e237d099fac "(netlink: Relax attr validation for fixed length types)", this bug is exposed by the below warning [ 57.215841] netlink: 'ovs-vswitchd': attribute type 5 has an invalid length. Fixes: 5cd667b0a456 ("openvswitch: Allow each vport to have an array of 'port_id's") Signed-off-by: Li RongQing Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index dde9d762edee..f30e406fbec5 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2294,7 +2294,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, From ca7a03c4175366a92cee0ccc4fec0038c3266e26 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 24 Sep 2019 16:01:28 +0200 Subject: [PATCH 1314/2880] ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule Commit 7d9e5f422150 removed references from certain dsts, but accounting for this never translated down into the fib6 suppression code. This bug was triggered by WireGuard users who use wg-quick(8), which uses the "suppress-prefix" directive to ip-rule(8) for routing all of their internet traffic without routing loops. The test case added here causes the reference underflow by causing packets to evaluate a suppress rule. Fixes: 7d9e5f422150 ("ipv6: convert major tx path to use RT6_LOOKUP_F_DST_NOREF") Signed-off-by: Jason A. Donenfeld Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv6/fib6_rules.c | 3 ++- tools/testing/selftests/net/fib_tests.sh | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index d22b6c140f23..f9e8fe3ff0c5 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -287,7 +287,8 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg return false; suppress_route: - ip6_rt_put(rt); + if (!(arg->flags & FIB_LOOKUP_NOREF)) + ip6_rt_put(rt); return true; } diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index cba83a12da82..c4ba0ff4a53f 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -9,7 +9,7 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter" +TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter" VERBOSE=0 PAUSE_ON_FAIL=no @@ -616,6 +616,20 @@ fib_nexthop_test() cleanup } +fib_suppress_test() +{ + $IP link add dummy1 type dummy + $IP link set dummy1 up + $IP -6 route add default dev dummy1 + $IP -6 rule add table main suppress_prefixlength 0 + ping -f -c 1000 -W 1 1234::1 || true + $IP -6 rule del table main suppress_prefixlength 0 + $IP link del dummy1 + + # If we got here without crashing, we're good. + return 0 +} + ################################################################################ # Tests on route add and replace @@ -1593,6 +1607,7 @@ do fib_carrier_test|carrier) fib_carrier_test;; fib_rp_filter_test|rp_filter) fib_rp_filter_test;; fib_nexthop_test|nexthop) fib_nexthop_test;; + fib_suppress_test|suppress) fib_suppress_test;; ipv6_route_test|ipv6_rt) ipv6_route_test;; ipv4_route_test|ipv4_rt) ipv4_route_test;; ipv6_addr_metric) ipv6_addr_metric_test;; From bda521624e75c665c407b3d9cece6e7a28178cd8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Sep 2019 13:47:15 -0600 Subject: [PATCH 1315/2880] io_uring: make CQ ring wakeups be more efficient For batched IO, it's not uncommon for waiters to ask for more than 1 IO to complete before being woken up. This is a problem with wait_event() since tasks will get woken for every IO that completes, re-check condition, then go back to sleep. For batch counts on the order of what you do for high IOPS, that can result in 10s of extra wakeups for the waiting task. Add a private wake function that checks for the wake up count criteria being met before calling autoremove_wake_function(). Pavel reports that one test case he has runs 40% faster with proper batching of wakeups. Reported-by: Pavel Begunkov Tested-by: Pavel Begunkov Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 66 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9b84232e5cc4..c934f91c51e9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2768,6 +2768,38 @@ out: return submit; } +struct io_wait_queue { + struct wait_queue_entry wq; + struct io_ring_ctx *ctx; + unsigned to_wait; + unsigned nr_timeouts; +}; + +static inline bool io_should_wake(struct io_wait_queue *iowq) +{ + struct io_ring_ctx *ctx = iowq->ctx; + + /* + * Wake up if we have enough events, or if a timeout occured since we + * started waiting. For timeouts, we always want to return to userspace, + * regardless of event count. + */ + return io_cqring_events(ctx->rings) >= iowq->to_wait || + atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; +} + +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, + int wake_flags, void *key) +{ + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, + wq); + + if (!io_should_wake(iowq)) + return -1; + + return autoremove_wake_function(curr, mode, wake_flags, key); +} + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -2775,8 +2807,16 @@ out: static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { + struct io_wait_queue iowq = { + .wq = { + .private = current, + .func = io_wake_function, + .entry = LIST_HEAD_INIT(iowq.wq.entry), + }, + .ctx = ctx, + .to_wait = min_events, + }; struct io_rings *rings = ctx->rings; - unsigned nr_timeouts; int ret; if (io_cqring_events(rings) >= min_events) @@ -2795,15 +2835,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - nr_timeouts = atomic_read(&ctx->cq_timeouts); - /* - * Return if we have enough events, or if a timeout occured since - * we started waiting. For timeouts, we always want to return to - * userspace. - */ - ret = wait_event_interruptible(ctx->wait, - io_cqring_events(rings) >= min_events || - atomic_read(&ctx->cq_timeouts) != nr_timeouts); + ret = 0; + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + do { + prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, + TASK_INTERRUPTIBLE); + if (io_should_wake(&iowq)) + break; + schedule(); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + } while (1); + finish_wait(&ctx->wait, &iowq.wq); + restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; From a0f0037e908c656573d165aadbfae6c05fc4dd75 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 26 Sep 2019 08:54:03 +0800 Subject: [PATCH 1316/2880] KVM: LAPIC: Loosen filter for adaptive tuning of lapic_timer_advance_ns 5000 guest cycles delta is easy to encounter on desktop, per-vCPU lapic_timer_advance_ns always keeps at 1000ns initial value, let's loosen the filter a bit to let adaptive tuning make progress. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3a3a6854dcca..87b0fcc23ef8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -66,9 +66,10 @@ #define X2APIC_BROADCAST 0xFFFFFFFFul static bool lapic_timer_advance_dynamic __read_mostly; -#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 -#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000 -#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 +#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_NS_INIT 1000 +#define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -1504,8 +1505,8 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } - if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX)) - timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) + timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ -2302,7 +2303,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { - apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; lapic_timer_advance_dynamic = true; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; From a1a640b8c0cd8a2a7f84ab694f04bc64dc6988af Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 25 Sep 2019 11:17:14 -0700 Subject: [PATCH 1317/2880] kvm: x86: Fix a spurious -E2BIG in __do_cpuid_func Don't return -E2BIG from __do_cpuid_func when processing function 0BH or 1FH and the last interesting subleaf occupies the last allocated entry in the result array. Cc: Paolo Bonzini Fixes: 831bf664e9c1fc ("KVM: Refactor and simplify kvm_dev_ioctl_get_supported_cpuid") Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 63316036f85a..6bf4a261b308 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -618,16 +618,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, */ case 0x1f: case 0xb: { - int i, level_type; + int i; - /* read more entries until level_type is zero */ - for (i = 1; ; ++i) { + /* + * We filled in entry[0] for CPUID(EAX=, + * ECX=00H) above. If its level type (ECX[15:8]) is + * zero, then the leaf is unimplemented, and we're + * done. Otherwise, continue to populate entries + * until the level type (ECX[15:8]) of the previously + * added entry is zero. + */ + for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { if (*nent >= maxnent) goto out; - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; do_host_cpuid(&entry[i], function, i); ++*nent; } From 3ca94192278ca8de169d78c085396c424be123b3 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 18 Sep 2019 17:50:10 +0800 Subject: [PATCH 1318/2880] KVM: X86: Fix userspace set invalid CR4 Reported by syzkaller: WARNING: CPU: 0 PID: 6544 at /home/kernel/data/kvm/arch/x86/kvm//vmx/vmx.c:4689 handle_desc+0x37/0x40 [kvm_intel] CPU: 0 PID: 6544 Comm: a.out Tainted: G OE 5.3.0-rc4+ #4 RIP: 0010:handle_desc+0x37/0x40 [kvm_intel] Call Trace: vmx_handle_exit+0xbe/0x6b0 [kvm_intel] vcpu_enter_guest+0x4dc/0x18d0 [kvm] kvm_arch_vcpu_ioctl_run+0x407/0x660 [kvm] kvm_vcpu_ioctl+0x3ad/0x690 [kvm] do_vfs_ioctl+0xa2/0x690 ksys_ioctl+0x6d/0x80 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x74/0x720 entry_SYSCALL_64_after_hwframe+0x49/0xbe When CR4.UMIP is set, guest should have UMIP cpuid flag. Current kvm set_sregs function doesn't have such check when userspace inputs sregs values. SECONDARY_EXEC_DESC is enabled on writes to CR4.UMIP in vmx_set_cr4 though guest doesn't have UMIP cpuid flag. The testcast triggers handle_desc warning when executing ltr instruction since guest architectural CR4 doesn't set UMIP. This patch fixes it by adding valid CR4 and CPUID combination checking in __set_sregs. syzkaller source: https://syzkaller.appspot.com/x/repro.c?x=138efb99600000 Reported-by: syzbot+0f1819555fbdce992df9@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 58 +++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ed07d8d2caa..180c7e88577a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -885,34 +885,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); +static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (cr4 & CR4_RESERVED_BITS) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + return -EINVAL; + + return 0; +} + int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - if (cr4 & CR4_RESERVED_BITS) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + if (kvm_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -8714,10 +8722,6 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8736,7 +8740,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return -EINVAL; } - return 0; + return kvm_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) From 43561123ab3759eb6ff47693aec1a307af0aef83 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 25 Sep 2019 17:04:17 -0700 Subject: [PATCH 1319/2880] kvm: x86: Improve emulation of CPUID leaves 0BH and 1FH For these CPUID leaves, the EDX output is not dependent on the ECX input (i.e. the SIGNIFCANT_INDEX flag doesn't apply to EDX). Furthermore, the low byte of the ECX output is always identical to the low byte of the ECX input. KVM does not produce the correct ECX and EDX outputs for any undefined subleaves beyond the first. Special-case these CPUID leaves in kvm_cpuid, so that the ECX and EDX outputs are properly generated for all undefined subleaves. Fixes: 0771671749b59a ("KVM: Enhance guest cpuid management") Fixes: a87f2d3a6eadab ("KVM: x86: Add Intel CPUID.1F cpuid emulation support") Signed-off-by: Jim Mattson Reviewed-by: Marc Orr Reviewed-by: Peter Shier Reviewed-by: Jacob Xu Cc: Sean Christopherson Cc: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 81 +++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6bf4a261b308..a26a1ae03145 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -973,53 +973,64 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. + * If the basic or extended CPUID leaf requested is higher than the + * maximum supported basic or extended leaf, respectively, then it is + * out of range. */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) +static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function) { - struct kvm_cpuid_entry2 *maxlevel; + struct kvm_cpuid_entry2 *max; - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); + max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + return max && function <= max->eax; } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) { u32 function = *eax, index = *ecx; - struct kvm_cpuid_entry2 *best; - bool entry_found = true; + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *max; + bool found; - best = kvm_find_cpuid_entry(vcpu, function, index); - - if (!best) { - entry_found = false; - if (!check_limit) - goto out; - - best = check_cpuid_limit(vcpu, function, index); + entry = kvm_find_cpuid_entry(vcpu, function, index); + found = entry; + /* + * Intel CPUID semantics treats any query for an out-of-range + * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were + * requested. + */ + if (!entry && check_limit && !cpuid_function_in_range(vcpu, function)) { + max = kvm_find_cpuid_entry(vcpu, 0, 0); + if (max) { + function = max->eax; + entry = kvm_find_cpuid_entry(vcpu, function, index); + } } - -out: - if (best) { - *eax = best->eax; - *ebx = best->ebx; - *ecx = best->ecx; - *edx = best->edx; - } else + if (entry) { + *eax = entry->eax; + *ebx = entry->ebx; + *ecx = entry->ecx; + *edx = entry->edx; + } else { *eax = *ebx = *ecx = *edx = 0; - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); - return entry_found; + /* + * When leaf 0BH or 1FH is defined, CL is pass-through + * and EDX is always the x2APIC ID, even for undefined + * subleaves. Index 1 will exist iff the leaf is + * implemented, so we pass through CL iff leaf 1 + * exists. EDX can be copied from any existing index. + */ + if (function == 0xb || function == 0x1f) { + entry = kvm_find_cpuid_entry(vcpu, function, 1); + if (entry) { + *ecx = index & 0xff; + *edx = entry->edx; + } + } + } + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found); + return found; } EXPORT_SYMBOL_GPL(kvm_cpuid); From 5f41a37b151f6459e0b650a2f4d1d59b6c02d1ab Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 25 Sep 2019 17:04:18 -0700 Subject: [PATCH 1320/2880] kvm: x86: Use AMD CPUID semantics for AMD vCPUs When the guest CPUID information represents an AMD vCPU, return all zeroes for queries of undefined CPUID leaves, whether or not they are in range. Signed-off-by: Jim Mattson Fixes: bd22f5cfcfe8f6 ("KVM: move and fix substitue search for missing CPUID entries") Reviewed-by: Marc Orr Reviewed-by: Peter Shier Reviewed-by: Jacob Xu Cc: Sean Christopherson Cc: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a26a1ae03145..ce4a1b3cc3e8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -998,9 +998,11 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, /* * Intel CPUID semantics treats any query for an out-of-range * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were - * requested. + * requested. AMD CPUID semantics returns all zeroes for any + * undefined leaf, whether or not the leaf is in range. */ - if (!entry && check_limit && !cpuid_function_in_range(vcpu, function)) { + if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && + !cpuid_function_in_range(vcpu, function)) { max = kvm_find_cpuid_entry(vcpu, 0, 0); if (max) { function = max->eax; From 40bc47b08b6efc8b64fef77cb46974f634215425 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 24 Sep 2019 13:51:08 -0700 Subject: [PATCH 1321/2880] kvm: x86: Enumerate support for CLZERO instruction CLZERO is available to the guest if it is supported on the host. Therefore, enumerate support for the instruction in KVM_GET_SUPPORTED_CPUID whenever it is supported on the host. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ce4a1b3cc3e8..34d9f9f481a3 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -485,8 +485,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); + F(CLZERO) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | + F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | F(AMD_SSBD) | + F(VIRT_SSBD) | F(AMD_SSB_NO); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = From 504ce1954fba888936c9d13ccc1e3db9b8f613d5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 25 Sep 2019 23:37:21 +0200 Subject: [PATCH 1322/2880] KVM: x86: Expose XSAVEERPTR to the guest I was surprised to see that the guest reported `fxsave_leak' while the host did not. After digging deeper I noticed that the bits are simply masked out during enumeration. The XSAVEERPTR feature is actually a bug fix on AMD which means the kernel can disable a workaround. Pass XSAVEERPTR to the guest if available on the host. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 34d9f9f481a3..9c5029cf6f3f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -485,9 +485,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(CLZERO) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | - F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | F(AMD_SSBD) | - F(VIRT_SSBD) | F(AMD_SSB_NO); + F(CLZERO) | F(XSAVEERPTR) | + F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = From 715d14da670e353c8c827f97e61ba00969929e33 Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Wed, 25 Sep 2019 22:32:33 +0800 Subject: [PATCH 1323/2880] pwm: mediatek: Add MT7629 compatible string This adds pwm support for MT7629, and separate mt7629 compatible string from mt7622 Signed-off-by: Sam Shih Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 7782fadbc116..b94e0d09c300 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -297,6 +297,11 @@ static const struct pwm_mediatek_of_data mt7628_pwm_data = { .pwm45_fixup = true, }; +static const struct pwm_mediatek_of_data mt7629_pwm_data = { + .num_pwms = 1, + .pwm45_fixup = false, +}; + static const struct pwm_mediatek_of_data mt8516_pwm_data = { .num_pwms = 5, .pwm45_fixup = false, @@ -307,6 +312,7 @@ static const struct of_device_id pwm_mediatek_of_match[] = { { .compatible = "mediatek,mt7622-pwm", .data = &mt7622_pwm_data }, { .compatible = "mediatek,mt7623-pwm", .data = &mt7623_pwm_data }, { .compatible = "mediatek,mt7628-pwm", .data = &mt7628_pwm_data }, + { .compatible = "mediatek,mt7629-pwm", .data = &mt7629_pwm_data }, { .compatible = "mediatek,mt8516-pwm", .data = &mt8516_pwm_data }, { }, }; From 8f960106c150a9733f5d7408b975c0a687617961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 23 Sep 2019 10:49:35 +0200 Subject: [PATCH 1324/2880] MAINTAINERS: Add a selection of PWM related keywords to the PWM entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is just a small subset of the relevant functions, but should at least catch all new code as every consumer has to call pwm_apply_state() (or the legacy function pwm_config()) and every PWM provider has to implement pwm_ops. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..778b55ca4bb3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13086,6 +13086,7 @@ F: drivers/video/backlight/pwm_bl.c F: include/linux/pwm_backlight.h F: drivers/gpio/gpio-mvebu.c F: Documentation/devicetree/bindings/gpio/gpio-mvebu.txt +K: pwm_(config|apply_state|ops) PXA GPIO DRIVER M: Robert Jarzmik From 6f736909f0a4e04f9ab9744ec7b1f2f0b10c2db7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 23 Sep 2019 10:49:36 +0200 Subject: [PATCH 1325/2880] MAINTAINERS: Add patchwork link for PWM entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This instance collects patches and Thierry updates the patches' status there, so I consider it used and suitable to document it officially. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 778b55ca4bb3..f0ab9b2f595c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13078,6 +13078,7 @@ M: Thierry Reding L: linux-pwm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/thierry.reding/linux-pwm.git +Q: https://patchwork.ozlabs.org/project/linux-pwm/list/ F: Documentation/driver-api/pwm.rst F: Documentation/devicetree/bindings/pwm/ F: include/linux/pwm.h From da635e7abe3f4ec9a8270ca4f5ba946d1a43f678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 23 Sep 2019 10:49:37 +0200 Subject: [PATCH 1326/2880] MAINTAINERS: Add myself as reviewer for the PWM subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I spend some time in the nearer past reviewing PWM patches. Honor this by adding me as a reviewer. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f0ab9b2f595c..6bd99e075c40 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13075,6 +13075,7 @@ F: drivers/media/rc/pwm-ir-tx.c PWM SUBSYSTEM M: Thierry Reding +R: Uwe Kleine-König L: linux-pwm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/thierry.reding/linux-pwm.git From 39529a9948d8f67f39cb72bec914c1adab38562d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 25 Sep 2019 13:37:45 -0700 Subject: [PATCH 1327/2880] libbpf: Teach btf_dumper to emit stand-alone anonymous enum definitions BTF-to-C converter previously skipped anonymous enums in an assumption that those are embedded in struct's field definitions. This is not always the case and a lot of kernel constants are defined as part of anonymous enums. This change fixes the logic by eagerly marking all types as either referenced by any other type or not. This is enough to distinguish two classes of anonymous enums and emit previously omitted enum definitions. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190925203745.3173184-1-andriin@fb.com --- tools/lib/bpf/btf_dump.c | 93 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 84b0661db7f3..ede55fec3618 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -48,6 +48,8 @@ struct btf_dump_type_aux_state { __u8 fwd_emitted: 1; /* whether unique non-duplicate name was already assigned */ __u8 name_resolved: 1; + /* whether type is referenced from any other type */ + __u8 referenced: 1; }; struct btf_dump { @@ -173,6 +175,7 @@ void btf_dump__free(struct btf_dump *d) free(d); } +static int btf_dump_mark_referenced(struct btf_dump *d); static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr); static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id); @@ -213,6 +216,11 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id) /* VOID is special */ d->type_states[0].order_state = ORDERED; d->type_states[0].emit_state = EMITTED; + + /* eagerly determine referenced types for anon enums */ + err = btf_dump_mark_referenced(d); + if (err) + return err; } d->emit_queue_cnt = 0; @@ -226,6 +234,79 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id) return 0; } +/* + * Mark all types that are referenced from any other type. This is used to + * determine top-level anonymous enums that need to be emitted as an + * independent type declarations. + * Anonymous enums come in two flavors: either embedded in a struct's field + * definition, in which case they have to be declared inline as part of field + * type declaration; or as a top-level anonymous enum, typically used for + * declaring global constants. It's impossible to distinguish between two + * without knowning whether given enum type was referenced from other type: + * top-level anonymous enum won't be referenced by anything, while embedded + * one will. + */ +static int btf_dump_mark_referenced(struct btf_dump *d) +{ + int i, j, n = btf__get_nr_types(d->btf); + const struct btf_type *t; + __u16 vlen; + + for (i = 1; i <= n; i++) { + t = btf__type_by_id(d->btf, i); + vlen = btf_vlen(t); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + break; + + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + d->type_states[t->type].referenced = 1; + break; + + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + + d->type_states[a->index_type].referenced = 1; + d->type_states[a->type].referenced = 1; + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + + for (j = 0; j < vlen; j++, m++) + d->type_states[m->type].referenced = 1; + break; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + + for (j = 0; j < vlen; j++, p++) + d->type_states[p->type].referenced = 1; + break; + } + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *v = btf_var_secinfos(t); + + for (j = 0; j < vlen; j++, v++) + d->type_states[v->type].referenced = 1; + break; + } + default: + return -EINVAL; + } + } + return 0; +} static int btf_dump_add_emit_queue_id(struct btf_dump *d, __u32 id) { __u32 *new_queue; @@ -395,7 +476,12 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) } case BTF_KIND_ENUM: case BTF_KIND_FWD: - if (t->name_off != 0) { + /* + * non-anonymous or non-referenced enums are top-level + * declarations and should be emitted. Same logic can be + * applied to FWDs, it won't hurt anyways. + */ + if (t->name_off != 0 || !tstate->referenced) { err = btf_dump_add_emit_queue_id(d, id); if (err) return err; @@ -536,11 +622,6 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) t = btf__type_by_id(d->btf, id); kind = btf_kind(t); - if (top_level_def && t->name_off == 0) { - pr_warning("unexpected nameless definition, id:[%u]\n", id); - return; - } - if (tstate->emit_state == EMITTING) { if (tstate->fwd_emitted) return; From a3bc18a48e2e678efe62f1f9989902f9cd19e0ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 26 Sep 2019 15:21:18 +0100 Subject: [PATCH 1328/2880] jffs2: Fix mounting under new mount API The mounting of jffs2 is broken due to the changes from the new mount API because it specifies a "source" operation, but then doesn't actually process it. But because it specified it, it doesn't return -ENOPARAM and the caller doesn't process it either and the source gets lost. Fix this by simply removing the source parameter from jffs2 and letting the VFS deal with it in the default manner. To test it, enable CONFIG_MTD_MTDRAM and allow the default size and erase block size parameters, then try and mount the /dev/mtdblock file that that creates as jffs2. No need to initialise it. Fixes: ec10a24f10c8 ("vfs: Convert jffs2 to use the new mount API") Reported-by: Al Viro Signed-off-by: David Howells cc: David Woodhouse cc: Richard Weinberger cc: linux-mtd@lists.infradead.org Signed-off-by: Al Viro --- fs/jffs2/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index cbe70637c117..0e6406c4f362 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -163,13 +163,11 @@ static const struct export_operations jffs2_export_ops = { * Opt_rp_size: size of reserved pool in KiB */ enum { - Opt_source, Opt_override_compr, Opt_rp_size, }; static const struct fs_parameter_spec jffs2_param_specs[] = { - fsparam_string ("source", Opt_source), fsparam_enum ("compr", Opt_override_compr), fsparam_u32 ("rp_size", Opt_rp_size), {} From e3439af4a339acd7fddbd6d59b8ecefaac07a611 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 25 Sep 2019 10:38:35 +0100 Subject: [PATCH 1329/2880] bpf: Clean up indentation issue in BTF kflag processing There is a statement that is indented one level too deeply, remove the extraneous tab. Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190925093835.19515-1-colin.king@canonical.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 722d38e543e9..29c7c06c6bd6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2332,7 +2332,7 @@ static int btf_enum_check_kflag_member(struct btf_verifier_env *env, if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); - return -EINVAL; + return -EINVAL; } nr_bits = int_bitsize; From 752c938a5c14b8cbf0ed3ffbfa637fb166255c3f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 14:06:24 +0300 Subject: [PATCH 1330/2880] ASoC: topology: Fix a signedness bug in soc_tplg_dapm_widget_create() The "template.id" variable is an enum and in this context GCC will treat it as an unsigned int so it can never be less than zero. Fixes: 8a9782346dcc ("ASoC: topology: Add topology core") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20190925110624.GR3264@mwanda Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index b8690715abb5..c25939c5611b 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1588,7 +1588,7 @@ static int soc_tplg_dapm_widget_create(struct soc_tplg *tplg, /* map user to kernel widget ID */ template.id = get_widget_id(le32_to_cpu(w->id)); - if (template.id < 0) + if ((int)template.id < 0) return template.id; /* strings are allocated here, but used and freed by the widget */ From 89f403541325181748b491fd96118e68292f47e1 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:28 -0700 Subject: [PATCH 1331/2880] xen/events: remove unlikely() from WARN() condition "unlikely(WARN(x))" is excessive. WARN() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-4-efremov@linux.com Signed-off-by: Denis Efremov Cc: Boris Ostrovsky Cc: Joe Perches Reviewed-by: Juergen Gross Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/xen/events/events_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 2e8570c09789..6c8843968a52 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -247,7 +247,7 @@ static void xen_irq_info_cleanup(struct irq_info *info) */ unsigned int evtchn_from_irq(unsigned irq) { - if (unlikely(WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq))) + if (WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq)) return 0; return info_for_irq(irq)->evtchn; From 7159d54418e0a1a3df91e74501363a1c05379517 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:31 -0700 Subject: [PATCH 1332/2880] fs: remove unlikely() from WARN_ON() condition "unlikely(WARN_ON(x))" is excessive. WARN_ON() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-5-efremov@linux.com Signed-off-by: Denis Efremov Cc: Alexander Viro Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index c60cd22cc052..b62f5c0923a8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -776,7 +776,7 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); - if (unlikely(WARN_ON(!f->f_op))) { + if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } From 77c0e745bd11fc1ccc4690409eca92ea07200141 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:34 -0700 Subject: [PATCH 1333/2880] wimax/i2400m: remove unlikely() from WARN*() condition "unlikely(WARN_ON(x))" is excessive. WARN_ON() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-6-efremov@linux.com Signed-off-by: Denis Efremov Cc: Inaky Perez-Gonzalez Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/wimax/i2400m/tx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wimax/i2400m/tx.c b/drivers/net/wimax/i2400m/tx.c index ebd64e083726..1255302e251e 100644 --- a/drivers/net/wimax/i2400m/tx.c +++ b/drivers/net/wimax/i2400m/tx.c @@ -654,8 +654,7 @@ void i2400m_tx_close(struct i2400m *i2400m) padding = aligned_size - tx_msg_moved->size; if (padding > 0) { pad_buf = i2400m_tx_fifo_push(i2400m, padding, 0, 0); - if (unlikely(WARN_ON(pad_buf == NULL - || pad_buf == TAIL_FULL))) { + if (WARN_ON(pad_buf == NULL || pad_buf == TAIL_FULL)) { /* This should not happen -- append should verify * there is always space left at least to append * tx_block_size */ From 14ed8688074ad7ba62d460ab87da841fa5407285 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:37 -0700 Subject: [PATCH 1334/2880] xfs: remove unlikely() from WARN_ON() condition "unlikely(WARN_ON(x))" is excessive. WARN_ON() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-7-efremov@linux.com Signed-off-by: Denis Efremov Reviewed-by: Darrick J. Wong Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/xfs_buf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 120ef99d09e8..21c243622a79 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2097,7 +2097,7 @@ xfs_verify_magic( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; } @@ -2115,7 +2115,7 @@ xfs_verify_magic16( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; } From 7b0b69259433fc1758408a899224db4fcc41b865 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:40 -0700 Subject: [PATCH 1335/2880] IB/hfi1: remove unlikely() from IS_ERR*() condition "unlikely(IS_ERR_OR_NULL(x))" is excessive. IS_ERR_OR_NULL() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-8-efremov@linux.com Signed-off-by: Denis Efremov Cc: Mike Marciniszyn Cc: Joe Perches Acked-by: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/hfi1/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 9f53f63b1453..7bff0a1e713d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1041,7 +1041,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (cb) iowait_pio_inc(&priv->s_iowait); pbuf = sc_buffer_alloc(sc, plen, cb, qp); - if (unlikely(IS_ERR_OR_NULL(pbuf))) { + if (IS_ERR_OR_NULL(pbuf)) { if (cb) verbs_pio_complete(qp, 0); if (IS_ERR(pbuf)) { From cc22c800e15b03c87f0e97400f75eba998e75c6a Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:43 -0700 Subject: [PATCH 1336/2880] ntfs: remove (un)?likely() from IS_ERR() conditions "likely(!IS_ERR(x))" is excessive. IS_ERR() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-11-efremov@linux.com Signed-off-by: Denis Efremov Cc: Anton Altaparmakov Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/mft.c | 12 ++++++------ fs/ntfs/namei.c | 2 +- fs/ntfs/runlist.c | 2 +- fs/ntfs/super.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 20c841a906f2..3aac5c917afe 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -71,7 +71,7 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) } /* Read, map, and pin the page. */ page = ntfs_map_page(mft_vi->i_mapping, index); - if (likely(!IS_ERR(page))) { + if (!IS_ERR(page)) { /* Catch multi sector transfer fixup errors. */ if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + ofs)))) { @@ -154,7 +154,7 @@ MFT_RECORD *map_mft_record(ntfs_inode *ni) mutex_lock(&ni->mrec_lock); m = map_mft_record_page(ni); - if (likely(!IS_ERR(m))) + if (!IS_ERR(m)) return m; mutex_unlock(&ni->mrec_lock); @@ -271,7 +271,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, m = map_mft_record(ni); /* map_mft_record() has incremented this on success. */ atomic_dec(&ni->count); - if (likely(!IS_ERR(m))) { + if (!IS_ERR(m)) { /* Verify the sequence number. */ if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { ntfs_debug("Done 1."); @@ -1303,7 +1303,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mftbmp_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft bitmap attribute."); @@ -1734,7 +1734,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mft_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mft_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mft_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft data attribute."); @@ -1776,7 +1776,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) do { rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, true); - if (likely(!IS_ERR(rl2))) + if (!IS_ERR(rl2)) break; if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { ntfs_error(vol->sb, "Failed to allocate the minimal " diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 2d3cc9e3395d..4e6a44bc654c 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -115,7 +115,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, dent_ino = MREF(mref); ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); dent_inode = ntfs_iget(vol->sb, dent_ino); - if (likely(!IS_ERR(dent_inode))) { + if (!IS_ERR(dent_inode)) { /* Consistency check. */ if (is_bad_inode(dent_inode) || MSEQNO(mref) == NTFS_I(dent_inode)->seq_no || diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index 508744a93180..97932fb5179c 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -951,7 +951,7 @@ mpa_err: } /* Now combine the new and old runlists checking for overlaps. */ old_rl = ntfs_runlists_merge(old_rl, rl); - if (likely(!IS_ERR(old_rl))) + if (!IS_ERR(old_rl)) return old_rl; ntfs_free(rl); ntfs_error(vol->sb, "Failed to merge runlists."); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 29621d40f448..7dc3bc604f78 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1475,7 +1475,7 @@ not_enabled: kfree(name); /* Get the inode. */ tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) { + if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { if (!IS_ERR(tmp_ino)) iput(tmp_ino); ntfs_error(vol->sb, "Failed to load $UsnJrnl."); From b4ed71f557e458257e0f71b11969954acb389240 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 25 Sep 2019 16:49:46 -0700 Subject: [PATCH 1337/2880] mm: treewide: clarify pgtable_page_{ctor,dtor}() naming The naming of pgtable_page_{ctor,dtor}() seems to have confused a few people, and until recently arm64 used these erroneously/pointlessly for other levels of page table. To make it incredibly clear that these only apply to the PTE level, and to align with the naming of pgtable_pmd_page_{ctor,dtor}(), let's rename them to pgtable_pte_page_{ctor,dtor}(). These changes were generated with the following shell script: ---- git grep -lw 'pgtable_page_.tor' | while read FILE; do sed -i '{s/pgtable_page_ctor/pgtable_pte_page_ctor/}' $FILE; sed -i '{s/pgtable_page_dtor/pgtable_pte_page_dtor/}' $FILE; done ---- ... with the documentation re-flowed to remain under 80 columns, and whitespace fixed up in macros to keep backslashes aligned. There should be no functional change as a result of this patch. Link: http://lkml.kernel.org/r/20190722141133.3116-1-mark.rutland@arm.com Signed-off-by: Mark Rutland Reviewed-by: Mike Rapoport Acked-by: Geert Uytterhoeven [m68k] Cc: Anshuman Khandual Cc: Matthew Wilcox Cc: Michal Hocko Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/split_page_table_lock.rst | 10 +++++----- arch/arc/include/asm/pgalloc.h | 4 ++-- arch/arm/include/asm/tlb.h | 2 +- arch/arm/mm/mmu.c | 2 +- arch/arm64/include/asm/tlb.h | 2 +- arch/arm64/mm/mmu.c | 2 +- arch/csky/include/asm/pgalloc.h | 2 +- arch/hexagon/include/asm/pgalloc.h | 2 +- arch/m68k/include/asm/mcf_pgalloc.h | 6 +++--- arch/m68k/include/asm/motorola_pgalloc.h | 6 +++--- arch/m68k/include/asm/sun3_pgalloc.h | 2 +- arch/mips/include/asm/pgalloc.h | 2 +- arch/nios2/include/asm/pgalloc.h | 2 +- arch/openrisc/include/asm/pgalloc.h | 6 +++--- arch/powerpc/mm/pgtable-frag.c | 6 +++--- arch/riscv/include/asm/pgalloc.h | 2 +- arch/s390/mm/pgalloc.c | 6 +++--- arch/sh/include/asm/pgalloc.h | 2 +- arch/sparc/mm/init_64.c | 4 ++-- arch/sparc/mm/srmmu.c | 4 ++-- arch/um/include/asm/pgalloc.h | 2 +- arch/unicore32/include/asm/tlb.h | 2 +- arch/x86/mm/pgtable.c | 2 +- arch/xtensa/include/asm/pgalloc.h | 4 ++-- include/asm-generic/pgalloc.h | 8 ++++---- include/linux/mm.h | 4 ++-- 26 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst index 889b00be469f..ff51f4a5494d 100644 --- a/Documentation/vm/split_page_table_lock.rst +++ b/Documentation/vm/split_page_table_lock.rst @@ -54,9 +54,9 @@ Hugetlb-specific helpers: Support of split page table lock by an architecture =================================================== -There's no need in special enabling of PTE split page table lock: -everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), -which must be called on PTE table allocation / freeing. +There's no need in special enabling of PTE split page table lock: everything +required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which +must be called on PTE table allocation / freeing. Make sure the architecture doesn't use slab allocator for page table allocation: slab uses page->slab_cache for its pages. @@ -74,7 +74,7 @@ paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. -NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must +NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must be handled properly. page->ptl @@ -94,7 +94,7 @@ trick: split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs one more cache line for indirect access; -The spinlock_t allocated in pgtable_page_ctor() for PTE table and in +The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in pgtable_pmd_page_ctor() for PMD table. Please, never access page->ptl directly -- use appropriate helper. diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 4751f2251cd9..b747f2ec2928 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -108,7 +108,7 @@ pte_alloc_one(struct mm_struct *mm) return 0; memzero((void *)pte_pg, PTRS_PER_PTE * sizeof(pte_t)); page = virt_to_page(pte_pg); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return 0; } @@ -123,7 +123,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, pgtable_t ptep) { - pgtable_page_dtor(virt_to_page(ptep)); + pgtable_pte_page_dtor(virt_to_page(ptep)); free_pages((unsigned long)ptep, __get_order_pte()); } diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index b75ea15b85c0..669474add486 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -44,7 +44,7 @@ static inline void __tlb_remove_table(void *_table) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); #ifndef CONFIG_ARM_LPAE /* diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 25da9b2d9610..48c2888297dd 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -731,7 +731,7 @@ static void *__init late_alloc(unsigned long sz) { void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz)); - if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) + if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr))) BUG(); return ptr; } diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index a95d1fcb7e21..b76df828e6b7 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -44,7 +44,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); tlb_remove_table(tlb, pte); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 53dc6f24cfb7..60c929f3683b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -384,7 +384,7 @@ static phys_addr_t pgd_pgtable_alloc(int shift) * folded, and if so pgtable_pmd_page_ctor() becomes nop. */ if (shift == PAGE_SHIFT) - BUG_ON(!pgtable_page_ctor(phys_to_page(pa))); + BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa))); else if (shift == PMD_SHIFT) BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index d089113fe41f..c7c1ed27e348 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -71,7 +71,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page(tlb, pte); \ } while (0) diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index 5a6e79e7926d..cc9be514a676 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -94,7 +94,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor((pte)); \ + pgtable_pte_page_dtor((pte)); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 4399d712f6db..b34d44d666a4 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -41,7 +41,7 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, unsigned long address) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } @@ -54,7 +54,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -73,7 +73,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) static inline void pte_free(struct mm_struct *mm, struct page *page) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index d04d9ba9b976..acab315c851f 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -36,7 +36,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) page = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); if(!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -51,7 +51,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline void pte_free(struct mm_struct *mm, pgtable_t page) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); cache_page(kmap(page)); kunmap(page); __free_page(page); @@ -60,7 +60,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t page) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, unsigned long address) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); cache_page(kmap(page)); kunmap(page); __free_page(page); diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 1a8ddbd0d23c..856121122b91 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -21,7 +21,7 @@ extern const char bad_pmd_string[]; #define __pte_free_tlb(tlb,pte,addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index aa73cb187a07..166842337eb2 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -54,7 +54,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pte_free_tlb(tlb,pte,address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 750d18d5980b..0b146d773c85 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -41,7 +41,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 787c1b9d2f6d..da12a4c38c4b 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -75,7 +75,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) if (!pte) return NULL; clear_page(page_address(pte)); - if (!pgtable_page_ctor(pte)) { + if (!pgtable_pte_page_ctor(pte)) { __free_page(pte); return NULL; } @@ -89,13 +89,13 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, struct page *pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); __free_page(pte); } #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index a7b05214760c..ee4bd6d38602 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag) count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } @@ -61,7 +61,7 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -113,7 +113,7 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { if (!kernel) - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index f66a00d8cb19..d59ea92285ec 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -78,7 +78,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pte_free_tlb(tlb, pte, buf) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 54fcdf66ae96..3dd253f81a77 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -210,7 +210,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page = alloc_page(GFP_KERNEL); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -256,7 +256,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) atomic_xor_bits(&page->_refcount, 3U << 24); } - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } @@ -308,7 +308,7 @@ void __tlb_remove_table(void *_table) case 3: /* 4K page table with pgstes */ if (mask & 3) atomic_xor_bits(&page->_refcount, 3 << 24); - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); break; } diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 8c6341a4d807..22d968bfe9bb 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -29,7 +29,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb,pte,addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 4b099dd7a767..e6d91819da92 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2903,7 +2903,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { free_unref_page(page); return NULL; } @@ -2919,7 +2919,7 @@ static void __pte_free(pgtable_t pte) { struct page *page = virt_to_page(pte); - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index aaebbc00d262..cc3ad64479ac 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -378,7 +378,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) if ((pte = (unsigned long)pte_alloc_one_kernel(mm)) == 0) return NULL; page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -389,7 +389,7 @@ void pte_free(struct mm_struct *mm, pgtable_t pte) { unsigned long p; - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); p = (unsigned long)page_address(pte); /* Cached address (for test) */ if (p == 0) BUG(); diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 446e0c0f4018..881e76da1938 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -29,7 +29,7 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); #define __pte_free_tlb(tlb,pte, address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb),(pte)); \ } while (0) diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h index 10d2356bfddd..4663d8cc80ef 100644 --- a/arch/unicore32/include/asm/tlb.h +++ b/arch/unicore32/include/asm/tlb.h @@ -15,7 +15,7 @@ #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 463940faf52f..3e4b9035bb9a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -45,7 +45,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index dd744aa450fa..1d38f0e755ba 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -55,7 +55,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) if (!pte) return NULL; page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -69,7 +69,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, pgtable_t pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); __free_page(pte); } #define pmd_pgtable(pmd) pmd_page(pmd) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 6f8cc06ee44e..73f7421413cb 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -49,7 +49,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) * @mm: the mm_struct of the current context * @gfp: GFP flags to use for the allocation * - * Allocates a page and runs the pgtable_page_ctor(). + * Allocates a page and runs the pgtable_pte_page_ctor(). * * This function is intended for architectures that need * anything beyond simple page allocation or must have custom GFP flags. @@ -63,7 +63,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) pte = alloc_page(gfp); if (!pte) return NULL; - if (!pgtable_page_ctor(pte)) { + if (!pgtable_pte_page_ctor(pte)) { __free_page(pte); return NULL; } @@ -76,7 +76,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * - * Allocates a page and runs the pgtable_page_ctor(). + * Allocates a page and runs the pgtable_pte_page_ctor(). * * Return: `struct page` initialized as page table or %NULL on error */ @@ -98,7 +98,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { - pgtable_page_dtor(pte_page); + pgtable_pte_page_dtor(pte_page); __free_page(pte_page); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 294a67b94147..cc292273e6ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1949,7 +1949,7 @@ static inline void pgtable_init(void) pgtable_cache_init(); } -static inline bool pgtable_page_ctor(struct page *page) +static inline bool pgtable_pte_page_ctor(struct page *page) { if (!ptlock_init(page)) return false; @@ -1958,7 +1958,7 @@ static inline bool pgtable_page_ctor(struct page *page) return true; } -static inline void pgtable_page_dtor(struct page *page) +static inline void pgtable_pte_page_dtor(struct page *page) { ptlock_free(page); __ClearPageTable(page); From a22fea94992a2bc5328005e62f368413ede49c14 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 26 Sep 2019 07:28:17 -0700 Subject: [PATCH 1338/2880] arch/sparc/include/asm/pgtable_64.h: fix build A last-minute fixlet which I'd failed to merge at the appropriate time had the predictable effect. Fixes: f672e2c217e2d4b2 ("lib: untag user pointers in strn*_user") Cc: Andrey Konovalov Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 6ec514fe3bef..6ae8016ef4ec 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1099,7 +1099,7 @@ static inline unsigned long __untagged_addr(unsigned long start) return start; } #define untagged_addr(addr) \ - ((__typeof__(addr))(__untagged_addr((unsigned long)(addr))) + ((__typeof__(addr))(__untagged_addr((unsigned long)(addr)))) static inline bool pte_access_permitted(pte_t pte, bool write) { From 7be3cb019db1cbd5fd5ffe6d64a23fefa4b6f229 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 26 Sep 2019 10:15:25 -0700 Subject: [PATCH 1339/2880] binfmt_elf: Do not move brk for INTERP-less ET_EXEC When brk was moved for binaries without an interpreter, it should have been limited to ET_DYN only. In other words, the special case was an ET_DYN that lacks an INTERP, not just an executable that lacks INTERP. The bug manifested for giant static executables, where the brk would end up in the middle of the text area on 32-bit architectures. Reported-and-tested-by: Richard Kojedzinszky Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index cec3b4146440..ad4c6b1d5074 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1121,7 +1121,8 @@ out_free_interp: * (since it grows up, and may collide early with the stack * growing down), and into the unused ELF_ET_DYN_BASE region. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter) + if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + loc->elf_ex.e_type == ET_DYN && !interpreter) current->mm->brk = current->mm->start_brk = ELF_ET_DYN_BASE; From 26acf400d2dcc72c7e713e1f55db47ad92010cc2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 26 Sep 2019 14:36:48 -0300 Subject: [PATCH 1340/2880] perf unwind: Fix libunwind build failure on i386 systems Naresh Kamboju reported, that on the i386 build pr_err() doesn't get defined properly due to header ordering: perf-in.o: In function `libunwind__x86_reg_id': tools/perf/util/libunwind/../../arch/x86/util/unwind-libunwind.c:109: undefined reference to `pr_err' Reported-by: Naresh Kamboju Signed-off-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/arch/x86/util/unwind-libunwind.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/x86/util/unwind-libunwind.c b/tools/perf/arch/x86/util/unwind-libunwind.c index 05920e3edf7a..47357973b55b 100644 --- a/tools/perf/arch/x86/util/unwind-libunwind.c +++ b/tools/perf/arch/x86/util/unwind-libunwind.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "../../util/debug.h" #ifndef REMOTE_UNWIND_LIBUNWIND #include #include "perf_regs.h" #include "../../util/unwind.h" -#include "../../util/debug.h" #endif #ifdef HAVE_ARCH_X86_64_SUPPORT From f968688f44f529f96a64c9853fb2fb5d0a329aff Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Sep 2019 12:44:39 +0900 Subject: [PATCH 1341/2880] nvme: Move ctrl sqsize to generic space This isn't specific to fabrics. Signed-off-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b5013c101b35..38a83ef5bcd3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -221,6 +221,7 @@ struct nvme_ctrl { u16 oacs; u16 nssa; u16 nr_streams; + u16 sqsize; u32 max_namespaces; atomic_t abort_limit; u8 vwc; @@ -269,7 +270,6 @@ struct nvme_ctrl { u16 hmmaxd; /* Fabrics only */ - u16 sqsize; u32 ioccsz; u32 iorcsz; u16 icdoff; From ff3ee62a55869b1a64266b5c15af16f2eb37c8a7 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 26 Sep 2019 04:37:18 -0500 Subject: [PATCH 1342/2880] smb3: missing ACL related flags Various SMB3 ACL related flags (for security descriptor and ACEs for example) were missing and some fields are different in SMB3 and CIFS. Update cifsacl.h definitions based on current MS-DTYP specification. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Reviewed-by: Aurelien Aptel --- fs/cifs/cifsacl.h | 81 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index eb428349f29a..439b99cefeb0 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -90,14 +90,93 @@ struct cifs_acl { __le32 num_aces; } __attribute__((packed)); +/* ACE types - see MS-DTYP 2.4.4.1 */ +#define ACCESS_ALLOWED_ACE_TYPE 0x00 +#define ACCESS_DENIED_ACE_TYPE 0x01 +#define SYSTEM_AUDIT_ACE_TYPE 0x02 +#define SYSTEM_ALARM_ACE_TYPE 0x03 +#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */ +#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */ +#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11 +#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12 +#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13 + +/* ACE flags */ +#define OBJECT_INHERIT_ACE 0x01 +#define CONTAINER_INHERIT_ACE 0x02 +#define NO_PROPAGATE_INHERIT_ACE 0x04 +#define INHERIT_ONLY_ACE 0x08 +#define INHERITED_ACE 0x10 +#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40 +#define FAILED_ACCESS_ACE_FLAG 0x80 + struct cifs_ace { - __u8 type; + __u8 type; /* see above and MS-DTYP 2.4.4.1 */ __u8 flags; __le16 size; __le32 access_req; struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ } __attribute__((packed)); +/* + * The current SMB3 form of security descriptor is similar to what was used for + * cifs (see above) but some fields are split, and fields in the struct below + * matches names of fields to the the spec, MS-DTYP (see sections 2.4.5 and + * 2.4.6). Note that "CamelCase" fields are used in this struct in order to + * match the MS-DTYP and MS-SMB2 specs which define the wire format. + */ +struct smb3_sd { + __u8 Revision; /* revision level, MUST be one */ + __u8 Sbz1; /* only meaningful if 'RM' flag set below */ + __le16 Control; + __le32 OffsetOwner; + __le32 OffsetGroup; + __le32 OffsetSacl; + __le32 OffsetDacl; +} __packed; + +/* Meaning of 'Control' field flags */ +#define ACL_CONTROL_SR 0x0001 /* Self relative */ +#define ACL_CONTROL_RM 0x0002 /* Resource manager control bits */ +#define ACL_CONTROL_PS 0x0004 /* SACL protected from inherits */ +#define ACL_CONTROL_PD 0x0008 /* DACL protected from inherits */ +#define ACL_CONTROL_SI 0x0010 /* SACL Auto-Inherited */ +#define ACL_CONTROL_DI 0x0020 /* DACL Auto-Inherited */ +#define ACL_CONTROL_SC 0x0040 /* SACL computed through inheritance */ +#define ACL_CONTROL_DC 0x0080 /* DACL computed through inheritence */ +#define ACL_CONTROL_SS 0x0100 /* Create server ACL */ +#define ACL_CONTROL_DT 0x0200 /* DACL provided by trusteed source */ +#define ACL_CONTROL_SD 0x0400 /* SACL defaulted */ +#define ACL_CONTROL_SP 0x0800 /* SACL is present on object */ +#define ACL_CONTROL_DD 0x1000 /* DACL defaulted */ +#define ACL_CONTROL_DP 0x2000 /* DACL is present on object */ +#define ACL_CONTROL_GD 0x4000 /* Group was defaulted */ +#define ACL_CONTROL_OD 0x8000 /* User was defaulted */ + +/* Meaning of AclRevision flags */ +#define ACL_REVISION 0x02 /* See section 2.4.4.1 of MS-DTYP */ +#define ACL_REVISION_DS 0x04 /* Additional AceTypes allowed */ + +struct smb3_acl { + u8 AclRevision; /* revision level */ + u8 Sbz1; /* MBZ */ + __le16 AclSize; + __le16 AceCount; + __le16 Sbz2; /* MBZ */ +} __packed; + + /* * Minimum security identifier can be one for system defined Users * and Groups such as NULL SID and World or Built-in accounts such From a016e2794fc3a245a91946038dd8f34d65e53cc3 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 26 Sep 2019 12:31:20 -0700 Subject: [PATCH 1343/2880] CIFS: Fix oplock handling for SMB 2.1+ protocols There may be situations when a server negotiates SMB 2.1 protocol version or higher but responds to a CREATE request with an oplock rather than a lease. Currently the client doesn't handle such a case correctly: when another CREATE comes in the server sends an oplock break to the initial CREATE and the client doesn't send an ack back due to a wrong caching level being set (READ instead of RWH). Missing an oplock break ack makes the server wait until the break times out which dramatically increases the latency of the second CREATE. Fix this by properly detecting oplocks when using SMB 2.1 protocol version and higher. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2ops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 3010066a8709..4c0922596467 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3333,6 +3333,11 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; + /* Check if the server granted an oplock rather than a lease */ + if (oplock & SMB2_OPLOCK_LEVEL_EXCLUSIVE) + return smb2_set_oplock_level(cinode, oplock, epoch, + purge_cache); + if (oplock & SMB2_LEASE_READ_CACHING_HE) { new_oplock |= CIFS_CACHE_READ_FLG; strcat(message, "R"); From 253c892193ab58da6b1d94371285971b22c63260 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Thu, 26 Sep 2019 22:25:02 +1000 Subject: [PATCH 1344/2880] powerpc/eeh: Fix eeh eeh_debugfs_break_device() with SRIOV devices s/CONFIG_IOV/CONFIG_PCI_IOV/ Whoops. Fixes: bd6461cc7b3c ("powerpc/eeh: Add a eeh_dev_break debugfs interface") Signed-off-by: Oliver O'Halloran [mpe: Fixup the #endif comment as well] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190926122502.14826-1-oohall@gmail.com --- arch/powerpc/kernel/eeh.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 0a91dee51245..bc8a551013be 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1960,7 +1960,7 @@ static int eeh_debugfs_break_device(struct pci_dev *pdev) pci_err(pdev, "Going to break: %pR\n", bar); if (pdev->is_virtfn) { -#ifndef CONFIG_IOV +#ifndef CONFIG_PCI_IOV return -ENXIO; #else /* @@ -1980,7 +1980,7 @@ static int eeh_debugfs_break_device(struct pci_dev *pdev) pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); pos += PCI_SRIOV_CTRL; bit = PCI_SRIOV_CTRL_MSE; -#endif /* !CONFIG_IOV */ +#endif /* !CONFIG_PCI_IOV */ } else { bit = PCI_COMMAND_MEMORY; pos = PCI_COMMAND; From 2af2783f2ea4f273598557b247e320509247df80 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Tue, 17 Sep 2019 20:04:27 +0800 Subject: [PATCH 1345/2880] rq-qos: get rid of redundant wbt_update_limits() We have updated limits after calling wbt_set_min_lat(). No need to update again. Reviewed-by: Bob Liu Signed-off-by: Yufen Yu Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 962fc0c44381..46f5198be017 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -482,7 +482,6 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, blk_mq_quiesce_queue(q); wbt_set_min_lat(q, val); - wbt_update_limits(q); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); From 424adc329bcbf3bfad80a6b01cffe5f2ee5080a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 24 Sep 2019 18:02:59 +0200 Subject: [PATCH 1346/2880] dimlib: make DIMLIB a hidden symbol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Tal Gilboa the only benefit from DIM comes from a driver that uses it. So it doesn't make sense to make this symbol user visible, instead all drivers that use it should select it (as is already the case AFAICT). Signed-off-by: Uwe Kleine-König Acked-by: Randy Dunlap Signed-off-by: David S. Miller --- lib/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 4e6b1c3e4c98..d7fc9eb33b9b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -555,8 +555,7 @@ config SIGNATURE Implementation is done using GnuPG MPI library config DIMLIB - bool "DIM library" - default y + bool help Dynamic Interrupt Moderation library. Implements an algorithm for dynamically change CQ modertion values From 31aefe14bc9f56566041303d733fda511d3a1c3e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:54:30 +0300 Subject: [PATCH 1347/2880] net: aquantia: Fix aq_vec_isr_legacy() return value The irqreturn_t type is an enum or an unsigned int in GCC. That creates to problems because it can't detect if the self->aq_hw_ops->hw_irq_read() call fails and at the end the function always returns IRQ_HANDLED. drivers/net/ethernet/aquantia/atlantic/aq_vec.c:316 aq_vec_isr_legacy() warn: unsigned 'err' is never less than zero. drivers/net/ethernet/aquantia/atlantic/aq_vec.c:329 aq_vec_isr_legacy() warn: always true condition '(err >= 0) => (0-u32max >= 0)' Fixes: 970a2e9864b0 ("net: ethernet: aquantia: Vector operations") Signed-off-by: Dan Carpenter Reviewed-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_vec.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index 28892b8acd0e..a95c263a45aa 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -306,15 +306,13 @@ irqreturn_t aq_vec_isr_legacy(int irq, void *private) { struct aq_vec_s *self = private; u64 irq_mask = 0U; - irqreturn_t err = 0; + int err; - if (!self) { - err = -EINVAL; - goto err_exit; - } + if (!self) + return IRQ_NONE; err = self->aq_hw_ops->hw_irq_read(self->aq_hw, &irq_mask); if (err < 0) - goto err_exit; + return IRQ_NONE; if (irq_mask) { self->aq_hw_ops->hw_irq_disable(self->aq_hw, @@ -322,11 +320,10 @@ irqreturn_t aq_vec_isr_legacy(int irq, void *private) napi_schedule(&self->napi); } else { self->aq_hw_ops->hw_irq_enable(self->aq_hw, 1U); - err = IRQ_NONE; + return IRQ_NONE; } -err_exit: - return err >= 0 ? IRQ_HANDLED : IRQ_NONE; + return IRQ_HANDLED; } cpumask_t *aq_vec_get_affinity_mask(struct aq_vec_s *self) From 286183147666fb76c057836c57d86e9e6f508bca Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:54:59 +0300 Subject: [PATCH 1348/2880] cxgb4: Signedness bug in init_one() The "chip" variable is an enum, and it's treated as unsigned int by GCC in this context so the error handling isn't triggered. Fixes: e8d452923ae6 ("cxgb4: clean up init_one") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 71854a19cebe..38024877751c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -5701,7 +5701,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) whoami = t4_read_reg(adapter, PL_WHOAMI_A); pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id); chip = t4_get_chip_type(adapter, CHELSIO_PCI_ID_VER(device_id)); - if (chip < 0) { + if ((int)chip < 0) { dev_err(&pdev->dev, "Device %d is not supported\n", device_id); err = chip; goto out_free_adapter; From 002dfe8085255b7bf1e0758c3d195c5412d35be9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:55:32 +0300 Subject: [PATCH 1349/2880] net: hisilicon: Fix signedness bug in hix5hd2_dev_probe() The "priv->phy_mode" variable is an enum and in this context GCC will treat it as unsigned to the error handling will never trigger. Fixes: 57c5bc9ad7d7 ("net: hisilicon: add hix5hd2 mac driver") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hix5hd2_gmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c index 95a6b0926170..c41b19c760f8 100644 --- a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c +++ b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c @@ -1194,7 +1194,7 @@ static int hix5hd2_dev_probe(struct platform_device *pdev) goto err_free_mdio; priv->phy_mode = of_get_phy_mode(node); - if (priv->phy_mode < 0) { + if ((int)priv->phy_mode < 0) { netdev_err(ndev, "not find phy-mode\n"); ret = -EINVAL; goto err_mdiobus; From 25a584955f020d6ec499c513923fb220f3112d2b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:56:04 +0300 Subject: [PATCH 1350/2880] net: broadcom/bcmsysport: Fix signedness in bcm_sysport_probe() The "priv->phy_interface" variable is an enum and in this context GCC will treat it as unsigned so the error handling will never be triggered. Fixes: 80105befdb4b ("net: systemport: add Broadcom SYSTEMPORT Ethernet MAC driver") Signed-off-by: Dan Carpenter Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 7df887e4024c..a977a459bd20 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2481,7 +2481,7 @@ static int bcm_sysport_probe(struct platform_device *pdev) priv->phy_interface = of_get_phy_mode(dn); /* Default to GMII interface mode */ - if (priv->phy_interface < 0) + if ((int)priv->phy_interface < 0) priv->phy_interface = PHY_INTERFACE_MODE_GMII; /* In the case of a fixed PHY, the DT node associated From bd55f8ddbc437c225391ca8f487e7ec10243c4cc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:56:38 +0300 Subject: [PATCH 1351/2880] net: netsec: Fix signedness bug in netsec_probe() The "priv->phy_interface" variable is an enum and in this context GCC will treat it as an unsigned int so the error handling is never triggered. Fixes: 533dd11a12f6 ("net: socionext: Add Synquacer NetSec driver") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/netsec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 1502fe8b0456..55db7fbd43cc 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -2007,7 +2007,7 @@ static int netsec_probe(struct platform_device *pdev) NETIF_MSG_LINK | NETIF_MSG_PROBE; priv->phy_interface = device_get_phy_mode(&pdev->dev); - if (priv->phy_interface < 0) { + if ((int)priv->phy_interface < 0) { dev_err(&pdev->dev, "missing required property 'phy-mode'\n"); ret = -ENODEV; goto free_ndev; From ced81eb84d6ab0a993ff79c04cc0238d44415af4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:57:14 +0300 Subject: [PATCH 1352/2880] enetc: Fix a signedness bug in enetc_of_get_phy() The "priv->if_mode" is type phy_interface_t which is an enum. In this context GCC will treat the enum as an unsigned int so this error handling is never triggered. Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 7d6513ff8507..b73421c3e25b 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -785,7 +785,7 @@ static int enetc_of_get_phy(struct enetc_ndev_priv *priv) } priv->if_mode = of_get_phy_mode(np); - if (priv->if_mode < 0) { + if ((int)priv->if_mode < 0) { dev_err(priv->dev, "missing phy type\n"); of_node_put(priv->phy_node); if (of_phy_is_fixed_link(np)) From a0ecd6fdbf5d648123a7315c695fb6850d702835 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Tue, 24 Sep 2019 23:30:30 -0500 Subject: [PATCH 1353/2880] drm/komeda: prevent memory leak in komeda_wb_connector_add In komeda_wb_connector_add if drm_writeback_connector_init fails the allocated memory for kwb_conn should be released. Signed-off-by: Navid Emamdoost Reviewed-by: James Qian Wang (Arm Technology China) Signed-off-by: james qian wang (Arm Technology China) Link: https://patchwork.freedesktop.org/patch/msgid/20190925043031.32308-1-navid.emamdoost@gmail.com --- drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c index 23fbee268119..b72840c06ab7 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c @@ -165,8 +165,10 @@ static int komeda_wb_connector_add(struct komeda_kms_dev *kms, &komeda_wb_encoder_helper_funcs, formats, n_formats); komeda_put_fourcc_list(formats); - if (err) + if (err) { + kfree(kwb_conn); return err; + } drm_connector_helper_add(&wb_conn->base, &komeda_wb_conn_helper_funcs); From 7f9e88e6ef8c971f2c638b5ff7044c59b5d0f58d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:57:50 +0300 Subject: [PATCH 1354/2880] net: socionext: Fix a signedness bug in ave_probe() The "phy_mode" variable is an enum and in this context GCC treats it as an unsigned int so the error handling is never triggered. Fixes: 4c270b55a5af ("net: ethernet: socionext: add AVE ethernet driver") Signed-off-by: Dan Carpenter Reviewed-by: Kunihiko Hayashi Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/sni_ave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 10d0c3e478ab..d047a53f34f2 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -1566,7 +1566,7 @@ static int ave_probe(struct platform_device *pdev) np = dev->of_node; phy_mode = of_get_phy_mode(np); - if (phy_mode < 0) { + if ((int)phy_mode < 0) { dev_err(dev, "phy-mode not found\n"); return -EINVAL; } From f10210517a2f37feea2edf85eb34c98977265c16 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:58:22 +0300 Subject: [PATCH 1355/2880] net: stmmac: dwmac-meson8b: Fix signedness bug in probe The "dwmac->phy_mode" is an enum and in this context GCC treats it as an unsigned int so the error handling is never triggered. Fixes: 566e82516253 ("net: stmmac: add a glue driver for the Amlogic Meson 8b / GXBB DWMAC") Signed-off-by: Dan Carpenter Reviewed-by: Martin Blumenstingl Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index 9cda29e4b89d..306da8f6b7d5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -339,7 +339,7 @@ static int meson8b_dwmac_probe(struct platform_device *pdev) dwmac->dev = &pdev->dev; dwmac->phy_mode = of_get_phy_mode(pdev->dev.of_node); - if (dwmac->phy_mode < 0) { + if ((int)dwmac->phy_mode < 0) { dev_err(&pdev->dev, "missing phy-mode property\n"); ret = -EINVAL; goto err_remove_config_dt; From 73e211e11be86715d66bd3c9d38b3c34b05fca9a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 13:59:11 +0300 Subject: [PATCH 1356/2880] net: axienet: fix a signedness bug in probe The "lp->phy_mode" is an enum but in this context GCC treats it as an unsigned int so the error handling is never triggered. Fixes: ee06b1728b95 ("net: axienet: add support for standard phy-mode binding") Signed-off-by: Dan Carpenter Reviewed-by: Radhey Shyam Pandey Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 4fc627fb4d11..676006f32f91 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1762,7 +1762,7 @@ static int axienet_probe(struct platform_device *pdev) } } else { lp->phy_mode = of_get_phy_mode(pdev->dev.of_node); - if (lp->phy_mode < 0) { + if ((int)lp->phy_mode < 0) { ret = -EINVAL; goto free_netdev; } From d7eb651212fdbafa82d485d8e76095ac3b14c193 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 14:01:00 +0300 Subject: [PATCH 1357/2880] of: mdio: Fix a signedness bug in of_phy_get_and_connect() The "iface" variable is an enum and in this context GCC treats it as an unsigned int so the error handling is never triggered. Fixes: b78624125304 ("of_mdio: Abstract a general interface for phy connect") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 000b95787df1..bd6129db6417 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -362,7 +362,7 @@ struct phy_device *of_phy_get_and_connect(struct net_device *dev, int ret; iface = of_get_phy_mode(np); - if (iface < 0) + if ((int)iface < 0) return NULL; if (of_phy_is_fixed_link(np)) { ret = of_phy_register_fixed_link(np); From 1a4b62a0b8a3b81eca24366f63e214a7144b9f02 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 14:05:24 +0300 Subject: [PATCH 1358/2880] net: nixge: Fix a signedness bug in nixge_probe() The "priv->phy_mode" is an enum and in this context GCC will treat it as an unsigned int so it can never be less than zero. Fixes: 492caffa8a1a ("net: ethernet: nixge: Add support for National Instruments XGE netdev") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/ni/nixge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c index 0b384f97d2fd..2761f3a3ae50 100644 --- a/drivers/net/ethernet/ni/nixge.c +++ b/drivers/net/ethernet/ni/nixge.c @@ -1347,7 +1347,7 @@ static int nixge_probe(struct platform_device *pdev) } priv->phy_mode = of_get_phy_mode(pdev->dev.of_node); - if (priv->phy_mode < 0) { + if ((int)priv->phy_mode < 0) { netdev_err(ndev, "not find \"phy-mode\" property\n"); err = -EINVAL; goto unregister_mdio; From 231042181dc9d6122c6faba64e99ccb25f13cc6c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Sep 2019 14:05:54 +0300 Subject: [PATCH 1359/2880] net: ethernet: stmmac: Fix signedness bug in ipq806x_gmac_of_parse() The "gmac->phy_mode" variable is an enum and in this context GCC will treat it as an unsigned int so the error handling will never be triggered. Fixes: b1c17215d718 ("stmmac: add ipq806x glue layer") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c index 2c6d7c69c8f7..0d21082ceb93 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c @@ -191,7 +191,7 @@ static int ipq806x_gmac_of_parse(struct ipq806x_gmac *gmac) struct device *dev = &gmac->pdev->dev; gmac->phy_mode = of_get_phy_mode(dev->of_node); - if (gmac->phy_mode < 0) { + if ((int)gmac->phy_mode < 0) { dev_err(dev, "missing phy mode property\n"); return -EINVAL; } From 0355d6c1d591b8f9e281783ec0cf95fbed893194 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 12:29:34 -0700 Subject: [PATCH 1360/2880] kcm: disable preemption in kcm_parse_func_strparser() After commit a2c11b034142 ("kcm: use BPF_PROG_RUN") syzbot easily triggers the warning in cant_sleep(). As explained in commit 6cab5e90ab2b ("bpf: run bpf programs with preemption disabled") we need to disable preemption before running bpf programs. BUG: assuming atomic context at net/kcm/kcmsock.c:382 in_atomic(): 0, irqs_disabled(): 0, pid: 7, name: kworker/u4:0 3 locks held by kworker/u4:0/7: #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: __write_once_size include/linux/compiler.h:226 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: atomic64_set include/asm-generic/atomic-instrumented.h:855 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: atomic_long_set include/asm-generic/atomic-long.h:40 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: set_work_data kernel/workqueue.c:620 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: set_work_pool_and_clear_pending kernel/workqueue.c:647 [inline] #0: ffff888216726128 ((wq_completion)kstrp){+.+.}, at: process_one_work+0x88b/0x1740 kernel/workqueue.c:2240 #1: ffff8880a989fdc0 ((work_completion)(&strp->work)){+.+.}, at: process_one_work+0x8c1/0x1740 kernel/workqueue.c:2244 #2: ffff888098998d10 (sk_lock-AF_INET){+.+.}, at: lock_sock include/net/sock.h:1522 [inline] #2: ffff888098998d10 (sk_lock-AF_INET){+.+.}, at: strp_sock_lock+0x2e/0x40 net/strparser/strparser.c:440 CPU: 0 PID: 7 Comm: kworker/u4:0 Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: kstrp strp_work Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 __cant_sleep kernel/sched/core.c:6826 [inline] __cant_sleep.cold+0xa4/0xbc kernel/sched/core.c:6803 kcm_parse_func_strparser+0x54/0x200 net/kcm/kcmsock.c:382 __strp_recv+0x5dc/0x1b20 net/strparser/strparser.c:221 strp_recv+0xcf/0x10b net/strparser/strparser.c:343 tcp_read_sock+0x285/0xa00 net/ipv4/tcp.c:1639 strp_read_sock+0x14d/0x200 net/strparser/strparser.c:366 do_strp_work net/strparser/strparser.c:414 [inline] strp_work+0xe3/0x130 net/strparser/strparser.c:423 process_one_work+0x9af/0x1740 kernel/workqueue.c:2269 Fixes: a2c11b034142 ("kcm: use BPF_PROG_RUN") Fixes: 6cab5e90ab2b ("bpf: run bpf programs with preemption disabled") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 8f12f5c6ab87..ea9e73428ed9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -378,8 +378,12 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; + int res; - return BPF_PROG_RUN(prog, skb); + preempt_disable(); + res = BPF_PROG_RUN(prog, skb); + preempt_enable(); + return res; } static int kcm_read_sock_done(struct strparser *strp, int err) From 159d2c7d8106177bd9a986fd005a311fe0d11285 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 13:11:26 -0700 Subject: [PATCH 1361/2880] sch_netem: fix rcu splat in netem_enqueue() qdisc_root() use from netem_enqueue() triggers a lockdep warning. __dev_queue_xmit() uses rcu_read_lock_bh() which is not equivalent to rcu_read_lock() + local_bh_disable_bh as far as lockdep is concerned. WARNING: suspicious RCU usage 5.3.0-rc7+ #0 Not tainted ----------------------------- include/net/sch_generic.h:492 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by syz-executor427/8855: #0: 00000000b5525c01 (rcu_read_lock_bh){....}, at: lwtunnel_xmit_redirect include/net/lwtunnel.h:92 [inline] #0: 00000000b5525c01 (rcu_read_lock_bh){....}, at: ip_finish_output2+0x2dc/0x2570 net/ipv4/ip_output.c:214 #1: 00000000b5525c01 (rcu_read_lock_bh){....}, at: __dev_queue_xmit+0x20a/0x3650 net/core/dev.c:3804 #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: spin_lock include/linux/spinlock.h:338 [inline] #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: __dev_xmit_skb net/core/dev.c:3502 [inline] #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: __dev_queue_xmit+0x14b8/0x3650 net/core/dev.c:3838 stack backtrace: CPU: 0 PID: 8855 Comm: syz-executor427 Not tainted 5.3.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 lockdep_rcu_suspicious+0x153/0x15d kernel/locking/lockdep.c:5357 qdisc_root include/net/sch_generic.h:492 [inline] netem_enqueue+0x1cfb/0x2d80 net/sched/sch_netem.c:479 __dev_xmit_skb net/core/dev.c:3527 [inline] __dev_queue_xmit+0x15d2/0x3650 net/core/dev.c:3838 dev_queue_xmit+0x18/0x20 net/core/dev.c:3902 neigh_hh_output include/net/neighbour.h:500 [inline] neigh_output include/net/neighbour.h:509 [inline] ip_finish_output2+0x1726/0x2570 net/ipv4/ip_output.c:228 __ip_finish_output net/ipv4/ip_output.c:308 [inline] __ip_finish_output+0x5fc/0xb90 net/ipv4/ip_output.c:290 ip_finish_output+0x38/0x1f0 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip_mc_output+0x292/0xf40 net/ipv4/ip_output.c:417 dst_output include/net/dst.h:436 [inline] ip_local_out+0xbb/0x190 net/ipv4/ip_output.c:125 ip_send_skb+0x42/0xf0 net/ipv4/ip_output.c:1555 udp_send_skb.isra.0+0x6b2/0x1160 net/ipv4/udp.c:887 udp_sendmsg+0x1e96/0x2820 net/ipv4/udp.c:1174 inet_sendmsg+0x9e/0xe0 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2439 do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 +++++ net/sched/sch_netem.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 43f5b7ed02bd..637548d54b3e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -494,6 +494,11 @@ static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) return q; } +static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc) +{ + return rcu_dereference_bh(qdisc->dev_queue->qdisc); +} + static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) { return qdisc->dev_queue->qdisc_sleeping; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index f5cb35e550f8..0e44039e729c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -476,7 +476,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * skb will be queued. */ if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { - struct Qdisc *rootq = qdisc_root(sch); + struct Qdisc *rootq = qdisc_root_bh(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; From 2b6fd3ea438c742d162a40a124b0181922633163 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 25 Sep 2019 02:47:07 +0200 Subject: [PATCH 1362/2880] net: dsa: qca8k: Fix port enable for CPU port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CPU port does not have a PHY connected to it. So calling phy_support_asym_pause() results in an Opps. As with other DSA drivers, add a guard that the port is a user port. Reported-by: Michal Vokáč Fixes: 0394a63acfe2 ("net: dsa: enable and disable all ports") Signed-off-by: Andrew Lunn Tested-by: Michal Vokáč Signed-off-by: David S. Miller --- drivers/net/dsa/qca8k.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 16f15c93a102..684aa51684db 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -936,6 +936,9 @@ qca8k_port_enable(struct dsa_switch *ds, int port, { struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; + if (!dsa_is_user_port(ds, port)) + return 0; + qca8k_port_set_status(priv, port, 1); priv->port_sts[port].enabled = 1; From 768fb61fcc13b2acaca758275d54c09a65e2968b Mon Sep 17 00:00:00 2001 From: Allan Zhang Date: Wed, 25 Sep 2019 16:43:12 -0700 Subject: [PATCH 1363/2880] bpf: Fix bpf_event_output re-entry issue BPF_PROG_TYPE_SOCK_OPS program can reenter bpf_event_output because it can be called from atomic and non-atomic contexts since we don't have bpf_prog_active to prevent it happen. This patch enables 3 levels of nesting to support normal, irq and nmi context. We can easily reproduce the issue by running netperf crr mode with 100 flows and 10 threads from netperf client side. Here is the whole stack dump: [ 515.228898] WARNING: CPU: 20 PID: 14686 at kernel/trace/bpf_trace.c:549 bpf_event_output+0x1f9/0x220 [ 515.228903] CPU: 20 PID: 14686 Comm: tcp_crr Tainted: G W 4.15.0-smp-fixpanic #44 [ 515.228904] Hardware name: Intel TBG,ICH10/Ikaria_QC_1b, BIOS 1.22.0 06/04/2018 [ 515.228905] RIP: 0010:bpf_event_output+0x1f9/0x220 [ 515.228906] RSP: 0018:ffff9a57ffc03938 EFLAGS: 00010246 [ 515.228907] RAX: 0000000000000012 RBX: 0000000000000001 RCX: 0000000000000000 [ 515.228907] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffffffff836b0f80 [ 515.228908] RBP: ffff9a57ffc039c8 R08: 0000000000000004 R09: 0000000000000012 [ 515.228908] R10: ffff9a57ffc1de40 R11: 0000000000000000 R12: 0000000000000002 [ 515.228909] R13: ffff9a57e13bae00 R14: 00000000ffffffff R15: ffff9a57ffc1e2c0 [ 515.228910] FS: 00007f5a3e6ec700(0000) GS:ffff9a57ffc00000(0000) knlGS:0000000000000000 [ 515.228910] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 515.228911] CR2: 0000537082664fff CR3: 000000061fed6002 CR4: 00000000000226f0 [ 515.228911] Call Trace: [ 515.228913] [ 515.228919] [] bpf_sockopt_event_output+0x3b/0x50 [ 515.228923] [] ? bpf_ktime_get_ns+0xe/0x10 [ 515.228927] [] ? __cgroup_bpf_run_filter_sock_ops+0x85/0x100 [ 515.228930] [] ? tcp_init_transfer+0x125/0x150 [ 515.228933] [] ? tcp_finish_connect+0x89/0x110 [ 515.228936] [] ? tcp_rcv_state_process+0x704/0x1010 [ 515.228939] [] ? sk_filter_trim_cap+0x53/0x2a0 [ 515.228942] [] ? tcp_v6_inbound_md5_hash+0x6f/0x1d0 [ 515.228945] [] ? tcp_v6_do_rcv+0x1c0/0x460 [ 515.228947] [] ? tcp_v6_rcv+0x9f8/0xb30 [ 515.228951] [] ? ip6_route_input+0x190/0x220 [ 515.228955] [] ? ip6_protocol_deliver_rcu+0x6d/0x450 [ 515.228958] [] ? ip6_rcv_finish+0xb6/0x170 [ 515.228961] [] ? ip6_protocol_deliver_rcu+0x450/0x450 [ 515.228963] [] ? ipv6_rcv+0x61/0xe0 [ 515.228966] [] ? ipv6_list_rcv+0x330/0x330 [ 515.228969] [] ? __netif_receive_skb_one_core+0x5b/0xa0 [ 515.228972] [] ? __netif_receive_skb+0x21/0x70 [ 515.228975] [] ? process_backlog+0xb2/0x150 [ 515.228978] [] ? net_rx_action+0x16f/0x410 [ 515.228982] [] ? __do_softirq+0xdd/0x305 [ 515.228986] [] ? irq_exit+0x9c/0xb0 [ 515.228989] [] ? smp_call_function_single_interrupt+0x65/0x120 [ 515.228991] [] ? call_function_single_interrupt+0x81/0x90 [ 515.228992] [ 515.228996] [] ? io_serial_in+0x20/0x20 [ 515.229000] [] ? console_unlock+0x230/0x490 [ 515.229003] [] ? vprintk_emit+0x26a/0x2a0 [ 515.229006] [] ? vprintk_default+0x1f/0x30 [ 515.229008] [] ? vprintk_func+0x35/0x70 [ 515.229011] [] ? printk+0x50/0x66 [ 515.229013] [] ? bpf_event_output+0xb7/0x220 [ 515.229016] [] ? bpf_sockopt_event_output+0x3b/0x50 [ 515.229019] [] ? bpf_ktime_get_ns+0xe/0x10 [ 515.229023] [] ? release_sock+0x97/0xb0 [ 515.229026] [] ? tcp_recvmsg+0x31a/0xda0 [ 515.229029] [] ? __cgroup_bpf_run_filter_sock_ops+0x85/0x100 [ 515.229032] [] ? tcp_set_state+0x191/0x1b0 [ 515.229035] [] ? tcp_disconnect+0x2e/0x600 [ 515.229038] [] ? tcp_close+0x3eb/0x460 [ 515.229040] [] ? inet_release+0x42/0x70 [ 515.229043] [] ? inet6_release+0x39/0x50 [ 515.229046] [] ? __sock_release+0x4d/0xd0 [ 515.229049] [] ? sock_close+0x15/0x20 [ 515.229052] [] ? __fput+0xe7/0x1f0 [ 515.229055] [] ? ____fput+0xe/0x10 [ 515.229058] [] ? task_work_run+0x82/0xb0 [ 515.229061] [] ? exit_to_usermode_loop+0x7e/0x11f [ 515.229064] [] ? do_syscall_64+0x111/0x130 [ 515.229067] [] ? entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: a5a3a828cd00 ("bpf: add perf event notificaton support for sock_ops") Signed-off-by: Allan Zhang Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20190925234312.94063-2-allanzhang@google.com --- kernel/trace/bpf_trace.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1255d14576..3e38a010003c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -500,14 +500,17 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); +static DEFINE_PER_CPU(int, bpf_event_output_nest_level); +struct bpf_nested_pt_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); - struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -522,12 +525,25 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, .data = meta, }, }; + struct perf_sample_data *sd; + struct pt_regs *regs; + u64 ret; + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { + ret = -EBUSY; + goto out; + } + sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]); + regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]); perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, sd); +out: + this_cpu_dec(bpf_event_output_nest_level); + return ret; } BPF_CALL_0(bpf_get_current_task) From 4f6570d7206bb052f42718d55fbe72977f0318ea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 08:01:14 -0700 Subject: [PATCH 1364/2880] ipv6: add priority parameter to ip6_xmit() Currently, ip6_xmit() sets skb->priority based on sk->sk_priority This is not desirable for TCP since TCP shares the same ctl socket for a given netns. We want to be able to send RST or ACK packets with a non zero skb->priority. This patch has no functional change. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 +- net/dccp/ipv6.c | 5 +++-- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/tcp_ipv6.c | 6 ++++-- net/sctp/ipv6.c | 2 +- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 8dfc65639aa4..009605c56f20 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -981,7 +981,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); * upper-layer output functions */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - __u32 mark, struct ipv6_txoptions *opt, int tclass); + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1b7381ff787b..25aab672fc99 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -230,7 +230,8 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass); + err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass, + sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -284,7 +285,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0); + ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); DCCP_INC_STATS(DCCP_MIB_OUTRSTS); return; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 4da24aa6c696..0a0945a5b30d 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -133,7 +133,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused fl6.daddr = sk->sk_v6_daddr; res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass); + np->tclass, sk->sk_priority); rcu_read_unlock(); return res; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 89a4c7c2e25d..edadee4a7e76 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -193,7 +193,7 @@ bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) * which are using proper atomic operations or spinlocks. */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - __u32 mark, struct ipv6_txoptions *opt, int tclass) + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); @@ -258,7 +258,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hdr->daddr = *first_hop; skb->protocol = htons(ETH_P_IPV6); - skb->priority = sk->sk_priority; + skb->priority = priority; skb->mark = mark; mtu = dst_mtu(dst); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 87f44d3250ee..806064c28867 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -512,7 +512,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass, + sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -907,7 +908,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); - ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass); + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, + 0); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e5f2fc726a98..dd860fea0148 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -215,7 +215,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - tclass); + tclass, sk->sk_priority); rcu_read_unlock(); return res; } From e9a5dceee56cb527a3498f1a59bd8726baa1e717 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 08:01:15 -0700 Subject: [PATCH 1365/2880] ipv6: tcp: provide sk->sk_priority to ctl packets We can populate skb->priority for some ctl packets instead of always using zero. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 806064c28867..5f557bf27da2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -804,7 +804,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int rst, - u8 tclass, __be32 label) + u8 tclass, __be32 label, u32 priority) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; @@ -909,7 +909,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, - 0); + priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -932,6 +932,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) struct sock *sk1 = NULL; #endif __be32 label = 0; + u32 priority = 0; struct net *net; int oif = 0; @@ -992,6 +993,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) trace_tcp_send_reset(sk, skb); if (np->repflow) label = ip6_flowlabel(ipv6h); + priority = sk->sk_priority; } if (sk->sk_state == TCP_TIME_WAIT) label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); @@ -1001,7 +1003,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) } tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, - label); + label, priority); #ifdef CONFIG_TCP_MD5SIG out: @@ -1012,10 +1014,10 @@ out: static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, - __be32 label) + __be32 label, u32 priority) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, - tclass, label); + tclass, label, priority); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -1027,7 +1029,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), 0); inet_twsk_put(tw); } @@ -1050,7 +1052,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), - 0, 0); + 0, 0, sk->sk_priority); } From f6c0f5d209fa80eb808e08aa4206f6e264041ef6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2019 08:01:16 -0700 Subject: [PATCH 1366/2880] tcp: honor SO_PRIORITY in TIME_WAIT state ctl packets sent on behalf of TIME_WAIT sockets currently have a zero skb->priority, which can cause various problems. In this patch we : - add a tw_priority field in struct inet_timewait_sock. - populate it from sk->sk_priority when a TIME_WAIT is created. - For IPv4, change ip_send_unicast_reply() and its two callers to propagate tw_priority correctly. ip_send_unicast_reply() no longer changes sk->sk_priority. - For IPv6, make sure TIME_WAIT sockets pass their tw_priority field to tcp_v6_send_response() and tcp_v6_send_ack(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 1 + net/ipv4/ip_output.c | 1 - net/ipv4/tcp_ipv4.c | 4 ++++ net/ipv4/tcp_minisocks.c | 1 + net/ipv6/tcp_ipv6.c | 6 ++++-- 5 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index aef38c140014..dfd919b3119e 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -71,6 +71,7 @@ struct inet_timewait_sock { tw_pad : 2, /* 2 bits hole */ tw_tos : 8; u32 tw_txhash; + u32 tw_priority; struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a77c3a4c24de..28fca408812c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1694,7 +1694,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, inet_sk(sk)->tos = arg->tos; - sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = sysctl_wmem_default; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fd394ad179a0..2ee45e3755e9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -771,6 +771,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); } ip_send_unicast_reply(ctl_sk, @@ -866,6 +868,8 @@ static void tcp_v4_send_ack(const struct sock *sk, ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8bcaf2586b68..bb140a5db8c0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -266,6 +266,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_transparent = inet->transparent; tw->tw_mark = sk->sk_mark; + tw->tw_priority = sk->sk_priority; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f557bf27da2..e3d9f4559c99 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -995,8 +995,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) label = ip6_flowlabel(ipv6h); priority = sk->sk_priority; } - if (sk->sk_state == TCP_TIME_WAIT) + if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); + priority = inet_twsk(sk)->tw_priority; + } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); @@ -1029,7 +1031,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), 0); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority); inet_twsk_put(tw); } From 05733434ee9ae6548723a808647248583e347cca Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Tue, 24 Sep 2019 08:51:16 -0700 Subject: [PATCH 1367/2880] net/rds: Check laddr_check before calling it In rds_bind(), laddr_check is called without checking if it is NULL or not. And rs_transport should be reset if rds_add_bound() fails. Fixes: c5c1a030a7db ("net/rds: An rds_sock is added too early to the hash table") Reported-by: syzbot+fae39afd2101a17ec624@syzkaller.appspotmail.com Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/bind.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/rds/bind.c b/net/rds/bind.c index 20c156a73e73..5b5fb4ca8d3e 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -244,7 +244,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ if (rs->rs_transport) { trans = rs->rs_transport; - if (trans->laddr_check(sock_net(sock->sk), + if (!trans->laddr_check || + trans->laddr_check(sock_net(sock->sk), binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; goto out; @@ -263,6 +264,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sock_set_flag(sk, SOCK_RCU_FREE); ret = rds_add_bound(rs, binding_addr, &port, scope_id); + if (ret) + rs->rs_transport = NULL; out: release_sock(sk); From 4ce70b4aed5752332b268909336b351721965dc4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Sep 2019 18:51:16 +0300 Subject: [PATCH 1368/2880] net: sched: sch_htb: don't call qdisc_put() while holding tree lock Recent changes that removed rtnl dependency from rules update path of tc also made tcf_block_put() function sleeping. This function is called from ops->destroy() of several Qdisc implementations, which in turn is called by qdisc_put(). Some Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Steps to reproduce for htb: tc qdisc add dev ens1f0 root handle 1: htb default 12 tc class add dev ens1f0 parent 1: classid 1:1 htb rate 100kbps ceil 100kbps tc qdisc add dev ens1f0 parent 1:1 handle 40: sfq perturb 10 tc class add dev ens1f0 parent 1:1 classid 1:2 htb rate 100kbps ceil 100kbps Resulting dmesg: [ 4791.148551] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 4791.151354] in_atomic(): 1, irqs_disabled(): 0, pid: 27273, name: tc [ 4791.152805] INFO: lockdep is turned off. [ 4791.153605] CPU: 19 PID: 27273 Comm: tc Tainted: G W 5.3.0-rc8+ #721 [ 4791.154336] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 4791.155075] Call Trace: [ 4791.155803] dump_stack+0x85/0xc0 [ 4791.156529] ___might_sleep.cold+0xac/0xbc [ 4791.157251] __mutex_lock+0x5b/0x960 [ 4791.157966] ? console_unlock+0x363/0x5d0 [ 4791.158676] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 4791.159395] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 4791.160103] tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 4791.160815] tcf_block_put_ext.part.0+0x21/0x50 [ 4791.161530] tcf_block_put+0x50/0x70 [ 4791.162233] sfq_destroy+0x15/0x50 [sch_sfq] [ 4791.162936] qdisc_destroy+0x5f/0x160 [ 4791.163642] htb_change_class.cold+0x5df/0x69d [sch_htb] [ 4791.164505] tc_ctl_tclass+0x19d/0x480 [ 4791.165360] rtnetlink_rcv_msg+0x170/0x4b0 [ 4791.166191] ? netlink_deliver_tap+0x95/0x400 [ 4791.166907] ? rtnl_dellink+0x2d0/0x2d0 [ 4791.167625] netlink_rcv_skb+0x49/0x110 [ 4791.168345] netlink_unicast+0x171/0x200 [ 4791.169058] netlink_sendmsg+0x224/0x3f0 [ 4791.169771] sock_sendmsg+0x5e/0x60 [ 4791.170475] ___sys_sendmsg+0x2ae/0x330 [ 4791.171183] ? ___sys_recvmsg+0x159/0x1f0 [ 4791.171894] ? do_wp_page+0x9c/0x790 [ 4791.172595] ? __handle_mm_fault+0xcd3/0x19e0 [ 4791.173309] __sys_sendmsg+0x59/0xa0 [ 4791.174024] do_syscall_64+0x5c/0xb0 [ 4791.174725] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 4791.175435] RIP: 0033:0x7f0aa41497b8 [ 4791.176129] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 5 4 [ 4791.177532] RSP: 002b:00007fff4e37d588 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 4791.178243] RAX: ffffffffffffffda RBX: 000000005d8132f7 RCX: 00007f0aa41497b8 [ 4791.178947] RDX: 0000000000000000 RSI: 00007fff4e37d5f0 RDI: 0000000000000003 [ 4791.179662] RBP: 0000000000000000 R08: 0000000000000001 R09: 00000000020149a0 [ 4791.180382] R10: 0000000000404eda R11: 0000000000000246 R12: 0000000000000001 [ 4791.181100] R13: 000000000047f640 R14: 0000000000000000 R15: 0000000000000000 In htb_change_class() function save parent->leaf.q to local temporary variable and put reference to it after sch tree lock is released in order not to call potentially sleeping cls API in atomic section. This is safe to do because Qdisc has already been reset by qdisc_purge_queue() inside sch tree lock critical section. Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 7bcf20ef9145..8184c87da8be 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1302,6 +1302,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_HTB_MAX + 1]; + struct Qdisc *parent_qdisc = NULL; struct tc_htb_opt *hopt; u64 rate64, ceil64; int warn = 0; @@ -1401,7 +1402,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (parent && !parent->level) { /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); - qdisc_put(parent->leaf.q); + parent_qdisc = parent->leaf.q; if (parent->prio_activity) htb_deactivate(q, parent); @@ -1480,6 +1481,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); sch_tree_unlock(sch); + qdisc_put(parent_qdisc); if (warn) pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n", From c2999f7fb05b87da4060e38150c70fa46794d82b Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Sep 2019 18:51:17 +0300 Subject: [PATCH 1369/2880] net: sched: multiq: don't call qdisc_put() while holding tree lock Recent changes that removed rtnl dependency from rules update path of tc also made tcf_block_put() function sleeping. This function is called from ops->destroy() of several Qdisc implementations, which in turn is called by qdisc_put(). Some Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Steps to reproduce for multiq: tc qdisc add dev ens1f0 root handle 1: multiq tc qdisc add dev ens1f0 parent 1:10 handle 50: sfq perturb 10 ethtool -L ens1f0 combined 2 tc qdisc change dev ens1f0 root handle 1: multiq Resulting dmesg: [ 5539.419344] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 5539.420945] in_atomic(): 1, irqs_disabled(): 0, pid: 27658, name: tc [ 5539.422435] INFO: lockdep is turned off. [ 5539.423904] CPU: 21 PID: 27658 Comm: tc Tainted: G W 5.3.0-rc8+ #721 [ 5539.425400] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 5539.426911] Call Trace: [ 5539.428380] dump_stack+0x85/0xc0 [ 5539.429823] ___might_sleep.cold+0xac/0xbc [ 5539.431262] __mutex_lock+0x5b/0x960 [ 5539.432682] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 5539.434103] ? __nla_validate_parse+0x51/0x840 [ 5539.435493] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 5539.436903] tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 5539.438327] tcf_block_put_ext.part.0+0x21/0x50 [ 5539.439752] tcf_block_put+0x50/0x70 [ 5539.441165] sfq_destroy+0x15/0x50 [sch_sfq] [ 5539.442570] qdisc_destroy+0x5f/0x160 [ 5539.444000] multiq_tune+0x14a/0x420 [sch_multiq] [ 5539.445421] tc_modify_qdisc+0x324/0x840 [ 5539.446841] rtnetlink_rcv_msg+0x170/0x4b0 [ 5539.448269] ? netlink_deliver_tap+0x95/0x400 [ 5539.449691] ? rtnl_dellink+0x2d0/0x2d0 [ 5539.451116] netlink_rcv_skb+0x49/0x110 [ 5539.452522] netlink_unicast+0x171/0x200 [ 5539.453914] netlink_sendmsg+0x224/0x3f0 [ 5539.455304] sock_sendmsg+0x5e/0x60 [ 5539.456686] ___sys_sendmsg+0x2ae/0x330 [ 5539.458071] ? ___sys_recvmsg+0x159/0x1f0 [ 5539.459461] ? do_wp_page+0x9c/0x790 [ 5539.460846] ? __handle_mm_fault+0xcd3/0x19e0 [ 5539.462263] __sys_sendmsg+0x59/0xa0 [ 5539.463661] do_syscall_64+0x5c/0xb0 [ 5539.465044] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 5539.466454] RIP: 0033:0x7f1fe08177b8 [ 5539.467863] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 5 4 [ 5539.470906] RSP: 002b:00007ffe812de5d8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 5539.472483] RAX: ffffffffffffffda RBX: 000000005d8135e3 RCX: 00007f1fe08177b8 [ 5539.474069] RDX: 0000000000000000 RSI: 00007ffe812de640 RDI: 0000000000000003 [ 5539.475655] RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000182e9b0 [ 5539.477203] R10: 0000000000404eda R11: 0000000000000246 R12: 0000000000000001 [ 5539.478699] R13: 000000000047f640 R14: 0000000000000000 R15: 0000000000000000 Rearrange locking in multiq_tune() in following ways: - In loop that removes Qdiscs from disabled queues, call qdisc_purge_queue() instead of qdisc_tree_flush_backlog() on Qdisc that is being destroyed. Save the Qdisc in temporary allocated array and call qdisc_put() on each element of the array after sch tree lock is released. This is safe to do because Qdiscs have already been reset by qdisc_purge_queue() inside sch tree lock critical section. - Do the same change for second loop that initializes Qdiscs for newly enabled queues in multiq_tune() function. Since sch tree lock is obtained and released on each iteration of this loop, just call qdisc_put() directly outside of critical section. Don't verify that old Qdisc is not noop_qdisc before releasing reference to it because such check is already performed by qdisc_put*() functions. Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/sch_multiq.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index e1087746f6a2..b2b7fdb06fc6 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -174,7 +174,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, { struct multiq_sched_data *q = qdisc_priv(sch); struct tc_multiq_qopt *qopt; - int i; + struct Qdisc **removed; + int i, n_removed = 0; if (!netif_is_multiqueue(qdisc_dev(sch))) return -EOPNOTSUPP; @@ -185,6 +186,11 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qopt->bands = qdisc_dev(sch)->real_num_tx_queues; + removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands), + GFP_KERNEL); + if (!removed) + return -ENOMEM; + sch_tree_lock(sch); q->bands = qopt->bands; for (i = q->bands; i < q->max_bands; i++) { @@ -192,13 +198,17 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; - qdisc_tree_flush_backlog(child); - qdisc_put(child); + qdisc_purge_queue(child); + removed[n_removed++] = child; } } sch_tree_unlock(sch); + for (i = 0; i < n_removed; i++) + qdisc_put(removed[i]); + kfree(removed); + for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; @@ -213,11 +223,10 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, if (child != &noop_qdisc) qdisc_hash_add(child, true); - if (old != &noop_qdisc) { - qdisc_tree_flush_backlog(old); - qdisc_put(old); - } + if (old != &noop_qdisc) + qdisc_purge_queue(old); sch_tree_unlock(sch); + qdisc_put(old); } } } From e3ae1f96accd21405715fe9c56b4d83bc7d96d44 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 24 Sep 2019 18:51:18 +0300 Subject: [PATCH 1370/2880] net: sched: sch_sfb: don't call qdisc_put() while holding tree lock Recent changes that removed rtnl dependency from rules update path of tc also made tcf_block_put() function sleeping. This function is called from ops->destroy() of several Qdisc implementations, which in turn is called by qdisc_put(). Some Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Steps to reproduce for sfb: tc qdisc add dev ens1f0 handle 1: root sfb tc qdisc add dev ens1f0 parent 1:10 handle 50: sfq perturb 10 tc qdisc change dev ens1f0 root handle 1: sfb Resulting dmesg: [ 7265.938717] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 7265.940152] in_atomic(): 1, irqs_disabled(): 0, pid: 28579, name: tc [ 7265.941455] INFO: lockdep is turned off. [ 7265.942744] CPU: 11 PID: 28579 Comm: tc Tainted: G W 5.3.0-rc8+ #721 [ 7265.944065] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 7265.945396] Call Trace: [ 7265.946709] dump_stack+0x85/0xc0 [ 7265.947994] ___might_sleep.cold+0xac/0xbc [ 7265.949282] __mutex_lock+0x5b/0x960 [ 7265.950543] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 7265.951803] ? tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 7265.953022] tcf_chain0_head_change_cb_del.isra.0+0x1b/0xf0 [ 7265.954248] tcf_block_put_ext.part.0+0x21/0x50 [ 7265.955478] tcf_block_put+0x50/0x70 [ 7265.956694] sfq_destroy+0x15/0x50 [sch_sfq] [ 7265.957898] qdisc_destroy+0x5f/0x160 [ 7265.959099] sfb_change+0x175/0x330 [sch_sfb] [ 7265.960304] tc_modify_qdisc+0x324/0x840 [ 7265.961503] rtnetlink_rcv_msg+0x170/0x4b0 [ 7265.962692] ? netlink_deliver_tap+0x95/0x400 [ 7265.963876] ? rtnl_dellink+0x2d0/0x2d0 [ 7265.965064] netlink_rcv_skb+0x49/0x110 [ 7265.966251] netlink_unicast+0x171/0x200 [ 7265.967427] netlink_sendmsg+0x224/0x3f0 [ 7265.968595] sock_sendmsg+0x5e/0x60 [ 7265.969753] ___sys_sendmsg+0x2ae/0x330 [ 7265.970916] ? ___sys_recvmsg+0x159/0x1f0 [ 7265.972074] ? do_wp_page+0x9c/0x790 [ 7265.973233] ? __handle_mm_fault+0xcd3/0x19e0 [ 7265.974407] __sys_sendmsg+0x59/0xa0 [ 7265.975591] do_syscall_64+0x5c/0xb0 [ 7265.976753] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 7265.977938] RIP: 0033:0x7f229069f7b8 [ 7265.979117] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 5 4 [ 7265.981681] RSP: 002b:00007ffd7ed2d158 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 7265.983001] RAX: ffffffffffffffda RBX: 000000005d813ca1 RCX: 00007f229069f7b8 [ 7265.984336] RDX: 0000000000000000 RSI: 00007ffd7ed2d1c0 RDI: 0000000000000003 [ 7265.985682] RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000165c9a0 [ 7265.987021] R10: 0000000000404eda R11: 0000000000000246 R12: 0000000000000001 [ 7265.988309] R13: 000000000047f640 R14: 0000000000000000 R15: 0000000000000000 In sfb_change() function use qdisc_purge_queue() instead of qdisc_tree_flush_backlog() to properly reset old child Qdisc and save pointer to it into local temporary variable. Put reference to Qdisc after sch tree lock is released in order not to call potentially sleeping cls API in atomic section. This is safe to do because Qdisc has already been reset by qdisc_purge_queue() inside sch tree lock critical section. Reported-by: syzbot+ac54455281db908c581e@syzkaller.appspotmail.com Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Suggested-by: Cong Wang Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/sch_sfb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 1dff8506a715..d448fe3068e5 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -488,7 +488,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); - struct Qdisc *child; + struct Qdisc *child, *old; struct nlattr *tb[TCA_SFB_MAX + 1]; const struct tc_sfb_qopt *ctl = &sfb_default_ops; u32 limit; @@ -518,8 +518,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, qdisc_hash_add(child, true); sch_tree_lock(sch); - qdisc_tree_flush_backlog(q->qdisc); - qdisc_put(q->qdisc); + qdisc_purge_queue(q->qdisc); + old = q->qdisc; q->qdisc = child; q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); @@ -542,6 +542,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, sfb_init_perturbation(1, q); sch_tree_unlock(sch); + qdisc_put(old); return 0; } From 6eeb4ef049e7cd89783e8ebe1ea2f1dac276f82c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 24 Sep 2019 12:43:08 +0200 Subject: [PATCH 1371/2880] KVM: x86: assign two bits to track SPTE kinds Currently, we are overloading SPTE_SPECIAL_MASK to mean both "A/D bits unavailable" and MMIO, where the difference between the two is determined by mio_mask and mmio_value. However, the next patch will need two bits to distinguish availability of A/D bits from write protection. So, while at it give MMIO its own bit pattern, and move the two bits from bit 62 to bits 52..53 since Intel is allocating EPT page table bits from the top. Reviewed-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 ------- arch/x86/kvm/mmu.c | 28 ++++++++++++++++++---------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 23edf56cf577..50eb430b0ad8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -219,13 +219,6 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting - * with the SVE bit in EPT PTEs. - */ -#define SPTE_SPECIAL_MASK (1ULL << 62) - /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5269aa057dfa..bac8d228d82b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -83,7 +83,16 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 #define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) #define PT64_LEVEL_BITS 9 @@ -219,12 +228,11 @@ static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. - * Non-present SPTEs with shadow_acc_track_value set are in place for access - * tracking. + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. */ static u64 __read_mostly shadow_acc_track_mask; -static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -304,7 +312,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((mmio_mask & mmio_value) != mmio_value); - shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; shadow_mmio_access_mask = access_mask; } @@ -323,7 +331,7 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return !(spte & shadow_acc_track_value); + return (spte & SPTE_SPECIAL_MASK) == SPTE_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) @@ -461,7 +469,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & shadow_acc_track_value); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -2622,7 +2630,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, shadow_user_mask | shadow_x_mask | shadow_me_mask; if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -2968,7 +2976,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, sp = page_header(__pa(sptep)); if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware From 1f4e5fc83a4217fc61b23370b07573827329d7bd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 26 Sep 2019 18:47:59 +0200 Subject: [PATCH 1372/2880] KVM: x86: fix nested guest live migration with PML Shadow paging is fundamentally incompatible with the page-modification log, because the GPAs in the log come from the wrong memory map. In particular, for the EPT page-modification log, the GPAs in the log come from L2 rather than L1. (If there was a non-EPT page-modification log, we couldn't use it for shadow paging because it would log GVAs rather than GPAs). Therefore, we need to rely on write protection to record dirty pages. This has the side effect of bypassing PML, since writes now result in an EPT violation vmexit. This is relatively easy to add to KVM, because pretty much the only place that needs changing is spte_clear_dirty. The first access to the page already goes through the page fault path and records the correct GPA; it's only subsequent accesses that are wrong. Therefore, we can equip set_spte (where the first access happens) to record that the SPTE will have to be write protected, and then spte_clear_dirty will use this information to do the right thing. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bac8d228d82b..24c23c66b226 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -92,6 +92,7 @@ module_param(dbg, bool, 0644); #define SPTE_SPECIAL_MASK (3ULL << 52) #define SPTE_AD_ENABLED_MASK (0ULL << 52) #define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) #define SPTE_MMIO_MASK (3ULL << 52) #define PT64_LEVEL_BITS 9 @@ -328,10 +329,27 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) return sp->role.ad_disabled; } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) == SPTE_AD_ENABLED_MASK; + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) @@ -1597,16 +1615,16 @@ static bool spte_clear_dirty(u64 *sptep) rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; - return mmu_spte_update(sptep, spte); } -static bool wrprot_ad_disabled_spte(u64 *sptep) +static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); - if (was_writable) + if (was_writable && !spte_ad_enabled(*sptep)) kvm_set_pfn_dirty(spte_to_pfn(*sptep)); return was_writable; @@ -1625,10 +1643,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - if (spte_ad_enabled(*sptep)) - flush |= spte_clear_dirty(sptep); + if (spte_ad_need_write_protect(*sptep)) + flush |= spte_wrprot_for_clear_dirty(sptep); else - flush |= wrprot_ad_disabled_spte(sptep); + flush |= spte_clear_dirty(sptep); return flush; } @@ -1639,6 +1657,11 @@ static bool spte_set_dirty(u64 *sptep) rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + /* + * Similar to the !kvm_x86_ops->slot_disable_log_dirty case, + * do not bother adding back write access to pages marked + * SPTE_AD_WRPROT_ONLY_MASK. + */ spte |= shadow_dirty_mask; return mmu_spte_update(sptep, spte); @@ -2977,6 +3000,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, sp = page_header(__pa(sptep)); if (sp_ad_disabled(sp)) spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware From 094444204570a5420d9e6ce3d4558877c3487856 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 26 Sep 2019 15:01:15 +0200 Subject: [PATCH 1373/2880] selftests: kvm: add test for dirty logging inside nested guests Check that accesses by nested guests are logged according to the L1 physical addresses rather than L2. Most of the patch is really adding EPT support to the testing framework. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/include/x86_64/processor.h | 3 + .../selftests/kvm/include/x86_64/vmx.h | 14 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- .../selftests/kvm/lib/kvm_util_internal.h | 3 + tools/testing/selftests/kvm/lib/x86_64/vmx.c | 201 +++++++++++++++++- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 156 ++++++++++++++ 7 files changed, 377 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 62c591f87dab..fd84b7a78dcf 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -22,6 +22,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 0c17f2ee685e..ff234018219c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -1083,6 +1083,9 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); #define VMX_BASIC_MEM_TYPE_WB 6LLU #define VMX_BASIC_INOUT 0x0040000000000000LLU +/* VMX_EPT_VPID_CAP bits */ +#define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) + /* MSR_IA32_VMX_MISC bits */ #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 69b17055f63d..6ae5a47fe067 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -569,6 +569,10 @@ struct vmx_pages { void *enlightened_vmcs_hva; uint64_t enlightened_vmcs_gpa; void *enlightened_vmcs; + + void *eptp_hva; + uint64_t eptp_gpa; + void *eptp; }; struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); @@ -576,4 +580,14 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx); void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); bool load_vmcs(struct vmx_pages *vmx); +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot); +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + uint32_t eptp_memslot); +void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t memslot, uint32_t eptp_memslot); +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot); + #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 80a338b5403c..41cf45416060 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -705,7 +705,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, * on error (e.g. currently no memory region using memslot as a KVM * memory slot ID). */ -static struct userspace_mem_region * +struct userspace_mem_region * memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index f36262e0f655..ac50c42750cf 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -68,4 +68,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent); +struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot); + #endif /* SELFTEST_KVM_UTIL_INTERNAL_H */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 9cef0455b819..fab8f6b0bf52 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -7,11 +7,39 @@ #include "test_util.h" #include "kvm_util.h" +#include "../kvm_util_internal.h" #include "processor.h" #include "vmx.h" +#define PAGE_SHIFT_4K 12 + +#define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000 + bool enable_evmcs; +struct eptPageTableEntry { + uint64_t readable:1; + uint64_t writable:1; + uint64_t executable:1; + uint64_t memory_type:3; + uint64_t ignore_pat:1; + uint64_t page_size:1; + uint64_t accessed:1; + uint64_t dirty:1; + uint64_t ignored_11_10:2; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t suppress_ve:1; +}; + +struct eptPageTablePointer { + uint64_t memory_type:3; + uint64_t page_walk_length:3; + uint64_t ad_enabled:1; + uint64_t reserved_11_07:5; + uint64_t address:40; + uint64_t reserved_63_52:12; +}; int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) { uint16_t evmcs_ver; @@ -174,15 +202,35 @@ bool load_vmcs(struct vmx_pages *vmx) */ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) { + uint32_t sec_exec_ctl = 0; + vmwrite(VIRTUAL_PROCESSOR_ID, 0); vmwrite(POSTED_INTR_NV, 0); vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); - if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, 0)) + + if (vmx->eptp_gpa) { + uint64_t ept_paddr; + struct eptPageTablePointer eptp = { + .memory_type = VMX_BASIC_MEM_TYPE_WB, + .page_walk_length = 3, /* + 1 */ + .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS), + .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, + }; + + memcpy(&ept_paddr, &eptp, sizeof(ept_paddr)); + vmwrite(EPT_POINTER, ept_paddr); + sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT; + } + + if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, sec_exec_ctl)) vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); - else + else { vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS)); + GUEST_ASSERT(!sec_exec_ctl); + } + vmwrite(EXCEPTION_BITMAP, 0); vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */ @@ -327,3 +375,152 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_host_state(); init_vmcs_guest_state(guest_rip, guest_rsp); } + +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot) +{ + uint16_t index[4]; + struct eptPageTableEntry *pml4e; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((nested_paddr % vm->page_size) == 0, + "Nested physical address not on page boundary,\n" + " nested_paddr: 0x%lx vm->page_size: 0x%x", + nested_paddr, vm->page_size); + TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", + paddr, vm->page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + index[0] = (nested_paddr >> 12) & 0x1ffu; + index[1] = (nested_paddr >> 21) & 0x1ffu; + index[2] = (nested_paddr >> 30) & 0x1ffu; + index[3] = (nested_paddr >> 39) & 0x1ffu; + + /* Allocate page directory pointer table if not present. */ + pml4e = vmx->eptp_hva; + if (!pml4e[index[3]].readable) { + pml4e[index[3]].address = vm_phy_page_alloc(vm, + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) + >> vm->page_shift; + pml4e[index[3]].writable = true; + pml4e[index[3]].readable = true; + pml4e[index[3]].executable = true; + } + + /* Allocate page directory table if not present. */ + struct eptPageTableEntry *pdpe; + pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); + if (!pdpe[index[2]].readable) { + pdpe[index[2]].address = vm_phy_page_alloc(vm, + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) + >> vm->page_shift; + pdpe[index[2]].writable = true; + pdpe[index[2]].readable = true; + pdpe[index[2]].executable = true; + } + + /* Allocate page table if not present. */ + struct eptPageTableEntry *pde; + pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); + if (!pde[index[1]].readable) { + pde[index[1]].address = vm_phy_page_alloc(vm, + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) + >> vm->page_shift; + pde[index[1]].writable = true; + pde[index[1]].readable = true; + pde[index[1]].executable = true; + } + + /* Fill in page table entry. */ + struct eptPageTableEntry *pte; + pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); + pte[index[0]].address = paddr >> vm->page_shift; + pte[index[0]].writable = true; + pte[index[0]].readable = true; + pte[index[0]].executable = true; + + /* + * For now mark these as accessed and dirty because the only + * testcase we have needs that. Can be reconsidered later. + */ + pte[index[0]].accessed = true; + pte[index[0]].dirty = true; +} + +/* + * Map a range of EPT guest physical addresses to the VM's physical address + * + * Input Args: + * vm - Virtual Machine + * nested_paddr - Nested guest physical address to map + * paddr - VM Physical Address + * size - The size of the range to map + * eptp_memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within the VM given by vm, creates a nested guest translation for the + * page range starting at nested_paddr to the page range starting at paddr. + */ +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + uint32_t eptp_memslot) +{ + size_t page_size = vm->page_size; + size_t npages = size / page_size; + + TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot); + nested_paddr += page_size; + paddr += page_size; + } +} + +/* Prepare an identity extended page table that maps all the + * physical pages in VM. + */ +void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t memslot, uint32_t eptp_memslot) +{ + sparsebit_idx_t i, last; + struct userspace_mem_region *region = + memslot2region(vm, memslot); + + i = (region->region.guest_phys_addr >> vm->page_shift) - 1; + last = i + (region->region.memory_size >> vm->page_shift); + for (;;) { + i = sparsebit_next_clear(region->unused_phy_pages, i); + if (i > last) + break; + + nested_map(vmx, vm, + (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, + 1 << vm->page_shift, + eptp_memslot); + } +} + +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot) +{ + vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); + vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c new file mode 100644 index 000000000000..0bca1cfe2c1e --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging test + * + * Copyright (C) 2018, Red Hat, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define VCPU_ID 1 + +/* The memory slot index to track dirty pages */ +#define TEST_MEM_SLOT_INDEX 1 +#define TEST_MEM_SIZE 3 + +/* L1 guest test virtual memory offset */ +#define GUEST_TEST_MEM 0xc0000000 + +/* L2 guest test virtual memory offset */ +#define NESTED_TEST_MEM1 0xc0001000 +#define NESTED_TEST_MEM2 0xc0002000 + +static void l2_guest_code(void) +{ + *(volatile uint64_t *)NESTED_TEST_MEM1; + *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; + GUEST_SYNC(true); + GUEST_SYNC(false); + + *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + GUEST_SYNC(true); + *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + GUEST_SYNC(true); + GUEST_SYNC(false); + + /* Exit to L1 and never come back. */ + vmcall(); +} + +void l1_guest_code(struct vmx_pages *vmx) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + prepare_vmcs(vmx, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(false); + GUEST_ASSERT(!vmlaunch()); + GUEST_SYNC(false); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + struct vmx_pages *vmx; + unsigned long *bmap; + uint64_t *host_test_mem; + + struct kvm_vm *vm; + struct kvm_run *run; + struct ucall uc; + bool done = false; + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + run = vcpu_state(vm, VCPU_ID); + + /* Add an extra memory slot for testing dirty logging */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + GUEST_TEST_MEM, + TEST_MEM_SLOT_INDEX, + TEST_MEM_SIZE, + KVM_MEM_LOG_DIRTY_PAGES); + + /* + * Add an identity map for GVA range [0xc0000000, 0xc0002000). This + * affects both L1 and L2. However... + */ + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, + TEST_MEM_SIZE * 4096, 0); + + /* + * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to + * 0xc0000000. + * + * Note that prepare_eptp should be called only L1's GPA map is done, + * meaning after the last call to virt_map. + */ + prepare_eptp(vmx, vm, 0); + nested_map_memslot(vmx, vm, 0, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0); + + bmap = bitmap_alloc(TEST_MEM_SIZE); + host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); + + while (!done) { + memset(host_test_mem, 0xaa, TEST_MEM_SIZE * 4096); + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + /* + * The nested guest wrote at offset 0x1000 in the memslot, but the + * dirty bitmap must be filled in according to L1 GPA, not L2. + */ + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + if (uc.args[1]) { + TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean\n"); + TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest\n"); + } else { + TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty\n"); + TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest\n"); + } + + TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty\n"); + TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest\n"); + TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty\n"); + TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest\n"); + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + } + } +} From 4b0b2b096da9d296e0e5668cdfba8613bd6f5bc8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 25 Sep 2019 12:59:23 -0700 Subject: [PATCH 1374/2880] libsubcmd: Make _FORTIFY_SOURCE defines dependent on the feature Unconditionally defining _FORTIFY_SOURCE can break tools that don't work with it, such as memory sanitizers: https://github.com/google/sanitizers/wiki/AddressSanitizer#faq Fixes: 4b6ab94eabe4 ("perf subcmd: Create subcmd library") Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20190925195924.152834-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/lib/subcmd/Makefile b/tools/lib/subcmd/Makefile index ed61fb3a46c0..5b2cd5e58df0 100644 --- a/tools/lib/subcmd/Makefile +++ b/tools/lib/subcmd/Makefile @@ -20,7 +20,13 @@ MAKEFLAGS += --no-print-directory LIBFILE = $(OUTPUT)libsubcmd.a CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) -CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -fPIC +CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -fPIC + +ifeq ($(DEBUG),0) + ifeq ($(feature-fortify-source), 1) + CFLAGS += -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 + endif +endif ifeq ($(CC_NO_CLANG), 0) CFLAGS += -O3 From e3e2cf3d5b1fe800b032e14c0fdcd9a6fb20cf3b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 25 Sep 2019 12:59:24 -0700 Subject: [PATCH 1375/2880] perf tests: Avoid raising SEGV using an obvious NULL dereference An optimized build such as: make -C tools/perf CLANG=1 CC=clang EXTRA_CFLAGS="-O3 will turn the dereference operation into a ud2 instruction, raising a SIGILL rather than a SIGSEGV. Use raise(..) for correctness and clarity. Similar issues were addressed in Numfor Mbiziwo-Tiapo's patch: https://lkml.org/lkml/2019/7/8/1234 Committer testing: Before: [root@quaco ~]# perf test hooks 55: perf hooks : Ok [root@quaco ~]# perf test -v hooks 55: perf hooks : --- start --- test child forked, pid 17092 SIGSEGV is observed as expected, try to recover. Fatal error (SEGFAULT) in perf hook 'test' test child finished with 0 ---- end ---- perf hooks: Ok [root@quaco ~]# After: [root@quaco ~]# perf test hooks 55: perf hooks : Ok [root@quaco ~]# perf test -v hooks 55: perf hooks : --- start --- test child forked, pid 17909 SIGSEGV is observed as expected, try to recover. Fatal error (SEGFAULT) in perf hook 'test' test child finished with 0 ---- end ---- perf hooks: Ok [root@quaco ~]# Fixes: a074865e60ed ("perf tools: Introduce perf hooks") Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lore.kernel.org/lkml/20190925195924.152834-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/perf-hooks.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/tests/perf-hooks.c b/tools/perf/tests/perf-hooks.c index dbc27199c65e..dd865e0bea12 100644 --- a/tools/perf/tests/perf-hooks.c +++ b/tools/perf/tests/perf-hooks.c @@ -19,12 +19,11 @@ static void sigsegv_handler(int sig __maybe_unused) static void the_hook(void *_hook_flags) { int *hook_flags = _hook_flags; - int *p = NULL; *hook_flags = 1234; /* Generate a segfault, test perf_hooks__recover */ - *p = 0; + raise(SIGSEGV); } int test__perf_hooks(struct test *test __maybe_unused, int subtest __maybe_unused) From d586ac10ce56b2381b8e1d8ed74660c1b2b8ab0d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 20 Sep 2019 21:13:27 -0700 Subject: [PATCH 1376/2880] perf docs: Allow man page date to be specified With this change if a perf_date parameter is provided to asciidoc then it will override the default date written to the man page metadata. Without this change, or if the perf_date isn't specified, then the current date is written to the metadata. Having this parameter allows the metadata to be constant if builds happen on different dates. The name of the parameter is intended to be consistent with the existing perf_version parameter. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20190921041327.155054-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/asciidoc.conf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/Documentation/asciidoc.conf b/tools/perf/Documentation/asciidoc.conf index 356b23a40339..2b62ba1e72b7 100644 --- a/tools/perf/Documentation/asciidoc.conf +++ b/tools/perf/Documentation/asciidoc.conf @@ -71,6 +71,9 @@ ifdef::backend-docbook[] [header] template::[header-declarations] +ifdef::perf_date[] +{perf_date} +endif::perf_date[] {mantitle} {manvolnum} From 08a96a31474a732fd654575ced843b94bc3212e1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 27 Sep 2019 09:28:11 -0300 Subject: [PATCH 1377/2880] tools headers uapi: Sync drm/i915_drm.h with the kernel sources To pick the change in: bf73fc0fa9cf ("drm/i915: Show support for accurate sw PMU busyness tracking") That don't result in any changes in tooling, just silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/drm/i915_drm.h' differs from latest version at 'include/uapi/drm/i915_drm.h' diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Adrian Hunter Cc: Chris Wilson Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-o651nt7vpz93tu3nmx4f3xql@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/i915_drm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 328d05e77d9f..469dc512cca3 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -521,6 +521,7 @@ typedef struct drm_i915_irq_wait { #define I915_SCHEDULER_CAP_PRIORITY (1ul << 1) #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2) #define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3) +#define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4) #define I915_PARAM_HUC_STATUS 42 From b9023b91dd020ad7e093baa5122b6968c48cc9e0 Mon Sep 17 00:00:00 2001 From: Balasubramani Vivekanandan Date: Thu, 26 Sep 2019 15:51:01 +0200 Subject: [PATCH 1378/2880] tick: broadcast-hrtimer: Fix a race in bc_set_next When a cpu requests broadcasting, before starting the tick broadcast hrtimer, bc_set_next() checks if the timer callback (bc_handler) is active using hrtimer_try_to_cancel(). But hrtimer_try_to_cancel() does not provide the required synchronization when the callback is active on other core. The callback could have already executed tick_handle_oneshot_broadcast() and could have also returned. But still there is a small time window where the hrtimer_try_to_cancel() returns -1. In that case bc_set_next() returns without doing anything, but the next_event of the tick broadcast clock device is already set to a timeout value. In the race condition diagram below, CPU #1 is running the timer callback and CPU #2 is entering idle state and so calls bc_set_next(). In the worst case, the next_event will contain an expiry time, but the hrtimer will not be started which happens when the racing callback returns HRTIMER_NORESTART. The hrtimer might never recover if all further requests from the CPUs to subscribe to tick broadcast have timeout greater than the next_event of tick broadcast clock device. This leads to cascading of failures and finally noticed as rcu stall warnings Here is a depiction of the race condition CPU #1 (Running timer callback) CPU #2 (Enter idle and subscribe to tick broadcast) --------------------- --------------------- __run_hrtimer() tick_broadcast_enter() bc_handler() __tick_broadcast_oneshot_control() tick_handle_oneshot_broadcast() raw_spin_lock(&tick_broadcast_lock); dev->next_event = KTIME_MAX; //wait for tick_broadcast_lock //next_event for tick broadcast clock set to KTIME_MAX since no other cores subscribed to tick broadcasting raw_spin_unlock(&tick_broadcast_lock); if (dev->next_event == KTIME_MAX) return HRTIMER_NORESTART // callback function exits without restarting the hrtimer //tick_broadcast_lock acquired raw_spin_lock(&tick_broadcast_lock); tick_broadcast_set_event() clockevents_program_event() dev->next_event = expires; bc_set_next() hrtimer_try_to_cancel() //returns -1 since the timer callback is active. Exits without restarting the timer cpu_base->running = NULL; The comment that hrtimer cannot be armed from within the callback is wrong. It is fine to start the hrtimer from within the callback. Also it is safe to start the hrtimer from the enter/exit idle code while the broadcast handler is active. The enter/exit idle code and the broadcast handler are synchronized using tick_broadcast_lock. So there is no need for the existing try to cancel logic. All this can be removed which will eliminate the race condition as well. Fixes: 5d1638acb9f6 ("tick: Introduce hrtimer based broadcast") Originally-by: Thomas Gleixner Signed-off-by: Balasubramani Vivekanandan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190926135101.12102-2-balasubramani_vivekanandan@mentor.com --- kernel/time/tick-broadcast-hrtimer.c | 62 +++++++++++++--------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index c1f5bb590b5e..b5a65e212df2 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -42,39 +42,39 @@ static int bc_shutdown(struct clock_event_device *evt) */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { - int bc_moved; /* - * We try to cancel the timer first. If the callback is on - * flight on some other cpu then we let it handle it. If we - * were able to cancel the timer nothing can rearm it as we - * own broadcast_lock. + * This is called either from enter/exit idle code or from the + * broadcast handler. In all cases tick_broadcast_lock is held. * - * However we can also be called from the event handler of - * ce_broadcast_hrtimer itself when it expires. We cannot - * restart the timer because we are in the callback, but we - * can set the expiry time and let the callback return - * HRTIMER_RESTART. + * hrtimer_cancel() cannot be called here neither from the + * broadcast handler nor from the enter/exit idle code. The idle + * code can run into the problem described in bc_shutdown() and the + * broadcast handler cannot wait for itself to complete for obvious + * reasons. * - * Since we are in the idle loop at this point and because - * hrtimer_{start/cancel} functions call into tracing, - * calls to these functions must be bound within RCU_NONIDLE. + * Each caller tries to arm the hrtimer on its own CPU, but if the + * hrtimer callbback function is currently running, then + * hrtimer_start() cannot move it and the timer stays on the CPU on + * which it is assigned at the moment. + * + * As this can be called from idle code, the hrtimer_start() + * invocation has to be wrapped with RCU_NONIDLE() as + * hrtimer_start() can call into tracing. */ - RCU_NONIDLE( - { - bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; - if (bc_moved) { - hrtimer_start(&bctimer, expires, - HRTIMER_MODE_ABS_PINNED_HARD); - } - } - ); - - if (bc_moved) { - /* Bind the "device" to the cpu */ - bc->bound_on = smp_processor_id(); - } else if (bc->bound_on == smp_processor_id()) { - hrtimer_set_expires(&bctimer, expires); - } + RCU_NONIDLE( { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. + * + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. + */ + bc->bound_on = bctimer.base->cpu_base->cpu; + } ); return 0; } @@ -100,10 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) - if (ce_broadcast_hrtimer.next_event != KTIME_MAX) - return HRTIMER_RESTART; - return HRTIMER_NORESTART; } From 8d6996630c03d7ceeabe2611378fea5ca1c3f1b3 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 27 Sep 2019 16:19:55 +0800 Subject: [PATCH 1379/2880] block: fix null pointer dereference in blk_mq_rq_timed_out() We got a null pointer deference BUG_ON in blk_mq_rq_timed_out() as following: [ 108.825472] BUG: kernel NULL pointer dereference, address: 0000000000000040 [ 108.827059] PGD 0 P4D 0 [ 108.827313] Oops: 0000 [#1] SMP PTI [ 108.827657] CPU: 6 PID: 198 Comm: kworker/6:1H Not tainted 5.3.0-rc8+ #431 [ 108.829503] Workqueue: kblockd blk_mq_timeout_work [ 108.829913] RIP: 0010:blk_mq_check_expired+0x258/0x330 [ 108.838191] Call Trace: [ 108.838406] bt_iter+0x74/0x80 [ 108.838665] blk_mq_queue_tag_busy_iter+0x204/0x450 [ 108.839074] ? __switch_to_asm+0x34/0x70 [ 108.839405] ? blk_mq_stop_hw_queue+0x40/0x40 [ 108.839823] ? blk_mq_stop_hw_queue+0x40/0x40 [ 108.840273] ? syscall_return_via_sysret+0xf/0x7f [ 108.840732] blk_mq_timeout_work+0x74/0x200 [ 108.841151] process_one_work+0x297/0x680 [ 108.841550] worker_thread+0x29c/0x6f0 [ 108.841926] ? rescuer_thread+0x580/0x580 [ 108.842344] kthread+0x16a/0x1a0 [ 108.842666] ? kthread_flush_work+0x170/0x170 [ 108.843100] ret_from_fork+0x35/0x40 The bug is caused by the race between timeout handle and completion for flush request. When timeout handle function blk_mq_rq_timed_out() try to read 'req->q->mq_ops', the 'req' have completed and reinitiated by next flush request, which would call blk_rq_init() to clear 'req' as 0. After commit 12f5b93145 ("blk-mq: Remove generation seqeunce"), normal requests lifetime are protected by refcount. Until 'rq->ref' drop to zero, the request can really be free. Thus, these requests cannot been reused before timeout handle finish. However, flush request has defined .end_io and rq->end_io() is still called even if 'rq->ref' doesn't drop to zero. After that, the 'flush_rq' can be reused by the next flush request handle, resulting in null pointer deference BUG ON. We fix this problem by covering flush request with 'rq->ref'. If the refcount is not zero, flush_end_io() return and wait the last holder recall it. To record the request status, we add a new entry 'rq_status', which will be used in flush_end_io(). Cc: Christoph Hellwig Cc: Keith Busch Cc: Bart Van Assche Cc: stable@vger.kernel.org # v4.18+ Reviewed-by: Ming Lei Reviewed-by: Bob Liu Signed-off-by: Yufen Yu ------- v2: - move rq_status from struct request to struct blk_flush_queue v3: - remove unnecessary '{}' pair. v4: - let spinlock to protect 'fq->rq_status' v5: - move rq_status after flush_running_idx member of struct blk_flush_queue Signed-off-by: Jens Axboe --- block/blk-flush.c | 10 ++++++++++ block/blk-mq.c | 5 ++++- block/blk.h | 7 +++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index aedd9320e605..1eec9cbe5a0a 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -214,6 +214,16 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); + + if (!refcount_dec_and_test(&flush_rq->ref)) { + fq->rq_status = error; + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + return; + } + + if (fq->rq_status != BLK_STS_OK) + error = fq->rq_status; + hctx = flush_rq->mq_hctx; if (!q->elevator) { blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); diff --git a/block/blk-mq.c b/block/blk-mq.c index 29275f5a996f..6e3b15f70cd7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -918,7 +918,10 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, */ if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved); - if (refcount_dec_and_test(&rq->ref)) + + if (is_flush_rq(rq, hctx)) + rq->end_io(rq, 0); + else if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); return true; diff --git a/block/blk.h b/block/blk.h index 25773d668ec0..47fba9362e60 100644 --- a/block/blk.h +++ b/block/blk.h @@ -19,6 +19,7 @@ struct blk_flush_queue { unsigned int flush_queue_delayed:1; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; + blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; @@ -47,6 +48,12 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } +static inline bool +is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) +{ + return hctx->fq->flush_rq == req; +} + struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); From bab32fc069ce8829c416e8737c119f62a57970f9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2019 20:02:38 +0800 Subject: [PATCH 1380/2880] btrfs: qgroup: Fix the wrong target io_tree when freeing reserved data space [BUG] Under the following case with qgroup enabled, if some error happened after we have reserved delalloc space, then in error handling path, we could cause qgroup data space leakage: From btrfs_truncate_block() in inode.c: ret = btrfs_delalloc_reserve_space(inode, &data_reserved, block_start, blocksize); if (ret) goto out; again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); ret = -ENOMEM; goto out; } [CAUSE] In the above case, btrfs_delalloc_reserve_space() will call btrfs_qgroup_reserve_data() and mark the io_tree range with EXTENT_QGROUP_RESERVED flag. In the error handling path, we have the following call stack: btrfs_delalloc_release_space() |- btrfs_free_reserved_data_space() |- btrsf_qgroup_free_data() |- __btrfs_qgroup_release_data(reserved=@reserved, free=1) |- qgroup_free_reserved_data(reserved=@reserved) |- clear_record_extent_bits(); |- freed += changeset.bytes_changed; However due to a completion bug, qgroup_free_reserved_data() will clear EXTENT_QGROUP_RESERVED flag in BTRFS_I(inode)->io_failure_tree, other than the correct BTRFS_I(inode)->io_tree. Since io_failure_tree is never marked with that flag, btrfs_qgroup_free_data() will not free any data reserved space at all, causing a leakage. This type of error handling can only be triggered by errors outside of qgroup code. So EDQUOT error from qgroup can't trigger it. [FIX] Fix the wrong target io_tree. Reported-by: Josef Bacik Fixes: bc42bda22345 ("btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges") CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 52701c1be109..4ab85555a947 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3486,7 +3486,7 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, free_start, free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) From d4e204948fe3e0dc8e1fbf3f8f3290c9c2823be3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2019 20:02:39 +0800 Subject: [PATCH 1381/2880] btrfs: qgroup: Fix reserved data space leak if we have multiple reserve calls [BUG] The following script can cause btrfs qgroup data space leak: mkfs.btrfs -f $dev mount $dev -o nospace_cache $mnt btrfs subv create $mnt/subv btrfs quota en $mnt btrfs quota rescan -w $mnt btrfs qgroup limit 128m $mnt/subv for (( i = 0; i < 3; i++)); do # Create 3 64M holes for latter fallocate to fail truncate -s 192m $mnt/subv/file xfs_io -c "pwrite 64m 4k" $mnt/subv/file > /dev/null xfs_io -c "pwrite 128m 4k" $mnt/subv/file > /dev/null sync # it's supposed to fail, and each failure will leak at least 64M # data space xfs_io -f -c "falloc 0 192m" $mnt/subv/file &> /dev/null rm $mnt/subv/file sync done # Shouldn't fail after we removed the file xfs_io -f -c "falloc 0 64m" $mnt/subv/file [CAUSE] Btrfs qgroup data reserve code allow multiple reservations to happen on a single extent_changeset: E.g: btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_1M); btrfs_qgroup_reserve_data(inode, &data_reserved, SZ_1M, SZ_2M); btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_4M); Btrfs qgroup code has its internal tracking to make sure we don't double-reserve in above example. The only pattern utilizing this feature is in the main while loop of btrfs_fallocate() function. However btrfs_qgroup_reserve_data()'s error handling has a bug in that on error it clears all ranges in the io_tree with EXTENT_QGROUP_RESERVED flag but doesn't free previously reserved bytes. This bug has a two fold effect: - Clearing EXTENT_QGROUP_RESERVED ranges This is the correct behavior, but it prevents btrfs_qgroup_check_reserved_leak() to catch the leakage as the detector is purely EXTENT_QGROUP_RESERVED flag based. - Leak the previously reserved data bytes. The bug manifests when N calls to btrfs_qgroup_reserve_data are made and the last one fails, leaking space reserved in the previous ones. [FIX] Also free previously reserved data bytes when btrfs_qgroup_reserve_data fails. Fixes: 524725537023 ("btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4ab85555a947..c4bb69941c77 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3442,6 +3442,9 @@ cleanup: while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); + /* Also free data bytes of already reserved one */ + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, + orig_reserved, BTRFS_QGROUP_RSV_DATA); extent_changeset_release(reserved); return ret; } From 19a36d329f5b1e5d3d06f7093de992693d0fdee6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 26 Aug 2019 15:30:23 -0400 Subject: [PATCH 1382/2880] KVM: VMX: Set VMENTER_L1D_FLUSH_NOT_REQUIRED if !X86_BUG_L1TF The l1tf_vmx_mitigation is only set to VMENTER_L1D_FLUSH_NOT_REQUIRED when the ARCH_CAPABILITIES MSR indicates that L1D flush is not required. However, if the CPU is not affected by L1TF, l1tf_vmx_mitigation will still be set to VMENTER_L1D_FLUSH_AUTO. This is certainly not the best option for a !X86_BUG_L1TF CPU. So force l1tf_vmx_mitigation to VMENTER_L1D_FLUSH_NOT_REQUIRED to make it more explicit in case users are checking the vmentry_l1d_flush parameter. Signed-off-by: Waiman Long [Patch rewritten accoring to Borislav Petkov's suggestion. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d4575ffb3cec..e7970a2e8eae 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -209,6 +209,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) struct page *page; unsigned int i; + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + if (!enable_ept) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; return 0; @@ -7995,12 +8000,10 @@ static int __init vmx_init(void) * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ - if (boot_cpu_has(X86_BUG_L1TF)) { - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; } #ifdef CONFIG_KEXEC_CORE From 2e4a75976dfb88ee6ce019b1d4fa507aabd02c14 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 27 Sep 2019 17:54:13 +0200 Subject: [PATCH 1383/2880] KVM: selftests: x86: clarify what is reported on KVM_GET_MSRS failure When KVM_GET_MSRS fail the report looks like ==== Test Assertion Failure ==== lib/x86_64/processor.c:1089: r == nmsrs pid=28775 tid=28775 - Argument list too long 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) 3 0x00007fb8e69223d4: ?? ??:0 4 0x0000000000401287: _start at ??:? Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) and it's not obvious that '194' here is the failed MSR index and that it's printed in hex. Change that. Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index c53dbc6bc568..6698cb741e10 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1085,7 +1085,7 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) for (i = 0; i < nmsrs; i++) state->msrs.entries[i].index = list->indices[i]; r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); - TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed at %x)", + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", r, r == nmsrs ? -1 : list->indices[r]); r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); From ea1e2bbec6edb34fd13032bdfc80276526e87967 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Fri, 27 Sep 2019 17:18:05 +0100 Subject: [PATCH 1384/2880] keys: Add Jarkko Sakkinen as co-maintainer To address a major procedural concern on Linus's part the keyrings needs a co-maintainer. Suggested-by: David Howells Signed-off-by: Jarkko Sakkinen Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index a97f1be63b9d..d9ebe514708b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9060,6 +9060,7 @@ F: include/keys/trusted.h KEYS/KEYRINGS: M: David Howells +M: Jarkko Sakkinen L: keyrings@vger.kernel.org S: Maintained F: Documentation/security/keys/core.rst From 67b483dd03c4cd9e90e4c3943132dce514ea4e88 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 24 Sep 2019 11:27:05 -0700 Subject: [PATCH 1385/2880] nvme-rdma: fix possible use-after-free in connect timeout If the connect times out, we may have already destroyed the queue in the timeout handler, so test if the queue is still allocated in the connect error handler. Reported-by: Yi Zhang Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 9d16dfc29368..4d280160dd3f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -620,7 +620,8 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) if (!ret) { set_bit(NVME_RDMA_Q_LIVE, &queue->flags); } else { - __nvme_rdma_stop_queue(queue); + if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) + __nvme_rdma_stop_queue(queue); dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); } From a12de1d42d74ef3c80e9fb9a2da94daaef747869 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Sep 2019 15:24:30 +0800 Subject: [PATCH 1386/2880] blk-mq: honor IO scheduler for multiqueue devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a device is using multiple queues, the IO scheduler may be bypassed. This may hurt performance for some slow MQ devices, and it also breaks zoned devices which depend on mq-deadline for respecting the write order in one zone. Don't bypass io scheduler if we have one setup. This patch can double sequential write performance basically on MQ scsi_debug when mq-deadline is applied. Cc: Bart Van Assche Cc: Hannes Reinecke Cc: Dave Chinner Reviewed-by: Javier González Reviewed-by: Damien Le Moal Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 6e3b15f70cd7..5d4bef3c378d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2012,6 +2012,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } blk_add_rq_to_plug(plug, rq); + } else if (q->elevator) { + blk_mq_sched_insert_request(rq, false, true, true); } else if (plug && !blk_queue_nomerges(q)) { /* * We do limited plugging. If the bio can be merged, do that. @@ -2035,8 +2037,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } - } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && - !data.hctx->dispatch_busy)) { + } else if ((q->nr_hw_queues > 1 && is_sync) || + !data.hctx->dispatch_busy) { blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { blk_mq_sched_insert_request(rq, false, true, true); From 3154df262db52f1d27e01020e871f4345ec2f9a8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Sep 2019 15:24:31 +0800 Subject: [PATCH 1387/2880] blk-mq: apply normal plugging for HDD Some HDD drive may expose multiple hardware queues, such as MegraRaid. Let's apply the normal plugging for such devices because sequential IO may benefit a lot from plug merging. Cc: Bart Van Assche Cc: Hannes Reinecke Cc: Dave Chinner Reviewed-by: Damien Le Moal Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5d4bef3c378d..ec791156e9cc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1992,10 +1992,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) /* bypass scheduler for flush rq */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { + } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs || + !blk_queue_nonrot(q))) { /* * Use plugging if we have a ->commit_rqs() hook as well, as * we know the driver uses bd->last in a smart fashion. + * + * Use normal plugging if this disk is slow HDD, as sequential + * IO may benefit a lot from plug merging. */ unsigned int request_count = plug->rq_count; struct request *last = NULL; From dac91170f8e9c73784af5fad6225e954b795601c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 25 Sep 2019 07:53:19 -0700 Subject: [PATCH 1388/2880] vrf: Do not attempt to create IPv6 mcast rule if IPv6 is disabled A user reported that vrf create fails when IPv6 is disabled at boot using 'ipv6.disable=1': https://bugzilla.kernel.org/show_bug.cgi?id=204903 The failure is adding fib rules at create time. Add RTNL_FAMILY_IP6MR to the check in vrf_fib_rule if ipv6_mod_enabled is disabled. Fixes: e4a38c0c4b27 ("ipv6: add vrf table handling code for ipv6 mcast") Signed-off-by: David Ahern Cc: Patrick Ruddy Signed-off-by: David S. Miller --- drivers/net/vrf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 6e84328bdd40..a4b38a980c3c 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1154,7 +1154,8 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) struct sk_buff *skb; int err; - if (family == AF_INET6 && !ipv6_mod_enabled()) + if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && + !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); From dfe5999dc03e321d08190772c98843284d5cf419 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 25 Sep 2019 18:02:35 +0300 Subject: [PATCH 1389/2880] net/sched: Set default of CONFIG_NET_TC_SKB_EXT to N This a new feature, it is preferred that it defaults to N. We will probe the feature support from userspace before actually using it. Fixes: 95a7233c452a ('net: openvswitch: Set OvS recirc_id from tc chain index') Signed-off-by: Paul Blakey Signed-off-by: David S. Miller --- net/sched/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 5b044ae6dc1e..2985509147a2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -966,7 +966,6 @@ config NET_IFE_SKBTCINDEX config NET_TC_SKB_EXT bool "TC recirculation support" depends on NET_CLS_ACT - default y if NET_CLS_ACT select SKB_EXTENSIONS help From 8572cea1461a006bce1d06c0c4b0575869125fa4 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Wed, 25 Sep 2019 13:24:02 -0500 Subject: [PATCH 1390/2880] nfp: flower: prevent memory leak in nfp_flower_spawn_phy_reprs In nfp_flower_spawn_phy_reprs, in the for loop over eth_tbl if any of intermediate allocations or initializations fail memory is leaked. requiered releases are added. Fixes: b94524529741 ("nfp: flower: add per repr private data for LAG offload") Signed-off-by: Navid Emamdoost Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index 7a20447cca19..91a47899220f 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -515,6 +515,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL); if (!repr_priv) { err = -ENOMEM; + nfp_repr_free(repr); goto err_reprs_clean; } @@ -525,11 +526,13 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, repr); if (IS_ERR(port)) { err = PTR_ERR(port); + kfree(repr_priv); nfp_repr_free(repr); goto err_reprs_clean; } err = nfp_port_init_phy_port(app->pf, app, port, i); if (err) { + kfree(repr_priv); nfp_port_free(port); nfp_repr_free(repr); goto err_reprs_clean; @@ -542,6 +545,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) err = nfp_repr_init(app, repr, cmsg_port_id, port, priv->nn->dp.netdev); if (err) { + kfree(repr_priv); nfp_port_free(port); nfp_repr_free(repr); goto err_reprs_clean; From 8ce39eb5a67aee25d9f05b40b673c95b23502e3e Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Wed, 25 Sep 2019 14:05:09 -0500 Subject: [PATCH 1391/2880] nfp: flower: fix memory leak in nfp_flower_spawn_vnic_reprs In nfp_flower_spawn_vnic_reprs in the loop if initialization or the allocations fail memory is leaked. Appropriate releases are added. Fixes: b94524529741 ("nfp: flower: add per repr private data for LAG offload") Signed-off-by: Navid Emamdoost Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index 91a47899220f..d8ad9346a26a 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -400,6 +400,7 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL); if (!repr_priv) { err = -ENOMEM; + nfp_repr_free(repr); goto err_reprs_clean; } @@ -413,6 +414,7 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, port = nfp_port_alloc(app, port_type, repr); if (IS_ERR(port)) { err = PTR_ERR(port); + kfree(repr_priv); nfp_repr_free(repr); goto err_reprs_clean; } @@ -433,6 +435,7 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, err = nfp_repr_init(app, repr, port_id, port, priv->nn->dp.netdev); if (err) { + kfree(repr_priv); nfp_port_free(port); nfp_repr_free(repr); goto err_reprs_clean; From a3aa6e65beebf3780026753ebf39db19f4c92990 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 26 Sep 2019 00:08:42 +0200 Subject: [PATCH 1392/2880] net: dsa: microchip: Always set regmap stride to 1 The regmap stride is set to 1 for regmap describing 8bit registers already. However, for 16/32/64bit registers, the stride is 2/4/8 respectively. This is not correct, as the switch protocol supports unaligned register reads and writes and the KSZ87xx even uses such unaligned register accesses to read e.g. MIB counter. This patch fixes MIB counter access on KSZ87xx. Signed-off-by: Marek Vasut Cc: Andrew Lunn Cc: David S. Miller Cc: Florian Fainelli Cc: George McCollister Cc: Tristram Ha Cc: Vivien Didelot Cc: Woojung Huh Fixes: 46558d601cb6 ("net: dsa: microchip: Initial SPI regmap support") Fixes: 255b59ad0db2 ("net: dsa: microchip: Factor out regmap config generation into common header") Reviewed-by: George McCollister Tested-by: George McCollister Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index a24d8e61fbe7..dd60d0837fc6 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -303,7 +303,7 @@ static inline void ksz_pwrite32(struct ksz_device *dev, int port, int offset, { \ .name = #width, \ .val_bits = (width), \ - .reg_stride = (width) / 8, \ + .reg_stride = 1, \ .reg_bits = (regbits) + (regalign), \ .pad_bits = (regpad), \ .max_register = BIT(regbits) - 1, \ From 991ad2b24da2fa7b4ba9775ca1fed2d660d54ab0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Sep 2019 17:20:42 -0700 Subject: [PATCH 1393/2880] lib: dimlib: fix help text typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix help text typos for DIMLIB. Fixes: 4f75da3666c0 ("linux/dim: Move implementation to .c files") Signed-off-by: Randy Dunlap Cc: Uwe Kleine-König Cc: Tal Gilboa Cc: Saeed Mahameed Acked-by: Uwe Kleine-König Signed-off-by: David S. Miller --- lib/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig b/lib/Kconfig index d7fc9eb33b9b..183f92a297ca 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -558,7 +558,7 @@ config DIMLIB bool help Dynamic Interrupt Moderation library. - Implements an algorithm for dynamically change CQ modertion values + Implements an algorithm for dynamically changing CQ moderation values according to run time performance. # From 2df4de1681767df900e15e34195bbf7dc1b23e06 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 25 Sep 2019 19:28:19 -0700 Subject: [PATCH 1394/2880] ptp: correctly disable flags on old ioctls Commit 415606588c61 ("PTP: introduce new versions of IOCTLs", 2019-09-13) introduced new versions of the PTP ioctls which actually validate that the flags are acceptable values. As part of this, it cleared the flags value using a bitwise and+negation, in an attempt to prevent the old ioctl from accidentally enabling new features. This is incorrect for a couple of reasons. First, it results in accidentally preventing previously working flags on the request ioctl. By clearing the "valid" flags, we now no longer allow setting the enable, rising edge, or falling edge flags. Second, if we add new additional flags in the future, they must not be set by the old ioctl. (Since the flag wasn't checked before, we could potentially break userspace programs which sent garbage flag data. The correct way to resolve this is to check for and clear all but the originally valid flags. Create defines indicating which flags are correctly checked and interpreted by the original ioctls. Use these to clear any bits which will not be correctly interpreted by the original ioctls. In the future, new flags must be added to the VALID_FLAGS macros, but *not* to the V1_VALID_FLAGS macros. In this way, new features may be exposed over the v2 ioctls, but without breaking previous userspace which happened to not clear the flags value properly. The old ioctl will continue to behave the same way, while the new ioctl gains the benefit of using the flags fields. Cc: Richard Cochran Cc: Felipe Balbi Cc: David S. Miller Cc: Christopher Hall Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 4 ++-- include/uapi/linux/ptp_clock.h | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 9c18476d8d10..67d0199840fd 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -155,7 +155,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EINVAL; break; } else if (cmd == PTP_EXTTS_REQUEST) { - req.extts.flags &= ~PTP_EXTTS_VALID_FLAGS; + req.extts.flags &= PTP_EXTTS_V1_VALID_FLAGS; req.extts.rsv[0] = 0; req.extts.rsv[1] = 0; } @@ -184,7 +184,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EINVAL; break; } else if (cmd == PTP_PEROUT_REQUEST) { - req.perout.flags &= ~PTP_PEROUT_VALID_FLAGS; + req.perout.flags &= PTP_PEROUT_V1_VALID_FLAGS; req.perout.rsv[0] = 0; req.perout.rsv[1] = 0; req.perout.rsv[2] = 0; diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index f16301015949..59e89a1bc3bb 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -31,15 +31,37 @@ #define PTP_ENABLE_FEATURE (1<<0) #define PTP_RISING_EDGE (1<<1) #define PTP_FALLING_EDGE (1<<2) + +/* + * flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl. + */ #define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \ PTP_RISING_EDGE | \ PTP_FALLING_EDGE) +/* + * flag fields valid for the original PTP_EXTTS_REQUEST ioctl. + * DO NOT ADD NEW FLAGS HERE. + */ +#define PTP_EXTTS_V1_VALID_FLAGS (PTP_ENABLE_FEATURE | \ + PTP_RISING_EDGE | \ + PTP_FALLING_EDGE) + /* * Bits of the ptp_perout_request.flags field: */ #define PTP_PEROUT_ONE_SHOT (1<<0) + +/* + * flag fields valid for the new PTP_PEROUT_REQUEST2 ioctl. + */ #define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT) + +/* + * No flags are valid for the original PTP_PEROUT_REQUEST ioctl + */ +#define PTP_PEROUT_V1_VALID_FLAGS (0) + /* * struct ptp_clock_time - represents a time value * From fd4a8093ec0bd37d450587d50f3e10f0af57cc47 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 26 Sep 2019 15:35:10 +0900 Subject: [PATCH 1395/2880] net: socionext: ave: Avoid using netdev_err() before calling register_netdev() Until calling register_netdev(), ndev->dev_name isn't specified, and netdev_err() displays "(unnamed net_device)". ave 65000000.ethernet (unnamed net_device) (uninitialized): invalid phy-mode setting ave: probe of 65000000.ethernet failed with error -22 This replaces netdev_err() with dev_err() before calling register_netdev(). Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/sni_ave.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index d047a53f34f2..6e984d5a729f 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -1662,19 +1662,19 @@ static int ave_probe(struct platform_device *pdev) "socionext,syscon-phy-mode", 1, 0, &args); if (ret) { - netdev_err(ndev, "can't get syscon-phy-mode property\n"); + dev_err(dev, "can't get syscon-phy-mode property\n"); goto out_free_netdev; } priv->regmap = syscon_node_to_regmap(args.np); of_node_put(args.np); if (IS_ERR(priv->regmap)) { - netdev_err(ndev, "can't map syscon-phy-mode\n"); + dev_err(dev, "can't map syscon-phy-mode\n"); ret = PTR_ERR(priv->regmap); goto out_free_netdev; } ret = priv->data->get_pinmode(priv, phy_mode, args.args[0]); if (ret) { - netdev_err(ndev, "invalid phy-mode setting\n"); + dev_err(dev, "invalid phy-mode setting\n"); goto out_free_netdev; } From 407d8098cb1ab338199f4753162799a488d87d23 Mon Sep 17 00:00:00 2001 From: Hans Andersson Date: Thu, 26 Sep 2019 09:54:37 +0200 Subject: [PATCH 1396/2880] net: phy: micrel: add Asym Pause workaround for KSZ9021 The Micrel KSZ9031 PHY may fail to establish a link when the Asymmetric Pause capability is set. This issue is described in a Silicon Errata (DS80000691D or DS80000692D), which advises to always disable the capability. Micrel KSZ9021 has no errata, but has the same issue with Asymmetric Pause. This patch apply the same workaround as the one for KSZ9031. Fixes: 3aed3e2a143c ("net: phy: micrel: add Asym Pause workaround") Signed-off-by: Hans Andersson Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 3c8186f269f9..2fea5541c35a 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -763,6 +763,8 @@ static int ksz9031_get_features(struct phy_device *phydev) * Whenever the device's Asymmetric Pause capability is set to 1, * link-up may fail after a link-up to link-down transition. * + * The Errata Sheet is for ksz9031, but ksz9021 has the same issue + * * Workaround: * Do not enable the Asymmetric Pause capability bit. */ @@ -1076,6 +1078,7 @@ static struct phy_driver ksphy_driver[] = { /* PHY_GBIT_FEATURES */ .driver_data = &ksz9021_type, .probe = kszphy_probe, + .get_features = ksz9031_get_features, .config_init = ksz9021_config_init, .ack_interrupt = kszphy_ack_interrupt, .config_intr = kszphy_config_intr, From d1c536e3177390da43d99f20143b810c35433d1f Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 22 Sep 2019 11:26:53 +0100 Subject: [PATCH 1397/2880] mmc: sdhci: improve ADMA error reporting ADMA errors are potentially data corrupting events; although we print the register state, we do not usefully print the ADMA descriptors. Worse than that, we print them by referencing their virtual address which is meaningless when the register state gives us the DMA address of the failing descriptor. Print the ADMA descriptors giving their DMA addresses rather than their virtual addresses, and print them using SDHCI_DUMP() rather than DBG(). We also do not show the correct value of the interrupt status register; the register dump shows the current value, after we have cleared the pending interrupts we are going to service. What is more useful is to print the interrupts that _were_ pending at the time the ADMA error was encountered. Fix that too. Signed-off-by: Russell King Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 4b297f397326..922a5b594c5e 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2874,6 +2874,7 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p) static void sdhci_adma_show_error(struct sdhci_host *host) { void *desc = host->adma_table; + dma_addr_t dma = host->adma_addr; sdhci_dumpregs(host); @@ -2881,18 +2882,21 @@ static void sdhci_adma_show_error(struct sdhci_host *host) struct sdhci_adma2_64_desc *dma_desc = desc; if (host->flags & SDHCI_USE_64_BIT_DMA) - DBG("%p: DMA 0x%08x%08x, LEN 0x%04x, Attr=0x%02x\n", - desc, le32_to_cpu(dma_desc->addr_hi), + SDHCI_DUMP("%08llx: DMA 0x%08x%08x, LEN 0x%04x, Attr=0x%02x\n", + (unsigned long long)dma, + le32_to_cpu(dma_desc->addr_hi), le32_to_cpu(dma_desc->addr_lo), le16_to_cpu(dma_desc->len), le16_to_cpu(dma_desc->cmd)); else - DBG("%p: DMA 0x%08x, LEN 0x%04x, Attr=0x%02x\n", - desc, le32_to_cpu(dma_desc->addr_lo), + SDHCI_DUMP("%08llx: DMA 0x%08x, LEN 0x%04x, Attr=0x%02x\n", + (unsigned long long)dma, + le32_to_cpu(dma_desc->addr_lo), le16_to_cpu(dma_desc->len), le16_to_cpu(dma_desc->cmd)); desc += host->desc_sz; + dma += host->desc_sz; if (dma_desc->cmd & cpu_to_le16(ADMA2_END)) break; @@ -2968,7 +2972,8 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) != MMC_BUS_TEST_R) host->data->error = -EILSEQ; else if (intmask & SDHCI_INT_ADMA_ERROR) { - pr_err("%s: ADMA error\n", mmc_hostname(host->mmc)); + pr_err("%s: ADMA error: 0x%08x\n", mmc_hostname(host->mmc), + intmask); sdhci_adma_show_error(host); host->data->error = -EIO; if (host->ops->adma_workaround) From 121bd08b029e03404c451bb237729cdff76eafed Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 22 Sep 2019 11:26:58 +0100 Subject: [PATCH 1398/2880] mmc: sdhci-of-esdhc: set DMA snooping based on DMA coherence We must not unconditionally set the DMA snoop bit; if the DMA API is assuming that the device is not DMA coherent, and the device snoops the CPU caches, the device can see stale cache lines brought in by speculative prefetch. This leads to the device seeing stale data, potentially resulting in corrupted data transfers. Commonly, this results in a descriptor fetch error such as: mmc0: ADMA error mmc0: sdhci: ============ SDHCI REGISTER DUMP =========== mmc0: sdhci: Sys addr: 0x00000000 | Version: 0x00002202 mmc0: sdhci: Blk size: 0x00000008 | Blk cnt: 0x00000001 mmc0: sdhci: Argument: 0x00000000 | Trn mode: 0x00000013 mmc0: sdhci: Present: 0x01f50008 | Host ctl: 0x00000038 mmc0: sdhci: Power: 0x00000003 | Blk gap: 0x00000000 mmc0: sdhci: Wake-up: 0x00000000 | Clock: 0x000040d8 mmc0: sdhci: Timeout: 0x00000003 | Int stat: 0x00000001 mmc0: sdhci: Int enab: 0x037f108f | Sig enab: 0x037f108b mmc0: sdhci: ACmd stat: 0x00000000 | Slot int: 0x00002202 mmc0: sdhci: Caps: 0x35fa0000 | Caps_1: 0x0000af00 mmc0: sdhci: Cmd: 0x0000333a | Max curr: 0x00000000 mmc0: sdhci: Resp[0]: 0x00000920 | Resp[1]: 0x001d8a33 mmc0: sdhci: Resp[2]: 0x325b5900 | Resp[3]: 0x3f400e00 mmc0: sdhci: Host ctl2: 0x00000000 mmc0: sdhci: ADMA Err: 0x00000009 | ADMA Ptr: 0x000000236d43820c mmc0: sdhci: ============================================ mmc0: error -5 whilst initialising SD card but can lead to other errors, and potentially direct the SDHCI controller to read/write data to other memory locations (e.g. if a valid descriptor is visible to the device in a stale cache line.) Fix this by ensuring that the DMA snoop bit corresponds with the behaviour of the DMA API. Since the driver currently only supports DT, use of_dma_is_coherent(). Note that device_get_dma_attr() can not be used as that risks re-introducing this bug if/when the driver is converted to ACPI. Signed-off-by: Russell King Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-esdhc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index 3271c2d76629..1d1953dfc54b 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -495,7 +495,12 @@ static int esdhc_of_enable_dma(struct sdhci_host *host) dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40)); value = sdhci_readl(host, ESDHC_DMA_SYSCTL); - value |= ESDHC_DMA_SNOOP; + + if (of_dma_is_coherent(dev->of_node)) + value |= ESDHC_DMA_SNOOP; + else + value &= ~ESDHC_DMA_SNOOP; + sdhci_writel(host, value, ESDHC_DMA_SYSCTL); return 0; } From 4ee7dde4c777f14cb0f98dd201491bf6cc15899b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 23 Sep 2019 12:08:09 +0200 Subject: [PATCH 1399/2880] mmc: sdhci: Let drivers define their DMA mask Add host operation ->set_dma_mask() so that drivers can define their own DMA masks. Signed-off-by: Adrian Hunter Tested-by: Nicolin Chen Signed-off-by: Thierry Reding Cc: stable@vger.kernel.org # v4.15 + Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 12 ++++-------- drivers/mmc/host/sdhci.h | 1 + 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 922a5b594c5e..b056400e34b1 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -3781,18 +3781,14 @@ int sdhci_setup_host(struct sdhci_host *host) host->flags &= ~SDHCI_USE_ADMA; } - /* - * It is assumed that a 64-bit capable device has set a 64-bit DMA mask - * and *must* do 64-bit DMA. A driver has the opportunity to change - * that during the first call to ->enable_dma(). Similarly - * SDHCI_QUIRK2_BROKEN_64_BIT_DMA must be left to the drivers to - * implement. - */ if (sdhci_can_64bit_dma(host)) host->flags |= SDHCI_USE_64_BIT_DMA; if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { - ret = sdhci_set_dma_mask(host); + if (host->ops->set_dma_mask) + ret = host->ops->set_dma_mask(host); + else + ret = sdhci_set_dma_mask(host); if (!ret && host->ops->enable_dma) ret = host->ops->enable_dma(host); diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index a29c4cd2d92e..0ed3e0eaef5f 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -622,6 +622,7 @@ struct sdhci_ops { u32 (*irq)(struct sdhci_host *host, u32 intmask); + int (*set_dma_mask)(struct sdhci_host *host); int (*enable_dma)(struct sdhci_host *host); unsigned int (*get_max_clock)(struct sdhci_host *host); unsigned int (*get_min_clock)(struct sdhci_host *host); From b960bc448a252428bacca271f3416a8bda3b599b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 23 Sep 2019 12:08:10 +0200 Subject: [PATCH 1400/2880] mmc: tegra: Implement ->set_dma_mask() The SDHCI controller on Tegra186 supports 40-bit addressing, which is usually enough to address all of system memory. However, if the SDHCI controller is behind an IOMMU, the address space can go beyond. This happens on Tegra186 and later where the ARM SMMU has an input address space of 48 bits. If the DMA API is backed by this ARM SMMU, the top- down IOVA allocator will cause IOV addresses to be returned that the SDHCI controller cannot access. Unfortunately, prior to the introduction of the ->set_dma_mask() host operation, the SDHCI core would set either a 64-bit DMA mask if the controller claimed to support 64-bit addressing, or a 32-bit DMA mask otherwise. Since the full 64 bits cannot be addressed on Tegra, this had to be worked around in commit 68481a7e1c84 ("mmc: tegra: Mark 64 bit dma broken on Tegra186") by setting the SDHCI_QUIRK2_BROKEN_64_BIT_DMA quirk, which effectively restricts the DMA mask to 32 bits. One disadvantage of this is that dma_map_*() APIs will now try to use the swiotlb to bounce DMA to addresses beyond of the controller's DMA mask. This in turn caused degraded performance and can lead to situations where the swiotlb buffer is exhausted, which in turn leads to DMA transfers to fail. With the recent introduction of the ->set_dma_mask() host operation, this can now be properly fixed. For each generation of Tegra, the exact supported DMA mask can be configured. This kills two birds with one stone: it avoids the use of bounce buffers because system memory never exceeds the addressable memory range of the SDHCI controllers on these devices, and at the same time when an IOMMU is involved, it prevents IOV addresses from being allocated beyond the addressible range of the controllers. Since the DMA mask is now properly handled, the 64-bit DMA quirk can be removed. Signed-off-by: Nicolin Chen [treding@nvidia.com: provide more background in commit message] Tested-by: Nicolin Chen Acked-by: Adrian Hunter Signed-off-by: Thierry Reding Cc: stable@vger.kernel.org # v4.15 + Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-tegra.c | 48 ++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 02d8f524bb9e..7bc950520fd9 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -104,6 +105,7 @@ struct sdhci_tegra_soc_data { const struct sdhci_pltfm_data *pdata; + u64 dma_mask; u32 nvquirks; u8 min_tap_delay; u8 max_tap_delay; @@ -1233,11 +1235,25 @@ static const struct cqhci_host_ops sdhci_tegra_cqhci_ops = { .update_dcmd_desc = sdhci_tegra_update_dcmd_desc, }; +static int tegra_sdhci_set_dma_mask(struct sdhci_host *host) +{ + struct sdhci_pltfm_host *platform = sdhci_priv(host); + struct sdhci_tegra *tegra = sdhci_pltfm_priv(platform); + const struct sdhci_tegra_soc_data *soc = tegra->soc_data; + struct device *dev = mmc_dev(host->mmc); + + if (soc->dma_mask) + return dma_set_mask_and_coherent(dev, soc->dma_mask); + + return 0; +} + static const struct sdhci_ops tegra_sdhci_ops = { .get_ro = tegra_sdhci_get_ro, .read_w = tegra_sdhci_readw, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .platform_execute_tuning = tegra_sdhci_execute_tuning, @@ -1257,6 +1273,7 @@ static const struct sdhci_pltfm_data sdhci_tegra20_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra20 = { .pdata = &sdhci_tegra20_pdata, + .dma_mask = DMA_BIT_MASK(32), .nvquirks = NVQUIRK_FORCE_SDHCI_SPEC_200 | NVQUIRK_ENABLE_BLOCK_GAP_DET, }; @@ -1283,6 +1300,7 @@ static const struct sdhci_pltfm_data sdhci_tegra30_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra30 = { .pdata = &sdhci_tegra30_pdata, + .dma_mask = DMA_BIT_MASK(32), .nvquirks = NVQUIRK_ENABLE_SDHCI_SPEC_300 | NVQUIRK_ENABLE_SDR50 | NVQUIRK_ENABLE_SDR104 | @@ -1295,6 +1313,7 @@ static const struct sdhci_ops tegra114_sdhci_ops = { .write_w = tegra_sdhci_writew, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .platform_execute_tuning = tegra_sdhci_execute_tuning, @@ -1316,6 +1335,7 @@ static const struct sdhci_pltfm_data sdhci_tegra114_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra114 = { .pdata = &sdhci_tegra114_pdata, + .dma_mask = DMA_BIT_MASK(32), }; static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { @@ -1325,22 +1345,13 @@ static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { SDHCI_QUIRK_NO_HISPD_BIT | SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, - .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | - /* - * The TRM states that the SD/MMC controller found on - * Tegra124 can address 34 bits (the maximum supported by - * the Tegra memory controller), but tests show that DMA - * to or from above 4 GiB doesn't work. This is possibly - * caused by missing programming, though it's not obvious - * what sequence is required. Mark 64-bit DMA broken for - * now to fix this for existing users (e.g. Nyan boards). - */ - SDHCI_QUIRK2_BROKEN_64_BIT_DMA, + .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .ops = &tegra114_sdhci_ops, }; static const struct sdhci_tegra_soc_data soc_data_tegra124 = { .pdata = &sdhci_tegra124_pdata, + .dma_mask = DMA_BIT_MASK(34), }; static const struct sdhci_ops tegra210_sdhci_ops = { @@ -1349,6 +1360,7 @@ static const struct sdhci_ops tegra210_sdhci_ops = { .write_w = tegra210_sdhci_writew, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .set_uhs_signaling = tegra_sdhci_set_uhs_signaling, @@ -1369,6 +1381,7 @@ static const struct sdhci_pltfm_data sdhci_tegra210_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra210 = { .pdata = &sdhci_tegra210_pdata, + .dma_mask = DMA_BIT_MASK(34), .nvquirks = NVQUIRK_NEEDS_PAD_CONTROL | NVQUIRK_HAS_PADCALIB | NVQUIRK_DIS_CARD_CLK_CONFIG_TAP | @@ -1383,6 +1396,7 @@ static const struct sdhci_ops tegra186_sdhci_ops = { .read_w = tegra_sdhci_readw, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .set_uhs_signaling = tegra_sdhci_set_uhs_signaling, @@ -1398,20 +1412,13 @@ static const struct sdhci_pltfm_data sdhci_tegra186_pdata = { SDHCI_QUIRK_NO_HISPD_BIT | SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, - .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | - /* SDHCI controllers on Tegra186 support 40-bit addressing. - * IOVA addresses are 48-bit wide on Tegra186. - * With 64-bit dma mask used for SDHCI, accesses can - * be broken. Disable 64-bit dma, which would fall back - * to 32-bit dma mask. Ideally 40-bit dma mask would work, - * But it is not supported as of now. - */ - SDHCI_QUIRK2_BROKEN_64_BIT_DMA, + .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .ops = &tegra186_sdhci_ops, }; static const struct sdhci_tegra_soc_data soc_data_tegra186 = { .pdata = &sdhci_tegra186_pdata, + .dma_mask = DMA_BIT_MASK(40), .nvquirks = NVQUIRK_NEEDS_PAD_CONTROL | NVQUIRK_HAS_PADCALIB | NVQUIRK_DIS_CARD_CLK_CONFIG_TAP | @@ -1424,6 +1431,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra186 = { static const struct sdhci_tegra_soc_data soc_data_tegra194 = { .pdata = &sdhci_tegra186_pdata, + .dma_mask = DMA_BIT_MASK(39), .nvquirks = NVQUIRK_NEEDS_PAD_CONTROL | NVQUIRK_HAS_PADCALIB | NVQUIRK_DIS_CARD_CLK_CONFIG_TAP | From 6ba5bbba95f789d76ce3bf440ee02fdaf52ec486 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 26 Sep 2019 12:13:06 +0100 Subject: [PATCH 1401/2880] NFC: st95hf: clean up indentation issue The return statement is indented incorrectly, add in a missing tab and remove an extraneous space after the return Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/nfc/st95hf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nfc/st95hf/core.c b/drivers/nfc/st95hf/core.c index 7eda62a9e0df..9642971e89ce 100644 --- a/drivers/nfc/st95hf/core.c +++ b/drivers/nfc/st95hf/core.c @@ -661,7 +661,7 @@ static int st95hf_error_handling(struct st95hf_context *stcontext, result = -ETIMEDOUT; else result = -EIO; - return result; + return result; } /* Check for CRC err only if CRC is present in the tag response */ From 4208966f65f520d7f392dbaa62e39a8fa88ffb95 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 26 Sep 2019 12:22:52 +0100 Subject: [PATCH 1402/2880] net: ena: clean up indentation issue There memset is indented incorrectly, remove the extraneous tabs. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_eth_com.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_com.c b/drivers/net/ethernet/amazon/ena/ena_eth_com.c index 38046bf0ff44..2845ac277724 100644 --- a/drivers/net/ethernet/amazon/ena/ena_eth_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_eth_com.c @@ -211,8 +211,8 @@ static int ena_com_sq_update_llq_tail(struct ena_com_io_sq *io_sq) pkt_ctrl->curr_bounce_buf = ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl); - memset(io_sq->llq_buf_ctrl.curr_bounce_buf, - 0x0, llq_info->desc_list_entry_size); + memset(io_sq->llq_buf_ctrl.curr_bounce_buf, + 0x0, llq_info->desc_list_entry_size); pkt_ctrl->idx = 0; if (unlikely(llq_info->desc_stride_ctrl == ENA_ADMIN_SINGLE_DESC_PER_ENTRY)) From 979b9b251ae06e3408153bd7b9342a290d65e826 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 26 Sep 2019 14:43:38 +0300 Subject: [PATCH 1403/2880] mlxsw: spectrum: Clear VLAN filters during port initialization When a port is created, its VLAN filters are not cleared by the firmware. This causes tagged packets to be later dropped by the ingress STP filters, which default to DISCARD state. The above did not matter much until commit b5ce611fd96e ("mlxsw: spectrum: Add devlink-trap support") where we exposed the drop reason to users. Without this patch, the drop reason users will see is not consistent. If a port is enslaved to a VLAN-aware bridge and a packet with an invalid VLAN tries to ingress the bridge, it will be dropped due to ingress STP filter. If the VLAN is later enabled and then disabled, the packet will be dropped by the ingress VLAN filter despite the above being a seemingly NOP operation. Fix this by clearing all the VLAN filters during port initialization. Adjust the test accordingly. Fixes: b5ce611fd96e ("mlxsw: spectrum: Add devlink-trap support") Reported-by: Alex Kushnarov Tested-by: Alex Kushnarov Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 9 +++++++++ .../selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh | 7 ------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index dd234cf7b39d..dcf9562bce8a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3771,6 +3771,14 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, goto err_port_qdiscs_init; } + err = mlxsw_sp_port_vlan_set(mlxsw_sp_port, 0, VLAN_N_VID - 1, false, + false); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to clear VLAN filter\n", + mlxsw_sp_port->local_port); + goto err_port_vlan_clear; + } + err = mlxsw_sp_port_nve_init(mlxsw_sp_port); if (err) { dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize NVE\n", @@ -3818,6 +3826,7 @@ err_port_vlan_create: err_port_pvid_set: mlxsw_sp_port_nve_fini(mlxsw_sp_port); err_port_nve_init: +err_port_vlan_clear: mlxsw_sp_tc_qdisc_fini(mlxsw_sp_port); err_port_qdiscs_init: mlxsw_sp_port_fids_fini(mlxsw_sp_port); diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh index 5dcdfa20fc6c..126caf28b529 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh @@ -224,13 +224,6 @@ ingress_vlan_filter_test() local vid=10 bridge vlan add vid $vid dev $swp2 master - # During initialization the firmware enables all the VLAN filters and - # the driver does not turn them off since the traffic will be discarded - # by the STP filter whose default is DISCARD state. Add the VID on the - # ingress bridge port and then remove it to make sure it is not member - # in the VLAN. - bridge vlan add vid $vid dev $swp1 master - bridge vlan del vid $vid dev $swp1 master RET=0 From 44bde514eb13ac32b20442880e8175584af7592c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 26 Sep 2019 14:43:39 +0300 Subject: [PATCH 1404/2880] Documentation: Clarify trap's description Alex noted that the below description might not be obvious to all users. Clarify it by adding an example. Fixes: f3047ca01f12 ("Documentation: Add devlink-trap documentation") Reported-by: Alex Kushnarov Reviewed-by: Alex Kushnarov Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/devlink-trap.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/devlink-trap.rst b/Documentation/networking/devlink-trap.rst index c20c7c483664..8e90a85f3bd5 100644 --- a/Documentation/networking/devlink-trap.rst +++ b/Documentation/networking/devlink-trap.rst @@ -143,7 +143,8 @@ be added to the following table: * - ``port_list_is_empty`` - ``drop`` - Traps packets that the device decided to drop in case they need to be - flooded and the flood list is empty + flooded (e.g., unknown unicast, unregistered multicast) and there are + no ports the packets should be flooded to * - ``port_loopback_filter`` - ``drop`` - Traps packets that the device decided to drop in case after layer 2 From 52feb8b588f6d23673dd7cc2b44b203493b627f6 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 26 Sep 2019 14:43:40 +0300 Subject: [PATCH 1405/2880] mlxsw: spectrum_flower: Fail in case user specifies multiple mirror actions The ASIC can only mirror a packet to one port, but when user is trying to set more than one mirror action, it doesn't fail. Add a check if more than one mirror action was specified per rule and if so, fail for not being supported. Fixes: d0d13c1858a11 ("mlxsw: spectrum_acl: Add support for mirror action") Signed-off-by: Danielle Ratson Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 0ad1a24abfc6..b607919c8ad0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -21,6 +21,7 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, struct netlink_ext_ack *extack) { const struct flow_action_entry *act; + int mirror_act_count = 0; int err, i; if (!flow_action_has_entries(flow_action)) @@ -105,6 +106,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, case FLOW_ACTION_MIRRED: { struct net_device *out_dev = act->dev; + if (mirror_act_count++) { + NL_SET_ERR_MSG_MOD(extack, "Multiple mirror actions per rule are not supported"); + return -EOPNOTSUPP; + } + err = mlxsw_sp_acl_rulei_act_mirror(mlxsw_sp, rulei, block, out_dev, extack); From 6b3656a60f2067738d1a423328199720806f0c44 Mon Sep 17 00:00:00 2001 From: "Kevin(Yudong) Yang" Date: Thu, 26 Sep 2019 10:30:05 -0400 Subject: [PATCH 1406/2880] tcp_bbr: fix quantization code to not raise cwnd if not probing bandwidth There was a bug in the previous logic that attempted to ensure gain cycling gets inflight above BDP even for small BDPs. This code correctly raised and lowered target inflight values during the gain cycle. And this code correctly ensured that cwnd was raised when probing bandwidth. However, it did not correspondingly ensure that cwnd was *not* raised in this way when *not* probing for bandwidth. The result was that small-BDP flows that were always cwnd-bound could go for many cycles with a fixed cwnd, and not probe or yield bandwidth at all. This meant that multiple small-BDP flows could fail to converge in their bandwidth allocations. Fixes: 3c346b233c68 ("tcp_bbr: fix bw probing to raise in-flight data for very small BDPs") Signed-off-by: Kevin(Yudong) Yang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Acked-by: Priyaranjan Jha Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 95b59540eee1..32772d6ded4e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -388,7 +388,7 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) * which allows 2 outstanding 2-packet sequences, to try to keep pipe * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) { struct bbr *bbr = inet_csk_ca(sk); @@ -399,7 +399,7 @@ static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) cwnd = (cwnd + 1) & ~1U; /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ - if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) cwnd += 2; return cwnd; @@ -411,7 +411,7 @@ static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) u32 inflight; inflight = bbr_bdp(sk, bw, gain); - inflight = bbr_quantization_budget(sk, inflight, gain); + inflight = bbr_quantization_budget(sk, inflight); return inflight; } @@ -531,7 +531,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, * due to aggregation (of data and/or ACKs) visible in the ACK stream. */ target_cwnd += bbr_ack_aggregation_cwnd(sk); - target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); /* If we're below target cwnd, slow start cwnd toward target cwnd. */ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ From 174e23810cd3183dc2ca3f5166ef965a55eaaf54 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Sep 2019 20:37:05 +0200 Subject: [PATCH 1407/2880] sk_buff: drop all skb extensions on free and skb scrubbing Now that we have a 3rd extension, add a new helper that drops the extension space and use it when we need to scrub an sk_buff. At this time, scrubbing clears secpath and bridge netfilter data, but retains the tc skb extension, after this patch all three get cleared. NAPI reuse/free assumes we can only have a secpath attached to skb, but it seems better to clear all extensions there as well. v2: add unlikely hint (Eric Dumazet) Fixes: 95a7233c452a ("net: openvswitch: Set OvS recirc_id from tc chain index") Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++++ net/core/dev.c | 4 ++-- net/core/skbuff.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 907209c0794e..e7d3b1a513ef 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4144,8 +4144,17 @@ static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) return NULL; } + +static inline void skb_ext_reset(struct sk_buff *skb) +{ + if (unlikely(skb->active_extensions)) { + __skb_ext_put(skb->extensions); + skb->active_extensions = 0; + } +} #else static inline void skb_ext_put(struct sk_buff *skb) {} +static inline void skb_ext_reset(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} diff --git a/net/core/dev.c b/net/core/dev.c index 71b18e80389f..bf3ed413abaf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5666,7 +5666,7 @@ EXPORT_SYMBOL(gro_find_complete_by_type); static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); + skb_ext_put(skb); kmem_cache_free(skbuff_head_cache, skb); } @@ -5733,7 +5733,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - secpath_reset(skb); + skb_ext_reset(skb); napi->skb = skb; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f12e8a050edb..01d65206f4fb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5119,7 +5119,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - secpath_reset(skb); + skb_ext_reset(skb); nf_reset(skb); nf_reset_trace(skb); From a41e8a88b06ee39fad4cb4a8ccf822563560a89c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Sep 2019 15:42:51 -0700 Subject: [PATCH 1408/2880] tcp: better handle TCP_USER_TIMEOUT in SYN_SENT state Yuchung Cheng and Marek Majkowski independently reported a weird behavior of TCP_USER_TIMEOUT option when used at connect() time. When the TCP_USER_TIMEOUT is reached, tcp_write_timeout() believes the flow should live, and the following condition in tcp_clamp_rto_to_user_timeout() programs one jiffie timers : remaining = icsk->icsk_user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ This silly situation ends when the max syn rtx count is reached. This patch makes sure we honor both TCP_SYNCNT and TCP_USER_TIMEOUT, avoiding these spurious SYN packets. Fixes: b701a99e431d ("tcp: Add tcp_clamp_rto_to_user_timeout() helper to improve accuracy") Signed-off-by: Eric Dumazet Reported-by: Yuchung Cheng Reported-by: Marek Majkowski Cc: Jon Maxwell Link: https://marc.info/?l=linux-netdev&m=156940118307949&w=2 Acked-by: Jon Maxwell Tested-by: Marek Majkowski Signed-off-by: Marek Majkowski Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index dbd9d2d0ee63..40de2d2364a1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -210,7 +210,7 @@ static int tcp_write_timeout(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - bool expired, do_reset; + bool expired = false, do_reset; int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { @@ -242,9 +242,10 @@ static int tcp_write_timeout(struct sock *sk) if (tcp_out_of_resources(sk, do_reset)) return 1; } + } + if (!expired) expired = retransmits_timed_out(sk, retry_until, icsk->icsk_user_timeout); - } tcp_fastopen_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) From e51df6ce668a8f75ce27f83ce0f60103c568c375 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Wed, 11 Sep 2019 15:23:44 +0800 Subject: [PATCH 1409/2880] mmc: host: sdhci-pci: Add Genesys Logic GL975x support Add support for the GL9750 and GL9755 chipsets. Enable v4 mode and wait 5ms after set 1.8V signal enable for GL9750/ GL9755. Fix the value of SDHCI_MAX_CURRENT register and use the vendor tuning flow for GL9750. Co-developed-by: Michael K Johnson Signed-off-by: Michael K Johnson Signed-off-by: Ben Chuang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 1 + drivers/mmc/host/Makefile | 2 +- drivers/mmc/host/sdhci-pci-core.c | 2 + drivers/mmc/host/sdhci-pci-gli.c | 352 ++++++++++++++++++++++++++++++ drivers/mmc/host/sdhci-pci.h | 5 + 5 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 drivers/mmc/host/sdhci-pci-gli.c diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 3a52f5703286..49ea02c467bf 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -94,6 +94,7 @@ config MMC_SDHCI_PCI depends on MMC_SDHCI && PCI select MMC_CQHCI select IOSF_MBI if X86 + select MMC_SDHCI_IO_ACCESSORS help This selects the PCI Secure Digital Host Controller Interface. Most controllers found today are PCI devices. diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 390ee162fe71..11c4598e91d9 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_MMC_MXS) += mxs-mmc.o obj-$(CONFIG_MMC_SDHCI) += sdhci.o obj-$(CONFIG_MMC_SDHCI_PCI) += sdhci-pci.o sdhci-pci-y += sdhci-pci-core.o sdhci-pci-o2micro.o sdhci-pci-arasan.o \ - sdhci-pci-dwc-mshc.o + sdhci-pci-dwc-mshc.o sdhci-pci-gli.o obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += sdhci-pci-data.o obj-$(CONFIG_MMC_SDHCI_ACPI) += sdhci-acpi.o obj-$(CONFIG_MMC_SDHCI_PXAV3) += sdhci-pxav3.o diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index e1ca185d7328..eaffa85bc728 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -1685,6 +1685,8 @@ static const struct pci_device_id pci_ids[] = { SDHCI_PCI_DEVICE(O2, SEABIRD1, o2), SDHCI_PCI_DEVICE(ARASAN, PHY_EMMC, arasan), SDHCI_PCI_DEVICE(SYNOPSYS, DWC_MSHC, snps), + SDHCI_PCI_DEVICE(GLI, 9750, gl9750), + SDHCI_PCI_DEVICE(GLI, 9755, gl9755), SDHCI_PCI_DEVICE_CLASS(AMD, SYSTEM_SDHCI, PCI_CLASS_MASK, amd), /* Generic SD host controller */ {PCI_DEVICE_CLASS(SYSTEM_SDHCI, PCI_CLASS_MASK)}, diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c new file mode 100644 index 000000000000..5eea8d70a85d --- /dev/null +++ b/drivers/mmc/host/sdhci-pci-gli.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Genesys Logic, Inc. + * + * Authors: Ben Chuang + * + * Version: v0.9.0 (2019-08-08) + */ + +#include +#include +#include +#include +#include +#include "sdhci.h" +#include "sdhci-pci.h" + +/* Genesys Logic extra registers */ +#define SDHCI_GLI_9750_WT 0x800 +#define SDHCI_GLI_9750_WT_EN BIT(0) +#define GLI_9750_WT_EN_ON 0x1 +#define GLI_9750_WT_EN_OFF 0x0 + +#define SDHCI_GLI_9750_DRIVING 0x860 +#define SDHCI_GLI_9750_DRIVING_1 GENMASK(11, 0) +#define SDHCI_GLI_9750_DRIVING_2 GENMASK(27, 26) +#define GLI_9750_DRIVING_1_VALUE 0xFFF +#define GLI_9750_DRIVING_2_VALUE 0x3 + +#define SDHCI_GLI_9750_PLL 0x864 +#define SDHCI_GLI_9750_PLL_TX2_INV BIT(23) +#define SDHCI_GLI_9750_PLL_TX2_DLY GENMASK(22, 20) +#define GLI_9750_PLL_TX2_INV_VALUE 0x1 +#define GLI_9750_PLL_TX2_DLY_VALUE 0x0 + +#define SDHCI_GLI_9750_SW_CTRL 0x874 +#define SDHCI_GLI_9750_SW_CTRL_4 GENMASK(7, 6) +#define GLI_9750_SW_CTRL_4_VALUE 0x3 + +#define SDHCI_GLI_9750_MISC 0x878 +#define SDHCI_GLI_9750_MISC_TX1_INV BIT(2) +#define SDHCI_GLI_9750_MISC_RX_INV BIT(3) +#define SDHCI_GLI_9750_MISC_TX1_DLY GENMASK(6, 4) +#define GLI_9750_MISC_TX1_INV_VALUE 0x0 +#define GLI_9750_MISC_RX_INV_ON 0x1 +#define GLI_9750_MISC_RX_INV_OFF 0x0 +#define GLI_9750_MISC_RX_INV_VALUE GLI_9750_MISC_RX_INV_OFF +#define GLI_9750_MISC_TX1_DLY_VALUE 0x5 + +#define SDHCI_GLI_9750_TUNING_CONTROL 0x540 +#define SDHCI_GLI_9750_TUNING_CONTROL_EN BIT(4) +#define GLI_9750_TUNING_CONTROL_EN_ON 0x1 +#define GLI_9750_TUNING_CONTROL_EN_OFF 0x0 +#define SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_1 BIT(16) +#define SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_2 GENMASK(20, 19) +#define GLI_9750_TUNING_CONTROL_GLITCH_1_VALUE 0x1 +#define GLI_9750_TUNING_CONTROL_GLITCH_2_VALUE 0x2 + +#define SDHCI_GLI_9750_TUNING_PARAMETERS 0x544 +#define SDHCI_GLI_9750_TUNING_PARAMETERS_RX_DLY GENMASK(2, 0) +#define GLI_9750_TUNING_PARAMETERS_RX_DLY_VALUE 0x1 + +#define GLI_MAX_TUNING_LOOP 40 + +/* Genesys Logic chipset */ +static inline void gl9750_wt_on(struct sdhci_host *host) +{ + u32 wt_value; + u32 wt_enable; + + wt_value = sdhci_readl(host, SDHCI_GLI_9750_WT); + wt_enable = FIELD_GET(SDHCI_GLI_9750_WT_EN, wt_value); + + if (wt_enable == GLI_9750_WT_EN_ON) + return; + + wt_value &= ~SDHCI_GLI_9750_WT_EN; + wt_value |= FIELD_PREP(SDHCI_GLI_9750_WT_EN, GLI_9750_WT_EN_ON); + + sdhci_writel(host, wt_value, SDHCI_GLI_9750_WT); +} + +static inline void gl9750_wt_off(struct sdhci_host *host) +{ + u32 wt_value; + u32 wt_enable; + + wt_value = sdhci_readl(host, SDHCI_GLI_9750_WT); + wt_enable = FIELD_GET(SDHCI_GLI_9750_WT_EN, wt_value); + + if (wt_enable == GLI_9750_WT_EN_OFF) + return; + + wt_value &= ~SDHCI_GLI_9750_WT_EN; + wt_value |= FIELD_PREP(SDHCI_GLI_9750_WT_EN, GLI_9750_WT_EN_OFF); + + sdhci_writel(host, wt_value, SDHCI_GLI_9750_WT); +} + +static void gli_set_9750(struct sdhci_host *host) +{ + u32 driving_value; + u32 pll_value; + u32 sw_ctrl_value; + u32 misc_value; + u32 parameter_value; + u32 control_value; + u16 ctrl2; + + gl9750_wt_on(host); + + driving_value = sdhci_readl(host, SDHCI_GLI_9750_DRIVING); + pll_value = sdhci_readl(host, SDHCI_GLI_9750_PLL); + sw_ctrl_value = sdhci_readl(host, SDHCI_GLI_9750_SW_CTRL); + misc_value = sdhci_readl(host, SDHCI_GLI_9750_MISC); + parameter_value = sdhci_readl(host, SDHCI_GLI_9750_TUNING_PARAMETERS); + control_value = sdhci_readl(host, SDHCI_GLI_9750_TUNING_CONTROL); + + driving_value &= ~(SDHCI_GLI_9750_DRIVING_1); + driving_value &= ~(SDHCI_GLI_9750_DRIVING_2); + driving_value |= FIELD_PREP(SDHCI_GLI_9750_DRIVING_1, + GLI_9750_DRIVING_1_VALUE); + driving_value |= FIELD_PREP(SDHCI_GLI_9750_DRIVING_2, + GLI_9750_DRIVING_2_VALUE); + sdhci_writel(host, driving_value, SDHCI_GLI_9750_DRIVING); + + sw_ctrl_value &= ~SDHCI_GLI_9750_SW_CTRL_4; + sw_ctrl_value |= FIELD_PREP(SDHCI_GLI_9750_SW_CTRL_4, + GLI_9750_SW_CTRL_4_VALUE); + sdhci_writel(host, sw_ctrl_value, SDHCI_GLI_9750_SW_CTRL); + + /* reset the tuning flow after reinit and before starting tuning */ + pll_value &= ~SDHCI_GLI_9750_PLL_TX2_INV; + pll_value &= ~SDHCI_GLI_9750_PLL_TX2_DLY; + pll_value |= FIELD_PREP(SDHCI_GLI_9750_PLL_TX2_INV, + GLI_9750_PLL_TX2_INV_VALUE); + pll_value |= FIELD_PREP(SDHCI_GLI_9750_PLL_TX2_DLY, + GLI_9750_PLL_TX2_DLY_VALUE); + + misc_value &= ~SDHCI_GLI_9750_MISC_TX1_INV; + misc_value &= ~SDHCI_GLI_9750_MISC_RX_INV; + misc_value &= ~SDHCI_GLI_9750_MISC_TX1_DLY; + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_TX1_INV, + GLI_9750_MISC_TX1_INV_VALUE); + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_RX_INV, + GLI_9750_MISC_RX_INV_VALUE); + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_TX1_DLY, + GLI_9750_MISC_TX1_DLY_VALUE); + + parameter_value &= ~SDHCI_GLI_9750_TUNING_PARAMETERS_RX_DLY; + parameter_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_PARAMETERS_RX_DLY, + GLI_9750_TUNING_PARAMETERS_RX_DLY_VALUE); + + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_1; + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_2; + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_1, + GLI_9750_TUNING_CONTROL_GLITCH_1_VALUE); + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_2, + GLI_9750_TUNING_CONTROL_GLITCH_2_VALUE); + + sdhci_writel(host, pll_value, SDHCI_GLI_9750_PLL); + sdhci_writel(host, misc_value, SDHCI_GLI_9750_MISC); + + /* disable tuned clk */ + ctrl2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); + ctrl2 &= ~SDHCI_CTRL_TUNED_CLK; + sdhci_writew(host, ctrl2, SDHCI_HOST_CONTROL2); + + /* enable tuning parameters control */ + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_EN; + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_EN, + GLI_9750_TUNING_CONTROL_EN_ON); + sdhci_writel(host, control_value, SDHCI_GLI_9750_TUNING_CONTROL); + + /* write tuning parameters */ + sdhci_writel(host, parameter_value, SDHCI_GLI_9750_TUNING_PARAMETERS); + + /* disable tuning parameters control */ + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_EN; + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_EN, + GLI_9750_TUNING_CONTROL_EN_OFF); + sdhci_writel(host, control_value, SDHCI_GLI_9750_TUNING_CONTROL); + + /* clear tuned clk */ + ctrl2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); + ctrl2 &= ~SDHCI_CTRL_TUNED_CLK; + sdhci_writew(host, ctrl2, SDHCI_HOST_CONTROL2); + + gl9750_wt_off(host); +} + +static void gli_set_9750_rx_inv(struct sdhci_host *host, bool b) +{ + u32 misc_value; + + gl9750_wt_on(host); + + misc_value = sdhci_readl(host, SDHCI_GLI_9750_MISC); + misc_value &= ~SDHCI_GLI_9750_MISC_RX_INV; + if (b) { + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_RX_INV, + GLI_9750_MISC_RX_INV_ON); + } else { + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_RX_INV, + GLI_9750_MISC_RX_INV_OFF); + } + sdhci_writel(host, misc_value, SDHCI_GLI_9750_MISC); + + gl9750_wt_off(host); +} + +static int __sdhci_execute_tuning_9750(struct sdhci_host *host, u32 opcode) +{ + int i; + int rx_inv; + + for (rx_inv = 0; rx_inv < 2; rx_inv++) { + gli_set_9750_rx_inv(host, !!rx_inv); + sdhci_start_tuning(host); + + for (i = 0; i < GLI_MAX_TUNING_LOOP; i++) { + u16 ctrl; + + sdhci_send_tuning(host, opcode); + + if (!host->tuning_done) { + sdhci_abort_tuning(host, opcode); + break; + } + + ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); + if (!(ctrl & SDHCI_CTRL_EXEC_TUNING)) { + if (ctrl & SDHCI_CTRL_TUNED_CLK) + return 0; /* Success! */ + break; + } + } + } + if (!host->tuning_done) { + pr_info("%s: Tuning timeout, falling back to fixed sampling clock\n", + mmc_hostname(host->mmc)); + return -ETIMEDOUT; + } + + pr_info("%s: Tuning failed, falling back to fixed sampling clock\n", + mmc_hostname(host->mmc)); + sdhci_reset_tuning(host); + + return -EAGAIN; +} + +static int gl9750_execute_tuning(struct sdhci_host *host, u32 opcode) +{ + host->mmc->retune_period = 0; + if (host->tuning_mode == SDHCI_TUNING_MODE_1) + host->mmc->retune_period = host->tuning_count; + + gli_set_9750(host); + host->tuning_err = __sdhci_execute_tuning_9750(host, opcode); + sdhci_end_tuning(host); + + return 0; +} + +static int gli_probe_slot_gl9750(struct sdhci_pci_slot *slot) +{ + struct sdhci_host *host = slot->host; + + slot->host->mmc->caps2 |= MMC_CAP2_NO_SDIO; + sdhci_enable_v4_mode(host); + + return 0; +} + +static int gli_probe_slot_gl9755(struct sdhci_pci_slot *slot) +{ + struct sdhci_host *host = slot->host; + + slot->host->mmc->caps2 |= MMC_CAP2_NO_SDIO; + sdhci_enable_v4_mode(host); + + return 0; +} + +static void sdhci_gli_voltage_switch(struct sdhci_host *host) +{ + /* + * According to Section 3.6.1 signal voltage switch procedure in + * SD Host Controller Simplified Spec. 4.20, steps 6~8 are as + * follows: + * (6) Set 1.8V Signal Enable in the Host Control 2 register. + * (7) Wait 5ms. 1.8V voltage regulator shall be stable within this + * period. + * (8) If 1.8V Signal Enable is cleared by Host Controller, go to + * step (12). + * + * Wait 5ms after set 1.8V signal enable in Host Control 2 register + * to ensure 1.8V signal enable bit is set by GL9750/GL9755. + */ + usleep_range(5000, 5500); +} + +static void sdhci_gl9750_reset(struct sdhci_host *host, u8 mask) +{ + sdhci_reset(host, mask); + gli_set_9750(host); +} + +static u32 sdhci_gl9750_readl(struct sdhci_host *host, int reg) +{ + u32 value; + + value = readl(host->ioaddr + reg); + if (unlikely(reg == SDHCI_MAX_CURRENT && !(value & 0xff))) + value |= 0xc8; + + return value; +} + +static const struct sdhci_ops sdhci_gl9755_ops = { + .set_clock = sdhci_set_clock, + .enable_dma = sdhci_pci_enable_dma, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, + .voltage_switch = sdhci_gli_voltage_switch, +}; + +const struct sdhci_pci_fixes sdhci_gl9755 = { + .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, + .quirks2 = SDHCI_QUIRK2_BROKEN_DDR50, + .probe_slot = gli_probe_slot_gl9755, + .ops = &sdhci_gl9755_ops, +}; + +static const struct sdhci_ops sdhci_gl9750_ops = { + .read_l = sdhci_gl9750_readl, + .set_clock = sdhci_set_clock, + .enable_dma = sdhci_pci_enable_dma, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_gl9750_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, + .voltage_switch = sdhci_gli_voltage_switch, + .platform_execute_tuning = gl9750_execute_tuning, +}; + +const struct sdhci_pci_fixes sdhci_gl9750 = { + .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, + .quirks2 = SDHCI_QUIRK2_BROKEN_DDR50, + .probe_slot = gli_probe_slot_gl9750, + .ops = &sdhci_gl9750_ops, +}; diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h index 1abc9d47a4c0..558202fe64c6 100644 --- a/drivers/mmc/host/sdhci-pci.h +++ b/drivers/mmc/host/sdhci-pci.h @@ -68,6 +68,9 @@ #define PCI_DEVICE_ID_SYNOPSYS_DWC_MSHC 0xc202 +#define PCI_DEVICE_ID_GLI_9755 0x9755 +#define PCI_DEVICE_ID_GLI_9750 0x9750 + /* * PCI device class and mask */ @@ -188,5 +191,7 @@ int sdhci_pci_enable_dma(struct sdhci_host *host); extern const struct sdhci_pci_fixes sdhci_arasan; extern const struct sdhci_pci_fixes sdhci_snps; extern const struct sdhci_pci_fixes sdhci_o2; +extern const struct sdhci_pci_fixes sdhci_gl9750; +extern const struct sdhci_pci_fixes sdhci_gl9755; #endif /* __SDHCI_PCI_H */ From 78beef629fd95be4ed853b2d37b832f766bd96ca Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Thu, 26 Sep 2019 20:51:46 -0500 Subject: [PATCH 1410/2880] nfp: abm: fix memory leak in nfp_abm_u32_knode_replace In nfp_abm_u32_knode_replace if the allocation for match fails it should go to the error handling instead of returning. Updated other gotos to have correct errno returned, too. Signed-off-by: Navid Emamdoost Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/abm/cls.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/abm/cls.c b/drivers/net/ethernet/netronome/nfp/abm/cls.c index 23ebddfb9532..9f8a1f69c0c4 100644 --- a/drivers/net/ethernet/netronome/nfp/abm/cls.c +++ b/drivers/net/ethernet/netronome/nfp/abm/cls.c @@ -176,8 +176,10 @@ nfp_abm_u32_knode_replace(struct nfp_abm_link *alink, u8 mask, val; int err; - if (!nfp_abm_u32_check_knode(alink->abm, knode, proto, extack)) + if (!nfp_abm_u32_check_knode(alink->abm, knode, proto, extack)) { + err = -EOPNOTSUPP; goto err_delete; + } tos_off = proto == htons(ETH_P_IP) ? 16 : 20; @@ -198,14 +200,18 @@ nfp_abm_u32_knode_replace(struct nfp_abm_link *alink, if ((iter->val & cmask) == (val & cmask) && iter->band != knode->res->classid) { NL_SET_ERR_MSG_MOD(extack, "conflict with already offloaded filter"); + err = -EOPNOTSUPP; goto err_delete; } } if (!match) { match = kzalloc(sizeof(*match), GFP_KERNEL); - if (!match) - return -ENOMEM; + if (!match) { + err = -ENOMEM; + goto err_delete; + } + list_add(&match->list, &alink->dscp_map); } match->handle = knode->handle; @@ -221,7 +227,7 @@ nfp_abm_u32_knode_replace(struct nfp_abm_link *alink, err_delete: nfp_abm_u32_knode_delete(alink, knode); - return -EOPNOTSUPP; + return err; } static int nfp_abm_setup_tc_block_cb(enum tc_setup_type type, From faeacb6ddb13b7a020b50b9246fe098653cfbd6e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 27 Sep 2019 10:40:39 +0100 Subject: [PATCH 1411/2880] net: tap: clean up an indentation issue There is a statement that is indented too deeply, remove the extraneous tab. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/tap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index dd614c2cd994..3ae70c7e6860 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1200,7 +1200,7 @@ err_kfree: kfree_skb(skb); err: rcu_read_lock(); - tap = rcu_dereference(q->tap); + tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); From 6402939ec86eaf226c8b8ae00ed983936b164908 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Tue, 17 Sep 2019 17:47:12 -0500 Subject: [PATCH 1412/2880] ieee802154: ca8210: prevent memory leak In ca8210_probe the allocated pdata needs to be assigned to spi_device->dev.platform_data before calling ca8210_get_platform_data. Othrwise when ca8210_get_platform_data fails pdata cannot be released. Signed-off-by: Navid Emamdoost Link: https://lore.kernel.org/r/20190917224713.26371-1-navid.emamdoost@gmail.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index b188fce3f641..658b399ac9ea 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -3152,12 +3152,12 @@ static int ca8210_probe(struct spi_device *spi_device) goto error; } + priv->spi->dev.platform_data = pdata; ret = ca8210_get_platform_data(priv->spi, pdata); if (ret) { dev_crit(&spi_device->dev, "ca8210_get_platform_data failed\n"); goto error; } - priv->spi->dev.platform_data = pdata; ret = ca8210_dev_com_init(priv); if (ret) { From f15d9a992f901d4f22db868adf800844d1cac9f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:55 +0200 Subject: [PATCH 1413/2880] iommu/amd: Remove domain->updated This struct member was used to track whether a domain change requires updates to the device-table and IOMMU cache flushes. The problem is, that access to this field is racy since locking in the common mapping code-paths has been eliminated. Move the updated field to the stack to get rid of all potential races and remove the field from the struct. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 49 +++++++++++++++++---------------- drivers/iommu/amd_iommu_types.h | 1 - 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7bdce3b10f3d..042854bbc5bc 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1458,10 +1458,11 @@ static void free_pagetable(struct protection_domain *domain) * another level increases the size of the address space by 9 bits to a size up * to 64 bits. */ -static void increase_address_space(struct protection_domain *domain, +static bool increase_address_space(struct protection_domain *domain, gfp_t gfp) { unsigned long flags; + bool ret = false; u64 *pte; spin_lock_irqsave(&domain->lock, flags); @@ -1478,19 +1479,21 @@ static void increase_address_space(struct protection_domain *domain, iommu_virt_to_phys(domain->pt_root)); domain->pt_root = pte; domain->mode += 1; - domain->updated = true; + + ret = true; out: spin_unlock_irqrestore(&domain->lock, flags); - return; + return ret; } static u64 *alloc_pte(struct protection_domain *domain, unsigned long address, unsigned long page_size, u64 **pte_page, - gfp_t gfp) + gfp_t gfp, + bool *updated) { int level, end_lvl; u64 *pte, *page; @@ -1498,7 +1501,7 @@ static u64 *alloc_pte(struct protection_domain *domain, BUG_ON(!is_power_of_2(page_size)); while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); + *updated = increase_address_space(domain, gfp) || *updated; level = domain->mode - 1; pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; @@ -1530,7 +1533,7 @@ static u64 *alloc_pte(struct protection_domain *domain, for (i = 0; i < count; ++i) cmpxchg64(&lpte[i], __pte, 0ULL); - domain->updated = true; + *updated = true; continue; } @@ -1547,7 +1550,7 @@ static u64 *alloc_pte(struct protection_domain *domain, if (cmpxchg64(pte, __pte, __npte) != __pte) free_page((unsigned long)page); else if (IOMMU_PTE_PRESENT(__pte)) - domain->updated = true; + *updated = true; continue; } @@ -1656,28 +1659,29 @@ static int iommu_map_page(struct protection_domain *dom, gfp_t gfp) { struct page *freelist = NULL; + bool updated = false; u64 __pte, *pte; - int i, count; + int ret, i, count; BUG_ON(!IS_ALIGNED(bus_addr, page_size)); BUG_ON(!IS_ALIGNED(phys_addr, page_size)); + ret = -EINVAL; if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; + goto out; count = PAGE_SIZE_PTE_COUNT(page_size); - pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp); + pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated); - if (!pte) { - update_domain(dom); - return -ENOMEM; - } + ret = -ENOMEM; + if (!pte) + goto out; for (i = 0; i < count; ++i) freelist = free_clear_pte(&pte[i], pte[i], freelist); if (freelist != NULL) - dom->updated = true; + updated = true; if (count > 1) { __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size); @@ -1693,12 +1697,16 @@ static int iommu_map_page(struct protection_domain *dom, for (i = 0; i < count; ++i) pte[i] = __pte; - update_domain(dom); + ret = 0; + +out: + if (updated) + update_domain(dom); /* Everything flushed out, free pages now */ free_page_list(freelist); - return 0; + return ret; } static unsigned long iommu_unmap_page(struct protection_domain *dom, @@ -2399,15 +2407,10 @@ static void update_device_table(struct protection_domain *domain) static void update_domain(struct protection_domain *domain) { - if (!domain->updated) - return; - update_device_table(domain); domain_flush_devices(domain); domain_flush_tlb_pde(domain); - - domain->updated = false; } static int dir2prot(enum dma_data_direction direction) @@ -3333,7 +3336,6 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom) /* Update data structure */ domain->mode = PAGE_MODE_NONE; - domain->updated = true; /* Make changes visible to IOMMUs */ update_domain(domain); @@ -3379,7 +3381,6 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) domain->glx = levels; domain->flags |= PD_IOMMUV2_MASK; - domain->updated = true; update_domain(domain); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 9ac229e92b07..0186501ab971 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -475,7 +475,6 @@ struct protection_domain { int glx; /* Number of levels for GCR3 table */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ - bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; From 3a11905b69eb026402448c750f97a0eadfa76b08 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:56 +0200 Subject: [PATCH 1414/2880] iommu/amd: Remove amd_iommu_devtable_lock The lock is not necessary because the device table does not contain shared state that needs protection. Locking is only needed on an individual entry basis, and that needs to happen on the iommu_dev_data level. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 042854bbc5bc..37a9c04fc728 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -70,7 +70,6 @@ */ #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) -static DEFINE_SPINLOCK(amd_iommu_devtable_lock); static DEFINE_SPINLOCK(pd_bitmap_lock); /* List of all available dev_data structures */ @@ -2080,10 +2079,11 @@ static void do_detach(struct iommu_dev_data *dev_data) static int __attach_device(struct iommu_dev_data *dev_data, struct protection_domain *domain) { + unsigned long flags; int ret; /* lock domain */ - spin_lock(&domain->lock); + spin_lock_irqsave(&domain->lock, flags); ret = -EBUSY; if (dev_data->domain != NULL) @@ -2097,7 +2097,7 @@ static int __attach_device(struct iommu_dev_data *dev_data, out_unlock: /* ready */ - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&domain->lock, flags); return ret; } @@ -2181,7 +2181,6 @@ static int attach_device(struct device *dev, { struct pci_dev *pdev; struct iommu_dev_data *dev_data; - unsigned long flags; int ret; dev_data = get_dev_data(dev); @@ -2209,9 +2208,7 @@ static int attach_device(struct device *dev, } skip_ats_check: - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); ret = __attach_device(dev_data, domain); - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* * We might boot into a crash-kernel here. The crashed kernel @@ -2231,14 +2228,15 @@ skip_ats_check: static void __detach_device(struct iommu_dev_data *dev_data) { struct protection_domain *domain; + unsigned long flags; domain = dev_data->domain; - spin_lock(&domain->lock); + spin_lock_irqsave(&domain->lock, flags); do_detach(dev_data); - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&domain->lock, flags); } /* @@ -2248,7 +2246,6 @@ static void detach_device(struct device *dev) { struct protection_domain *domain; struct iommu_dev_data *dev_data; - unsigned long flags; dev_data = get_dev_data(dev); domain = dev_data->domain; @@ -2262,10 +2259,7 @@ static void detach_device(struct device *dev) if (WARN_ON(!dev_data->domain)) return; - /* lock device table */ - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev_data); - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); if (!dev_is_pci(dev)) return; @@ -2910,9 +2904,6 @@ int __init amd_iommu_init_dma_ops(void) static void cleanup_domain(struct protection_domain *domain) { struct iommu_dev_data *entry; - unsigned long flags; - - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, @@ -2920,8 +2911,6 @@ static void cleanup_domain(struct protection_domain *domain) BUG_ON(!entry->domain); __detach_device(entry); } - - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } static void protection_domain_free(struct protection_domain *domain) From f6c0bfce271b2dd613e8b8e009eefe89c1f788e8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:57 +0200 Subject: [PATCH 1415/2880] iommu/amd: Take domain->lock for complete attach/detach path The code-paths before __attach_device() and __detach_device() are called also access and modify domain state, so take the domain lock there too. This allows to get rid of the __detach_device() function. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 65 ++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 37a9c04fc728..2919168577ff 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2079,27 +2079,13 @@ static void do_detach(struct iommu_dev_data *dev_data) static int __attach_device(struct iommu_dev_data *dev_data, struct protection_domain *domain) { - unsigned long flags; - int ret; - - /* lock domain */ - spin_lock_irqsave(&domain->lock, flags); - - ret = -EBUSY; if (dev_data->domain != NULL) - goto out_unlock; + return -EBUSY; /* Attach alias group root */ do_attach(dev_data, domain); - ret = 0; - -out_unlock: - - /* ready */ - spin_unlock_irqrestore(&domain->lock, flags); - - return ret; + return 0; } @@ -2181,8 +2167,11 @@ static int attach_device(struct device *dev, { struct pci_dev *pdev; struct iommu_dev_data *dev_data; + unsigned long flags; int ret; + spin_lock_irqsave(&domain->lock, flags); + dev_data = get_dev_data(dev); if (!dev_is_pci(dev)) @@ -2190,12 +2179,13 @@ static int attach_device(struct device *dev, pdev = to_pci_dev(dev); if (domain->flags & PD_IOMMUV2_MASK) { + ret = -EINVAL; if (!dev_data->passthrough) - return -EINVAL; + goto out; if (dev_data->iommu_v2) { if (pdev_iommuv2_enable(pdev) != 0) - return -EINVAL; + goto out; dev_data->ats.enabled = true; dev_data->ats.qdep = pci_ats_queue_depth(pdev); @@ -2219,24 +2209,10 @@ skip_ats_check: domain_flush_complete(domain); - return ret; -} - -/* - * Removes a device from a protection domain (unlocked) - */ -static void __detach_device(struct iommu_dev_data *dev_data) -{ - struct protection_domain *domain; - unsigned long flags; - - domain = dev_data->domain; - - spin_lock_irqsave(&domain->lock, flags); - - do_detach(dev_data); - +out: spin_unlock_irqrestore(&domain->lock, flags); + + return ret; } /* @@ -2246,10 +2222,13 @@ static void detach_device(struct device *dev) { struct protection_domain *domain; struct iommu_dev_data *dev_data; + unsigned long flags; dev_data = get_dev_data(dev); domain = dev_data->domain; + spin_lock_irqsave(&domain->lock, flags); + /* * First check if the device is still attached. It might already * be detached from its domain because the generic @@ -2257,12 +2236,12 @@ static void detach_device(struct device *dev) * our alias handling. */ if (WARN_ON(!dev_data->domain)) - return; + goto out; - __detach_device(dev_data); + do_detach(dev_data); if (!dev_is_pci(dev)) - return; + goto out; if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2) pdev_iommuv2_disable(to_pci_dev(dev)); @@ -2270,6 +2249,9 @@ static void detach_device(struct device *dev) pci_disable_ats(to_pci_dev(dev)); dev_data->ats.enabled = false; + +out: + spin_unlock_irqrestore(&domain->lock, flags); } static int amd_iommu_add_device(struct device *dev) @@ -2904,13 +2886,18 @@ int __init amd_iommu_init_dma_ops(void) static void cleanup_domain(struct protection_domain *domain) { struct iommu_dev_data *entry; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, struct iommu_dev_data, list); BUG_ON(!entry->domain); - __detach_device(entry); + do_detach(entry); } + + spin_unlock_irqrestore(&domain->lock, flags); } static void protection_domain_free(struct protection_domain *domain) From 45e528d9c479aeef2d3d1db1e619b243f91e324f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:58 +0200 Subject: [PATCH 1416/2880] iommu/amd: Check for busy devices earlier in attach_device() Check early in attach_device whether the device is already attached to a domain. This also simplifies the code path so that __attach_device() can be removed. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 2919168577ff..459247c32dc0 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2072,23 +2072,6 @@ static void do_detach(struct iommu_dev_data *dev_data) domain->dev_cnt -= 1; } -/* - * If a device is not yet associated with a domain, this function makes the - * device visible in the domain - */ -static int __attach_device(struct iommu_dev_data *dev_data, - struct protection_domain *domain) -{ - if (dev_data->domain != NULL) - return -EBUSY; - - /* Attach alias group root */ - do_attach(dev_data, domain); - - return 0; -} - - static void pdev_iommuv2_disable(struct pci_dev *pdev) { pci_disable_ats(pdev); @@ -2174,6 +2157,10 @@ static int attach_device(struct device *dev, dev_data = get_dev_data(dev); + ret = -EBUSY; + if (dev_data->domain != NULL) + goto out; + if (!dev_is_pci(dev)) goto skip_ats_check; @@ -2198,7 +2185,9 @@ static int attach_device(struct device *dev, } skip_ats_check: - ret = __attach_device(dev_data, domain); + ret = 0; + + do_attach(dev_data, domain); /* * We might boot into a crash-kernel here. The crashed kernel From ab7b2577f0d119052b98b8d913bad369ac2760eb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:59 +0200 Subject: [PATCH 1417/2880] iommu/amd: Lock dev_data in attach/detach code paths Make sure that attaching a detaching a device can't race against each other and protect the iommu_dev_data with a spin_lock in these code paths. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 9 +++++++++ drivers/iommu/amd_iommu_types.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 459247c32dc0..bac4e20a5919 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -201,6 +201,7 @@ static struct iommu_dev_data *alloc_dev_data(u16 devid) if (!dev_data) return NULL; + spin_lock_init(&dev_data->lock); dev_data->devid = devid; ratelimit_default_init(&dev_data->rs); @@ -2157,6 +2158,8 @@ static int attach_device(struct device *dev, dev_data = get_dev_data(dev); + spin_lock(&dev_data->lock); + ret = -EBUSY; if (dev_data->domain != NULL) goto out; @@ -2199,6 +2202,8 @@ skip_ats_check: domain_flush_complete(domain); out: + spin_unlock(&dev_data->lock); + spin_unlock_irqrestore(&domain->lock, flags); return ret; @@ -2218,6 +2223,8 @@ static void detach_device(struct device *dev) spin_lock_irqsave(&domain->lock, flags); + spin_lock(&dev_data->lock); + /* * First check if the device is still attached. It might already * be detached from its domain because the generic @@ -2240,6 +2247,8 @@ static void detach_device(struct device *dev) dev_data->ats.enabled = false; out: + spin_unlock(&dev_data->lock); + spin_unlock_irqrestore(&domain->lock, flags); } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 0186501ab971..c9c1612d52e0 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -633,6 +633,9 @@ struct devid_map { * This struct contains device specific data for the IOMMU */ struct iommu_dev_data { + /*Protect against attach/detach races */ + spinlock_t lock; + struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ struct protection_domain *domain; /* Domain the device is bound to */ From 2a78f9962565e53b78363eaf516eb052009e8020 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:23:00 +0200 Subject: [PATCH 1418/2880] iommu/amd: Lock code paths traversing protection_domain->dev_list The traversing of this list requires protection_domain->lock to be taken to avoid nasty races with attach/detach code. Make sure the lock is held on all code-paths traversing this list. Reported-by: Filippo Sironi Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index bac4e20a5919..9c26976a0f99 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1334,8 +1334,12 @@ static void domain_flush_np_cache(struct protection_domain *domain, dma_addr_t iova, size_t size) { if (unlikely(amd_iommu_np_cache)) { + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); domain_flush_pages(domain, iova, size); domain_flush_complete(domain); + spin_unlock_irqrestore(&domain->lock, flags); } } @@ -1700,8 +1704,13 @@ static int iommu_map_page(struct protection_domain *dom, ret = 0; out: - if (updated) + if (updated) { + unsigned long flags; + + spin_lock_irqsave(&dom->lock, flags); update_domain(dom); + spin_unlock_irqrestore(&dom->lock, flags); + } /* Everything flushed out, free pages now */ free_page_list(freelist); @@ -1857,8 +1866,12 @@ static void free_gcr3_table(struct protection_domain *domain) static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom) { + unsigned long flags; + + spin_lock_irqsave(&dom->domain.lock, flags); domain_flush_tlb(&dom->domain); domain_flush_complete(&dom->domain); + spin_unlock_irqrestore(&dom->domain.lock, flags); } static void iova_domain_flush_tlb(struct iova_domain *iovad) @@ -2414,6 +2427,7 @@ static dma_addr_t __map_single(struct device *dev, { dma_addr_t offset = paddr & ~PAGE_MASK; dma_addr_t address, start, ret; + unsigned long flags; unsigned int pages; int prot = 0; int i; @@ -2451,8 +2465,10 @@ out_unmap: iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE); } + spin_lock_irqsave(&dma_dom->domain.lock, flags); domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); dma_ops_free_iova(dma_dom, address, pages); @@ -2481,8 +2497,12 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, } if (amd_iommu_unmap_flush) { + unsigned long flags; + + spin_lock_irqsave(&dma_dom->domain.lock, flags); domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); dma_ops_free_iova(dma_dom, dma_addr, pages); } else { pages = __roundup_pow_of_two(pages); @@ -3246,9 +3266,12 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; + spin_lock_irqsave(&dom->lock, flags); domain_flush_tlb_pde(dom); domain_flush_complete(dom); + spin_unlock_irqrestore(&dom->lock, flags); } static void amd_iommu_iotlb_sync(struct iommu_domain *domain, From 127068abe85bf3dee50df51cb039a5a987a4a666 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 5 Sep 2019 20:24:12 +0100 Subject: [PATCH 1419/2880] i2c: qcom-geni: Disable DMA processing on the Lenovo Yoga C630 We have a production-level laptop (Lenovo Yoga C630) which is exhibiting a rather horrific bug. When I2C HID devices are being scanned for at boot-time the QCom Geni based I2C (Serial Engine) attempts to use DMA. When it does, the laptop reboots and the user never sees the OS. Attempts are being made to debug the reason for the spontaneous reboot. No luck so far, hence the requirement for this hot-fix. This workaround will be removed once we have a viable fix. Signed-off-by: Lee Jones Tested-by: Bjorn Andersson Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-qcom-geni.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index a89bfce5388e..17abf60c94ae 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -355,11 +355,13 @@ static int geni_i2c_rx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, { dma_addr_t rx_dma; unsigned long time_left; - void *dma_buf; + void *dma_buf = NULL; struct geni_se *se = &gi2c->se; size_t len = msg->len; - dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (!of_machine_is_compatible("lenovo,yoga-c630")) + dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (dma_buf) geni_se_select_mode(se, GENI_SE_DMA); else @@ -394,11 +396,13 @@ static int geni_i2c_tx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, { dma_addr_t tx_dma; unsigned long time_left; - void *dma_buf; + void *dma_buf = NULL; struct geni_se *se = &gi2c->se; size_t len = msg->len; - dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (!of_machine_is_compatible("lenovo,yoga-c630")) + dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (dma_buf) geni_se_select_mode(se, GENI_SE_DMA); else From a71e2ac1f32097fbb2beab098687a7a95c84543e Mon Sep 17 00:00:00 2001 From: Chris Brandt Date: Thu, 26 Sep 2019 07:19:09 -0500 Subject: [PATCH 1420/2880] i2c: riic: Clear NACK in tend isr The NACKF flag should be cleared in INTRIICNAKI interrupt processing as description in HW manual. This issue shows up quickly when PREEMPT_RT is applied and a device is probed that is not plugged in (like a touchscreen controller). The result is endless interrupts that halt system boot. Fixes: 310c18a41450 ("i2c: riic: add driver") Cc: stable@vger.kernel.org Reported-by: Chien Nguyen Signed-off-by: Chris Brandt Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-riic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-riic.c b/drivers/i2c/busses/i2c-riic.c index f31413fd9521..800414886f6b 100644 --- a/drivers/i2c/busses/i2c-riic.c +++ b/drivers/i2c/busses/i2c-riic.c @@ -202,6 +202,7 @@ static irqreturn_t riic_tend_isr(int irq, void *data) if (readb(riic->base + RIIC_ICSR2) & ICSR2_NACKF) { /* We got a NACKIE */ readb(riic->base + RIIC_ICDRR); /* dummy read */ + riic_clear_set_bit(riic, ICSR2_NACKF, 0, RIIC_ICSR2); riic->err = -ENXIO; } else if (riic->bytes_left) { return IRQ_NONE; From fd4b204a0971854c35795ab60b3673a5b57dfebc Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 27 Sep 2019 14:09:11 +0300 Subject: [PATCH 1421/2880] i2c: i801: Bring back Block Process Call support for certain platforms Commit b84398d6d7f9 ("i2c: i801: Use iTCO version 6 in Cannon Lake PCH and beyond") looks like to drop by accident Block Write-Block Read Process Call support for Intel Sunrisepoint, Lewisburg, Denverton and Kaby Lake. That support was added for above and newer platforms by the commit 315cd67c9453 ("i2c: i801: Add Block Write-Block Read Process Call support") so bring it back for above platforms. Fixes: b84398d6d7f9 ("i2c: i801: Use iTCO version 6 in Cannon Lake PCH and beyond") Signed-off-by: Jarkko Nikula Reviewed-by: Alexander Sverdlin Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-i801.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index c09791fb4929..f1c714acc280 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1736,6 +1736,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS: case PCI_DEVICE_ID_INTEL_DNV_SMBUS: case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS: + priv->features |= FEATURE_BLOCK_PROC; priv->features |= FEATURE_I2C_BLOCK_READ; priv->features |= FEATURE_IRQ; priv->features |= FEATURE_SMBUS_PEC; From 11af27f494086188620e7768e421894af93c126a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ard=C3=B6?= Date: Fri, 6 Sep 2019 16:06:09 +0200 Subject: [PATCH 1422/2880] i2c: slave-eeprom: Add read only mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add read-only versions of all EEPROMs. These versions are read-only on the i2c side, but can be written from the sysfs side. Signed-off-by: Björn Ardö Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-eeprom.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c index 92ff9991bae8..db9763cb4dae 100644 --- a/drivers/i2c/i2c-slave-eeprom.c +++ b/drivers/i2c/i2c-slave-eeprom.c @@ -33,11 +33,13 @@ struct eeprom_data { u16 address_mask; u8 num_address_bytes; u8 idx_write_cnt; + bool read_only; u8 buffer[]; }; #define I2C_SLAVE_BYTELEN GENMASK(15, 0) #define I2C_SLAVE_FLAG_ADDR16 BIT(16) +#define I2C_SLAVE_FLAG_RO BIT(17) #define I2C_SLAVE_DEVICE_MAGIC(_len, _flags) ((_flags) | (_len)) static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, @@ -53,9 +55,11 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, eeprom->buffer_idx = *val | (eeprom->buffer_idx << 8); eeprom->idx_write_cnt++; } else { - spin_lock(&eeprom->buffer_lock); - eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val; - spin_unlock(&eeprom->buffer_lock); + if (!eeprom->read_only) { + spin_lock(&eeprom->buffer_lock); + eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val; + spin_unlock(&eeprom->buffer_lock); + } } break; @@ -130,6 +134,7 @@ static int i2c_slave_eeprom_probe(struct i2c_client *client, const struct i2c_de eeprom->idx_write_cnt = 0; eeprom->num_address_bytes = flag_addr16 ? 2 : 1; eeprom->address_mask = size - 1; + eeprom->read_only = FIELD_GET(I2C_SLAVE_FLAG_RO, id->driver_data); spin_lock_init(&eeprom->buffer_lock); i2c_set_clientdata(client, eeprom); @@ -165,8 +170,11 @@ static int i2c_slave_eeprom_remove(struct i2c_client *client) static const struct i2c_device_id i2c_slave_eeprom_id[] = { { "slave-24c02", I2C_SLAVE_DEVICE_MAGIC(2048 / 8, 0) }, + { "slave-24c02ro", I2C_SLAVE_DEVICE_MAGIC(2048 / 8, I2C_SLAVE_FLAG_RO) }, { "slave-24c32", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16) }, + { "slave-24c32ro", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16 | I2C_SLAVE_FLAG_RO) }, { "slave-24c64", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16) }, + { "slave-24c64ro", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16 | I2C_SLAVE_FLAG_RO) }, { } }; MODULE_DEVICE_TABLE(i2c, i2c_slave_eeprom_id); From ac79f78dab892fcdc11fda8af5cc5e80d09dca8a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:18 -0700 Subject: [PATCH 1423/2880] Revert "Revert "mm, thp: restore node-local hugepage allocations"" This reverts commit a8282608c88e08b1782141026eab61204c1e533f. The commit references the original intended semantic for MADV_HUGEPAGE which has subsequently taken on three unique purposes: - enables or disables thp for a range of memory depending on the system's config (is thp "enabled" set to "always" or "madvise"), - determines the synchronous compaction behavior for thp allocations at fault (is thp "defrag" set to "always", "defer+madvise", or "madvise"), and - reverts a previous MADV_NOHUGEPAGE (there is no madvise mode to only clear previous hugepage advice). These are the three purposes that currently exist in 5.2 and over the past several years that userspace has been written around. Adding a NUMA locality preference adds a fourth dimension to an already conflated advice mode. Based on the semantic that MADV_HUGEPAGE has provided over the past several years, there exist workloads that use the tunable based on these principles: specifically that the allocation should attempt to defragment a local node before falling back. It is agreed that remote hugepages typically (but not always) have a better access latency than remote native pages, although on Naples this is at parity for intersocket. The revert commit that this patch reverts allows hugepage allocation to immediately allocate remotely when local memory is fragmented. This is contrary to the semantic of MADV_HUGEPAGE over the past several years: that is, memory compaction should be attempted locally before falling back. The performance degradation of remote hugepages over local hugepages on Rome, for example, is 53.5% increased access latency. For this reason, the goal is to revert back to the 5.2 and previous behavior that would attempt local defragmentation before falling back. With the patch that is reverted by this patch, we see performance degradations at the tail because the allocator happily allocates the remote hugepage rather than even attempting to make a local hugepage available. zone_reclaim_mode is not a solution to this problem since it does not only impact hugepage allocations but rather changes the memory allocation strategy for *all* page allocations. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 -- mm/huge_memory.c | 42 +++++++++++++++------------------------ mm/mempolicy.c | 2 +- 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index bac395f1d00a..5228c62af416 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -139,8 +139,6 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); -struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de1f15969e27..62f0d8e9d76b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -648,37 +648,27 @@ release: static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - gfp_t this_node = 0; - -#ifdef CONFIG_NUMA - struct mempolicy *pol; - /* - * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not - * specified, to express a general desire to stay on the current - * node for optimistic allocation attempts. If the defrag mode - * and/or madvise hint requires the direct reclaim then we prefer - * to fallback to other node rather than node reclaim because that - * can lead to excessive reclaim even though there is free memory - * on other nodes. We expect that NUMA preferences are specified - * by memory policies. - */ - pol = get_vma_policy(vma, addr); - if (pol->mode != MPOL_BIND) - this_node = __GFP_THISNODE; - mpol_cond_put(pol); -#endif + const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; + /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + return GFP_TRANSHUGE | __GFP_THISNODE | + (vma_madvised ? 0 : __GFP_NORETRY); + + /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; + return gfp_mask | __GFP_KSWAPD_RECLAIM; + + /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM | this_node); + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + + /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - this_node); - return GFP_TRANSHUGE_LIGHT | this_node; + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + + return gfp_mask; } /* Caller must hold page table lock. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 65e0874fce17..9c9877a43d58 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1734,7 +1734,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); From 19deb7695e072deaff025e03de40c61b525bd57e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:20 -0700 Subject: [PATCH 1424/2880] Revert "Revert "Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"" This reverts commit 92717d429b38e4f9f934eed7e605cc42858f1839. Since commit a8282608c88e ("Revert "mm, thp: restore node-local hugepage allocations"") is reverted in this series, it is better to restore the previous 5.2 behavior between the thp allocation and the page allocator rather than to attempt any consolidation or cleanup for a policy that is now reverted. It's less risky during an rc cycle and subsequent patches in this series further modify the same policy that the pre-5.3 behavior implements. Consolidation and cleanup can be done subsequent to a sane default page allocation strategy, so this patch reverts a cleanup done on a strategy that is now reverted and thus is the least risky option. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++++++---- mm/huge_memory.c | 27 +++++++++++++-------------- mm/mempolicy.c | 32 +++++++++++++++++++++++++++++--- mm/shmem.c | 2 +- 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f33881688f42..fb07b503dc45 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -510,18 +510,22 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node); + int node, bool hugepage); +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ + alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ + alloc_pages(gfp_mask, order) +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node) + alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 62f0d8e9d76b..aec462cc5d46 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -645,30 +645,30 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | __GFP_THISNODE | - (vma_madvised ? 0 : __GFP_NORETRY); + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return gfp_mask | __GFP_KSWAPD_RECLAIM; + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); - return gfp_mask; + return GFP_TRANSHUGE_LIGHT; } /* Caller must hold page table lock. */ @@ -740,8 +740,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma, haddr); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); + gfp = alloc_hugepage_direct_gfpmask(vma); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1348,9 +1348,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) alloc: if (__transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); - new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, - haddr, numa_node_id()); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); + new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9c9877a43d58..547cd403ed02 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1180,8 +1180,8 @@ static struct page *new_page(struct page *page, unsigned long start) } else if (PageTransHuge(page)) { struct page *thp; - thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, - address, numa_node_id()); + thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, + HPAGE_PMD_ORDER); if (!thp) return NULL; prep_transhuge_page(thp); @@ -2083,6 +2083,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. @@ -2093,7 +2094,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node) + unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; struct page *page; @@ -2111,6 +2112,31 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { + int hpage_node = node; + + /* + * For hugepage allocation and non-interleave policy which + * allows the current node (or other explicitly preferred + * node) we only try to allocate from the current/preferred + * node and don't fall back to other nodes, as the cost of + * remote accesses would likely offset THP benefits. + * + * If the policy is interleave, or does not allow the current + * node in its nodemask, we allocate the standard way. + */ + if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) + hpage_node = pol->v.preferred_node; + + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(hpage_node, *nmask)) { + mpol_cond_put(pol); + page = __alloc_pages_node(hpage_node, + gfp | __GFP_THISNODE, order); + goto out; + } + } + nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); diff --git a/mm/shmem.c b/mm/shmem.c index 2bed4761f279..626d8c74b973 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1466,7 +1466,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); From b39d0ee2632d2f4fb180e8e4eba33736283f23de Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:22 -0700 Subject: [PATCH 1425/2880] mm, page_alloc: avoid expensive reclaim when compaction may not succeed Memory compaction has a couple significant drawbacks as the allocation order increases, specifically: - isolate_freepages() is responsible for finding free pages to use as migration targets and is implemented as a linear scan of memory starting at the end of a zone, - failing order-0 watermark checks in memory compaction does not account for how far below the watermarks the zone actually is: to enable migration, there must be *some* free memory available. Per the above, watermarks are not always suffficient if isolate_freepages() cannot find the free memory but it could require hundreds of MBs of reclaim to even reach this threshold (read: potentially very expensive reclaim with no indication compaction can be successful), and - if compaction at this order has failed recently so that it does not even run as a result of deferred compaction, looping through reclaim can often be pointless. For hugepage allocations, these are quite substantial drawbacks because these are very high order allocations (order-9 on x86) and falling back to doing reclaim can potentially be *very* expensive without any indication that compaction would even be successful. Reclaim itself is unlikely to free entire pageblocks and certainly no reliance should be put on it to do so in isolation (recall lumpy reclaim). This means we should avoid reclaim and simply fail hugepage allocation if compaction is deferred. It is also not helpful to thrash a zone by doing excessive reclaim if compaction may not be able to access that memory. If order-0 watermarks fail and the allocation order is sufficiently large, it is likely better to fail the allocation rather than thrashing the zone. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9c9194959271..87cbd92065e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4458,6 +4458,28 @@ retry_cpuset: if (page) goto got_pg; + if (order >= pageblock_order && (gfp_mask & __GFP_IO)) { + /* + * If allocating entire pageblock(s) and compaction + * failed because all zones are below low watermarks + * or is prohibited because it recently failed at this + * order, fail immediately. + * + * Reclaim is + * - potentially very expensive because zones are far + * below their low watermarks or this is part of very + * bursty high order allocations, + * - not guaranteed to help because isolate_freepages() + * may not iterate over freed pages as part of its + * linear scan, and + * - unlikely to make entire pageblocks free on its + * own. + */ + if (compact_result == COMPACT_SKIPPED || + compact_result == COMPACT_DEFERRED) + goto nopage; + } + /* * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations From 76e654cc91bbe627aa6067916f02a4d3ac041620 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:25 -0700 Subject: [PATCH 1426/2880] mm, page_alloc: allow hugepage fallback to remote nodes when madvised For systems configured to always try hard to allocate transparent hugepages (thp defrag setting of "always") or for memory that has been explicitly madvised to MADV_HUGEPAGE, it is often better to fallback to remote memory to allocate the hugepage if the local allocation fails first. The point is to allow the initial call to __alloc_pages_node() to attempt to defragment local memory to make a hugepage available, if possible, rather than immediately fallback to remote memory. Local hugepages will always have a better access latency than remote (huge)pages, so an attempt to make a hugepage available locally is always preferred. If memory compaction cannot be successful locally, however, it is likely better to fallback to remote memory. This could take on two forms: either allow immediate fallback to remote memory or do per-zone watermark checks. It would be possible to fallback only when per-zone watermarks fail for order-0 memory, since that would require local reclaim for all subsequent faults so remote huge allocation is likely better than thrashing the local zone for large workloads. In this case, it is assumed that because the system is configured to try hard to allocate hugepages or the vma is advised to explicitly want to try hard for hugepages that remote allocation is better when local allocation and memory compaction have both failed. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 547cd403ed02..8caab1f81a52 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2133,6 +2133,17 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, mpol_cond_put(pol); page = __alloc_pages_node(hpage_node, gfp | __GFP_THISNODE, order); + + /* + * If hugepage allocations are configured to always + * synchronous compact or the vma has been madvised + * to prefer hugepage backing, retry allowing remote + * memory as well. + */ + if (!page && (gfp & __GFP_DIRECT_RECLAIM)) + page = __alloc_pages_node(hpage_node, + gfp | __GFP_NORETRY, order); + goto out; } } From d2aea95a1a4d195d939d16303700921be318a2b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Sep 2019 05:53:29 -0400 Subject: [PATCH 1427/2880] tracing/probe: Fix to check the difference of nr_args before adding probe Steven reported that a test triggered: ================================================================== BUG: KASAN: slab-out-of-bounds in trace_kprobe_create+0xa9e/0xe40 Read of size 8 at addr ffff8880c4f25a48 by task ftracetest/4798 CPU: 2 PID: 4798 Comm: ftracetest Not tainted 5.3.0-rc6-test+ #30 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 Call Trace: dump_stack+0x7c/0xc0 ? trace_kprobe_create+0xa9e/0xe40 print_address_description+0x6c/0x332 ? trace_kprobe_create+0xa9e/0xe40 ? trace_kprobe_create+0xa9e/0xe40 __kasan_report.cold.6+0x1a/0x3b ? trace_kprobe_create+0xa9e/0xe40 kasan_report+0xe/0x12 trace_kprobe_create+0xa9e/0xe40 ? print_kprobe_event+0x280/0x280 ? match_held_lock+0x1b/0x240 ? find_held_lock+0xac/0xd0 ? fs_reclaim_release.part.112+0x5/0x20 ? lock_downgrade+0x350/0x350 ? kasan_unpoison_shadow+0x30/0x40 ? __kasan_kmalloc.constprop.6+0xc1/0xd0 ? trace_kprobe_create+0xe40/0xe40 ? trace_kprobe_create+0xe40/0xe40 create_or_delete_trace_kprobe+0x2e/0x60 trace_run_command+0xc3/0xe0 ? trace_panic_handler+0x20/0x20 ? kasan_unpoison_shadow+0x30/0x40 trace_parse_run_command+0xdc/0x163 vfs_write+0xe1/0x240 ksys_write+0xba/0x150 ? __ia32_sys_read+0x50/0x50 ? tracer_hardirqs_on+0x61/0x180 ? trace_hardirqs_off_caller+0x43/0x110 ? mark_held_locks+0x29/0xa0 ? do_syscall_64+0x14/0x260 do_syscall_64+0x68/0x260 Fix to check the difference of nr_args before adding probe on existing probes. This also may set the error log index bigger than the number of command parameters. In that case it sets the error position is next to the last parameter. Link: http://lkml.kernel.org/r/156966474783.3478.13217501608215769150.stgit@devnote2 Fixes: ca89bc071d5e ("tracing/kprobe: Add multi-probe per event support") Reported-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index baf58a3612c0..905b10af5d5c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -178,6 +178,16 @@ void __trace_probe_log_err(int offset, int err_type) if (!command) return; + if (trace_probe_log.index >= trace_probe_log.argc) { + /** + * Set the error position is next to the last arg + space. + * Note that len includes the terminal null and the cursor + * appaers at pos + 1. + */ + pos = len; + offset = 0; + } + /* And make a command string from argv array */ p = command; for (i = 0; i < trace_probe_log.argc; i++) { @@ -1084,6 +1094,12 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) { int i; + /* In case of more arguments */ + if (a->nr_args < b->nr_args) + return a->nr_args + 1; + if (a->nr_args > b->nr_args) + return b->nr_args + 1; + for (i = 0; i < a->nr_args; i++) { if ((b->nr_args <= i) || ((a->args[i].type != b->args[i].type) || From 968e5170939662341242812b9c82ef51cf140a33 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 26 Sep 2019 09:22:59 -0700 Subject: [PATCH 1428/2880] tracing: Fix clang -Wint-in-bool-context warnings in IF_ASSIGN macro After r372664 in clang, the IF_ASSIGN macro causes a couple hundred warnings along the lines of: kernel/trace/trace_output.c:1331:2: warning: converting the enum constant to a boolean [-Wint-in-bool-context] kernel/trace/trace.h:409:3: note: expanded from macro 'trace_assign_type' IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, ^ kernel/trace/trace.h:371:14: note: expanded from macro 'IF_ASSIGN' WARN_ON(id && (entry)->type != id); \ ^ 264 warnings generated. This warning can catch issues with constructs like: if (state == A || B) where the developer really meant: if (state == A || state == B) This is currently the only occurrence of the warning in the kernel tree across defconfig, allyesconfig, allmodconfig for arm32, arm64, and x86_64. Add the implicit '!= 0' to the WARN_ON statement to fix the warnings and find potential issues in the future. Link: https://github.com/llvm/llvm-project/commit/28b38c277a2941e9e891b2db30652cfd962f070b Link: https://github.com/ClangBuiltLinux/linux/issues/686 Link: http://lkml.kernel.org/r/20190926162258.466321-1-natechancellor@gmail.com Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 26b0a08f3c7d..f801d154ff6a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -365,11 +365,11 @@ static inline struct trace_array *top_trace_array(void) __builtin_types_compatible_p(typeof(var), type *) #undef IF_ASSIGN -#define IF_ASSIGN(var, entry, etype, id) \ - if (FTRACE_CMP_TYPE(var, etype)) { \ - var = (typeof(var))(entry); \ - WARN_ON(id && (entry)->type != id); \ - break; \ +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id != 0 && (entry)->type != id); \ + break; \ } /* Will cause compile errors if type is not found. */ From 96c5c6e6a5b6db592acae039fed54b5c8844cd35 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Fri, 20 Sep 2019 17:57:59 -0500 Subject: [PATCH 1429/2880] tracing: Have error path in predicate_parse() free its allocated memory In predicate_parse, there is an error path that is not going to out_free instead it returns directly which leads to a memory leak. Link: http://lkml.kernel.org/r/20190920225800.3870-1-navid.emamdoost@gmail.com Signed-off-by: Navid Emamdoost Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c773b8fb270c..c9a74f82b14a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -452,8 +452,10 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, switch (*next) { case '(': /* #2 */ - if (top - op_stack > nr_parens) - return ERR_PTR(-EINVAL); + if (top - op_stack > nr_parens) { + ret = -EINVAL; + goto out_free; + } *(++top) = invert; continue; case '!': /* #3 */ From f7d6316fb43735ae8af969c2e582fbee85709483 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 14 Sep 2019 18:32:15 +0800 Subject: [PATCH 1430/2880] mm, tracing: Print symbol name for call_site in trace events To improve the readability of raw slab trace points, print the call_site ip using '%pS'. Then we can grep events with function names. [002] .... 808.188897: kmem_cache_free: call_site=putname+0x47/0x50 ptr=00000000cef40c80 [002] .... 808.188898: kfree: call_site=security_cred_free+0x42/0x50 ptr=0000000062400820 [002] .... 808.188904: kmem_cache_free: call_site=put_cred_rcu+0x88/0xa0 ptr=0000000058d74ef8 [002] .... 808.188913: kmem_cache_alloc: call_site=prepare_creds+0x26/0x100 ptr=0000000058d74ef8 bytes_req=168 bytes_alloc=576 gfp_flags=GFP_KERNEL [002] .... 808.188917: kmalloc: call_site=security_prepare_creds+0x77/0xa0 ptr=0000000062400820 bytes_req=8 bytes_alloc=336 gfp_flags=GFP_KERNEL|__GFP_ZERO [002] .... 808.188920: kmem_cache_alloc: call_site=getname_flags+0x4f/0x1e0 ptr=00000000cef40c80 bytes_req=4096 bytes_alloc=4480 gfp_flags=GFP_KERNEL [002] .... 808.188925: kmem_cache_free: call_site=putname+0x47/0x50 ptr=00000000cef40c80 [002] .... 808.188926: kfree: call_site=security_cred_free+0x42/0x50 ptr=0000000062400820 [002] .... 808.188931: kmem_cache_free: call_site=put_cred_rcu+0x88/0xa0 ptr=0000000058d74ef8 Link: http://lkml.kernel.org/r/20190914103215.23301-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/kmem.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index eb57e3037deb..69e8bb8963db 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -35,8 +35,8 @@ DECLARE_EVENT_CLASS(kmem_alloc, __entry->gfp_flags = gfp_flags; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", - __entry->call_site, + TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", + (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, @@ -131,7 +131,8 @@ DECLARE_EVENT_CLASS(kmem_free, __entry->ptr = ptr; ), - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) + TP_printk("call_site=%pS ptr=%p", + (void *)__entry->call_site, __entry->ptr) ); DEFINE_EVENT(kmem_free, kfree, From 8ed4889eb83179dbc9a105cfed65cc42ecb61097 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 27 Sep 2019 11:10:22 -0400 Subject: [PATCH 1431/2880] selftests/ftrace: Fix same probe error test The "same probe" selftest that tests that adding the same probe fails doesn't add the same probe and passes, which fails the test. Fixes: b78b94b82122 ("selftests/ftrace: Update kprobe event error testcase") Signed-off-by: Steven Rostedt (VMware) --- .../selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index 8a4025e912cb..ef1e9bafb098 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -95,7 +95,7 @@ echo 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events check_error 'p:kprobes/testevent _do_fork ^bcd=\1' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\1:u8' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\"foo"' # DIFF_ARG_TYPE -check_error '^p:kprobes/testevent _do_fork' # SAME_PROBE +check_error '^p:kprobes/testevent _do_fork abcd=\1' # SAME_PROBE fi exit 0 From dc925a36060e8cef050a9d05c64dae1c30dc5027 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Sep 2019 10:29:49 +0200 Subject: [PATCH 1432/2880] Documentation/process: Clarify disclosure rules The role of the contact list provided by the disclosing party and how it affects the disclosure process and the ability to include experts into the development process is not really well explained. Neither is it entirely clear when the disclosing party will be informed about the fact that a developer who is not covered by an employer NDA needs to be brought in and disclosed. Explain the role of the contact list and the information policy along with an eventual conflict resolution better. Reported-by: Dave Hansen Signed-off-by: Thomas Gleixner Acked-by: Dave Hansen Link: https://lore.kernel.org/r/alpine.DEB.2.21.1909251028390.10825@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman --- .../process/embargoed-hardware-issues.rst | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index e57b9f39c69f..a3c3349046c4 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -143,6 +143,20 @@ via their employer, they cannot enter individual non-disclosure agreements in their role as Linux kernel developers. They will, however, agree to adhere to this documented process and the Memorandum of Understanding. +The disclosing party should provide a list of contacts for all other +entities who have already been, or should be, informed about the issue. +This serves several purposes: + + - The list of disclosed entities allows communication accross the + industry, e.g. other OS vendors, HW vendors, etc. + + - The disclosed entities can be contacted to name experts who should + participate in the mitigation development. + + - If an expert which is required to handle an issue is employed by an + listed entity or member of an listed entity, then the response teams can + request the disclosure of that expert from that entity. This ensures + that the expert is also part of the entity's response team. Disclosure """""""""" @@ -158,10 +172,7 @@ Mitigation development """""""""""""""""""""" The initial response team sets up an encrypted mailing-list or repurposes -an existing one if appropriate. The disclosing party should provide a list -of contacts for all other parties who have already been, or should be, -informed about the issue. The response team contacts these parties so they -can name experts who should be subscribed to the mailing-list. +an existing one if appropriate. Using a mailing-list is close to the normal Linux development process and has been successfully used in developing mitigations for various hardware @@ -175,9 +186,24 @@ development branch against the mainline kernel and backport branches for stable kernel versions as necessary. The initial response team will identify further experts from the Linux -kernel developer community as needed and inform the disclosing party about -their participation. Bringing in experts can happen at any time of the -development process and often needs to be handled in a timely manner. +kernel developer community as needed. Bringing in experts can happen at any +time of the development process and needs to be handled in a timely manner. + +If an expert is employed by or member of an entity on the disclosure list +provided by the disclosing party, then participation will be requested from +the relevant entity. + +If not, then the disclosing party will be informed about the experts +participation. The experts are covered by the Memorandum of Understanding +and the disclosing party is requested to acknowledge the participation. In +case that the disclosing party has a compelling reason to object, then this +objection has to be raised within five work days and resolved with the +incident team immediately. If the disclosing party does not react within +five work days this is taken as silent acknowledgement. + +After acknowledgement or resolution of an objection the expert is disclosed +by the incident team and brought into the development process. + Coordinated release """"""""""""""""""" From 50ee7529ec4500c88f8664560770a7a1b65db72b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 28 Sep 2019 16:53:52 -0700 Subject: [PATCH 1433/2880] random: try to actively add entropy rather than passively wait for it For 5.3 we had to revert a nice ext4 IO pattern improvement, because it caused a bootup regression due to lack of entropy at bootup together with arguably broken user space that was asking for secure random numbers when it really didn't need to. See commit 72dbcf721566 (Revert "ext4: make __ext4_get_inode_loc plug"). This aims to solve the issue by actively generating entropy noise using the CPU cycle counter when waiting for the random number generator to initialize. This only works when you have a high-frequency time stamp counter available, but that's the case on all modern x86 CPU's, and on most other modern CPU's too. What we do is to generate jitter entropy from the CPU cycle counter under a somewhat complex load: calling the scheduler while also guaranteeing a certain amount of timing noise by also triggering a timer. I'm sure we can tweak this, and that people will want to look at other alternatives, but there's been a number of papers written on jitter entropy, and this should really be fairly conservative by crediting one bit of entropy for every timer-induced jump in the cycle counter. Not because the timer itself would be all that unpredictable, but because the interaction between the timer and the loop is going to be. Even if (and perhaps particularly if) the timer actually happens on another CPU, the cacheline interaction between the loop that reads the cycle counter and the timer itself firing is going to add perturbations to the cycle counter values that get mixed into the entropy pool. As Thomas pointed out, with a modern out-of-order CPU, even quite simple loops show a fair amount of hard-to-predict timing variability even in the absense of external interrupts. But this tries to take that further by actually having a fairly complex interaction. This is not going to solve the entropy issue for architectures that have no CPU cycle counter, but it's not clear how (and if) that is solvable, and the hardware in question is largely starting to be irrelevant. And by doing this we can at least avoid some of the even more contentious approaches (like making the entropy waiting time out in order to avoid the possibly unbounded waiting). Cc: Ahmed Darwish Cc: Thomas Gleixner Cc: Theodore Ts'o Cc: Nicholas Mc Guire Cc: Andy Lutomirski Cc: Kees Cook Cc: Willy Tarreau Cc: Alexander E. Patrakov Cc: Lennart Poettering Signed-off-by: Linus Torvalds --- drivers/char/random.c | 62 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 5d5ea4ce1442..2fda6166c1dd 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1731,6 +1731,56 @@ void get_random_bytes(void *buf, int nbytes) } EXPORT_SYMBOL(get_random_bytes); + +/* + * Each time the timer fires, we expect that we got an unpredictable + * jump in the cycle counter. Even if the timer is running on another + * CPU, the timer activity will be touching the stack of the CPU that is + * generating entropy.. + * + * Note that we don't re-arm the timer in the timer itself - we are + * happy to be scheduled away, since that just makes the load more + * complex, but we do not want the timer to keep ticking unless the + * entropy loop is running. + * + * So the re-arming always happens in the entropy loop itself. + */ +static void entropy_timer(struct timer_list *t) +{ + credit_entropy_bits(&input_pool, 1); +} + +/* + * If we have an actual cycle counter, see if we can + * generate enough entropy with timing noise + */ +static void try_to_generate_entropy(void) +{ + struct { + unsigned long now; + struct timer_list timer; + } stack; + + stack.now = random_get_entropy(); + + /* Slow counter - or none. Don't even bother */ + if (stack.now == random_get_entropy()) + return; + + timer_setup_on_stack(&stack.timer, entropy_timer, 0); + while (!crng_ready()) { + if (!timer_pending(&stack.timer)) + mod_timer(&stack.timer, jiffies+1); + mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now)); + schedule(); + stack.now = random_get_entropy(); + } + + del_timer_sync(&stack.timer); + destroy_timer_on_stack(&stack.timer); + mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now)); +} + /* * Wait for the urandom pool to be seeded and thus guaranteed to supply * cryptographically secure random numbers. This applies to: the /dev/urandom @@ -1745,7 +1795,17 @@ int wait_for_random_bytes(void) { if (likely(crng_ready())) return 0; - return wait_event_interruptible(crng_init_wait, crng_ready()); + + do { + int ret; + ret = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ); + if (ret) + return ret > 0 ? 0 : ret; + + try_to_generate_entropy(); + } while (!crng_ready()); + + return 0; } EXPORT_SYMBOL(wait_for_random_bytes); From 02f03c4206c1b2a7451d3b3546f86c9c783eac13 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Sep 2019 17:59:23 -0700 Subject: [PATCH 1434/2880] Revert "Revert "ext4: make __ext4_get_inode_loc plug"" This reverts commit 72dbcf72156641fde4d8ea401e977341bfd35a05. Instead of waiting forever for entropy that may just not happen, we now try to actively generate entropy when required, and are thus hopefully avoiding the problem that caused the nice ext4 IO pattern fix to be reverted. So revert the revert. Cc: Ahmed S. Darwish Cc: Ted Ts'o Cc: Willy Tarreau Cc: Alexander E. Patrakov Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 006b7a2070bf..420fe3deed39 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4586,6 +4586,7 @@ static int __ext4_get_inode_loc(struct inode *inode, struct buffer_head *bh; struct super_block *sb = inode->i_sb; ext4_fsblk_t block; + struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; @@ -4674,6 +4675,7 @@ make_io: * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ + blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; @@ -4704,6 +4706,7 @@ make_io: get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); + blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, From fdbdcddc2c93096e9b956de930d2d710a1342502 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 28 Aug 2019 16:35:19 +0300 Subject: [PATCH 1435/2880] csky: Use generic free_initrd_mem() The csky implementation of free_initrd_mem() is an open-coded version of free_reserved_area() without poisoning. Remove it and make csky use the generic version of free_initrd_mem(). Signed-off-by: Mike Rapoport Signed-off-by: Guo Ren --- arch/csky/mm/init.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index eb0dc9e5065f..d4c2292ea46b 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -60,22 +60,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - if (start < end) - pr_info("Freeing initrd memory: %ldk freed\n", - (end - start) >> 10); - - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - init_page_count(virt_to_page(start)); - free_page(start); - totalram_pages_inc(); - } -} -#endif - extern char __init_begin[], __init_end[]; void free_initmem(void) From 48ede51fd94fe9251058fc85626b2aeb5cbb5884 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 25 Sep 2019 19:56:16 +0800 Subject: [PATCH 1436/2880] csky: Fixup add zero_fp fixup perf backtrace panic We need set fp zero to let backtrace know the end. The patch fixup perf callchain panic problem, because backtrace didn't know what is the end of fp. Signed-off-by: Guo Ren Reported-by: Mao Han --- arch/csky/kernel/entry.S | 50 +++++++++++++++++++++++--------------- arch/csky/kernel/process.c | 2 +- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index a7e84ccccbd8..564dab2fabaa 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -17,6 +17,12 @@ #define PTE_INDX_SHIFT 10 #define _PGDIR_SHIFT 22 +.macro zero_fp +#ifdef CONFIG_STACKTRACE + movi r8, 0 +#endif +.endm + .macro tlbop_begin name, val0, val1, val2 ENTRY(csky_\name) mtcr a3, ss2 @@ -96,6 +102,7 @@ ENTRY(csky_\name) SAVE_ALL 0 .endm .macro tlbop_end is_write + zero_fp RD_MEH a2 psrset ee, ie mov a0, sp @@ -120,6 +127,7 @@ tlbop_end 1 ENTRY(csky_systemcall) SAVE_ALL TRAP0_SIZE + zero_fp psrset ee, ie @@ -136,9 +144,9 @@ ENTRY(csky_systemcall) mov r9, sp bmaski r10, THREAD_SHIFT andn r9, r10 - ldw r8, (r9, TINFO_FLAGS) - ANDI_R3 r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) - cmpnei r8, 0 + ldw r12, (r9, TINFO_FLAGS) + ANDI_R3 r12, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) + cmpnei r12, 0 bt csky_syscall_trace #if defined(__CSKYABIV2__) subi sp, 8 @@ -180,7 +188,7 @@ csky_syscall_trace: ENTRY(ret_from_kernel_thread) jbsr schedule_tail - mov a0, r8 + mov a0, r10 jsr r9 jbsr ret_from_exception @@ -189,9 +197,9 @@ ENTRY(ret_from_fork) mov r9, sp bmaski r10, THREAD_SHIFT andn r9, r10 - ldw r8, (r9, TINFO_FLAGS) - ANDI_R3 r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) - cmpnei r8, 0 + ldw r12, (r9, TINFO_FLAGS) + ANDI_R3 r12, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) + cmpnei r12, 0 bf ret_from_exception mov a0, sp /* sp = pt_regs pointer */ jbsr syscall_trace_exit @@ -209,9 +217,9 @@ ret_from_exception: bmaski r10, THREAD_SHIFT andn r9, r10 - ldw r8, (r9, TINFO_FLAGS) - andi r8, (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED) - cmpnei r8, 0 + ldw r12, (r9, TINFO_FLAGS) + andi r12, (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED) + cmpnei r12, 0 bt exit_work 1: RESTORE_ALL @@ -220,11 +228,11 @@ exit_work: lrw syscallid, ret_from_exception mov lr, syscallid - btsti r8, TIF_NEED_RESCHED + btsti r12, TIF_NEED_RESCHED bt work_resched mov a0, sp - mov a1, r8 + mov a1, r12 jmpi do_notify_resume work_resched: @@ -232,6 +240,7 @@ work_resched: ENTRY(csky_trap) SAVE_ALL 0 + zero_fp psrset ee mov a0, sp /* Push Stack pointer arg */ jbsr trap_c /* Call C-level trap handler */ @@ -265,6 +274,7 @@ ENTRY(csky_get_tls) ENTRY(csky_irq) SAVE_ALL 0 + zero_fp psrset ee #ifdef CONFIG_PREEMPT @@ -276,21 +286,21 @@ ENTRY(csky_irq) * Get task_struct->stack.preempt_count for current, * and increase 1. */ - ldw r8, (r9, TINFO_PREEMPT) - addi r8, 1 - stw r8, (r9, TINFO_PREEMPT) + ldw r12, (r9, TINFO_PREEMPT) + addi r12, 1 + stw r12, (r9, TINFO_PREEMPT) #endif mov a0, sp jbsr csky_do_IRQ #ifdef CONFIG_PREEMPT - subi r8, 1 - stw r8, (r9, TINFO_PREEMPT) - cmpnei r8, 0 + subi r12, 1 + stw r12, (r9, TINFO_PREEMPT) + cmpnei r12, 0 bt 2f - ldw r8, (r9, TINFO_FLAGS) - btsti r8, TIF_NEED_RESCHED + ldw r12, (r9, TINFO_FLAGS) + btsti r12, TIF_NEED_RESCHED bf 2f 1: jbsr preempt_schedule_irq /* irq en/disable is done inside */ diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index e555740c0be5..f320d9248a22 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -55,7 +55,7 @@ int copy_thread(unsigned long clone_flags, if (unlikely(p->flags & PF_KTHREAD)) { memset(childregs, 0, sizeof(struct pt_regs)); childstack->r15 = (unsigned long) ret_from_kernel_thread; - childstack->r8 = kthread_arg; + childstack->r10 = kthread_arg; childstack->r9 = usp; childregs->sr = mfcr("psr"); } else { From 3a09d8e2893b2403a043890e5832966e8640feaf Mon Sep 17 00:00:00 2001 From: Mao Han Date: Wed, 25 Sep 2019 17:23:02 +0800 Subject: [PATCH 1437/2880] csky: Fixup csky_pmu.max_period assignment The csky_pmu.max_period has type u64, and BIT() can only return 32 bits unsigned long on C-SKY. The initialization for max_period will be incorrect when count_width is bigger than 32. Use BIT_ULL() Signed-off-by: Mao Han Signed-off-by: Guo Ren --- arch/csky/kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c index 4c1a1934d76a..7570109cddc6 100644 --- a/arch/csky/kernel/perf_event.c +++ b/arch/csky/kernel/perf_event.c @@ -1306,7 +1306,7 @@ int csky_pmu_device_probe(struct platform_device *pdev, &csky_pmu.count_width)) { csky_pmu.count_width = DEFAULT_COUNT_WIDTH; } - csky_pmu.max_period = BIT(csky_pmu.count_width) - 1; + csky_pmu.max_period = BIT_ULL(csky_pmu.count_width) - 1; csky_pmu.plat_device = pdev; From a2139d3b4fd7ef26a363a1b1eb6cd55be2c1bcd1 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Sep 2019 15:36:14 +0100 Subject: [PATCH 1438/2880] csky: entry: Remove unneeded need_resched() loop Since the enabling and disabling of IRQs within preempt_schedule_irq() is contained in a need_resched() loop, we don't need the outer arch code loop. Signed-off-by: Valentin Schneider Signed-off-by: Guo Ren --- arch/csky/kernel/entry.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index 564dab2fabaa..a7a5b67df898 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -302,11 +302,7 @@ ENTRY(csky_irq) ldw r12, (r9, TINFO_FLAGS) btsti r12, TIF_NEED_RESCHED bf 2f -1: jbsr preempt_schedule_irq /* irq en/disable is done inside */ - ldw r7, (r9, TINFO_FLAGS) /* get new tasks TI_FLAGS */ - btsti r7, TIF_NEED_RESCHED - bt 1b /* go again */ #endif 2: jmpi ret_from_exception From 9af032a30172e119a5935f802b066631f8ded2d6 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 3 Sep 2019 13:36:51 +0200 Subject: [PATCH 1439/2880] csky: Move static keyword to the front of declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the static keyword to the front of declaration of csky_pmu_of_device_ids, and resolve the following compiler warning that can be seen when building with warnings enabled (W=1): arch/csky/kernel/perf_event.c:1340:1: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] Signed-off-by: Krzysztof Wilczynski Signed-off-by: Guo Ren --- arch/csky/kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c index 7570109cddc6..1a29f1157449 100644 --- a/arch/csky/kernel/perf_event.c +++ b/arch/csky/kernel/perf_event.c @@ -1337,7 +1337,7 @@ int csky_pmu_device_probe(struct platform_device *pdev, return ret; } -const static struct of_device_id csky_pmu_of_device_ids[] = { +static const struct of_device_id csky_pmu_of_device_ids[] = { {.compatible = "csky,csky-pmu"}, {}, }; From 55d554f5d14071f7c2c5dbd88d0a2eb695c97d16 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 26 Sep 2019 19:13:44 -0600 Subject: [PATCH 1440/2880] tools: bpf: Use !building_out_of_srctree to determine srctree make TARGETS=bpf kselftest fails with: Makefile:127: tools/build/Makefile.include: No such file or directory When the bpf tool make is invoked from tools Makefile, srctree is cleared and the current logic check for srctree equals to empty string to determine srctree location from CURDIR. When the build in invoked from selftests/bpf Makefile, the srctree is set to "." and the same logic used for srctree equals to empty is needed to determine srctree. Check building_out_of_srctree undefined as the condition for both cases to fix "make TARGETS=bpf kselftest" build failure. Signed-off-by: Shuah Khan Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20190927011344.4695-1-skhan@linuxfoundation.org --- tools/bpf/Makefile | 6 +++++- tools/lib/bpf/Makefile | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index fbf5e4a0cb9c..5d1995fd369c 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -12,7 +12,11 @@ INSTALL ?= install CFLAGS += -Wall -O2 CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/include/uapi -I$(srctree)/include -ifeq ($(srctree),) +# This will work when bpf is built in tools env. where srctree +# isn't set and when invoked from selftests build, where srctree +# is set to ".". building_out_of_srctree is undefined for in srctree +# builds +ifndef building_out_of_srctree srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index c6f94cffe06e..20772663d3e1 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -8,7 +8,11 @@ LIBBPF_MAJOR_VERSION := $(firstword $(subst ., ,$(LIBBPF_VERSION))) MAKEFLAGS += --no-print-directory -ifeq ($(srctree),) +# This will work when bpf is built in tools env. where srctree +# isn't set and when invoked from selftests build, where srctree +# is a ".". building_out_of_srctree is undefined for in srctree +# builds +ifndef building_out_of_srctree srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) srctree := $(patsubst %/,%,$(dir $(srctree))) From d7d44b6fe40a98e960be92ea8617954c2596d140 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Sep 2019 22:33:57 +0200 Subject: [PATCH 1441/2880] drm/tilcdc: include linux/pinctrl/consumer.h again This was apparently dropped by accident in a recent cleanup, causing a build failure in some configurations now: drivers/gpu/drm/tilcdc/tilcdc_tfp410.c:296:12: error: implicit declaration of function 'devm_pinctrl_get_select_default' [-Werror,-Wimplicit-function-declaration] Fixes: fcb57664172e ("drm/tilcdc: drop use of drmP.h") Signed-off-by: Arnd Bergmann Reviewed-by: Laurent Pinchart Signed-off-by: Jyri Sarha Link: https://patchwork.freedesktop.org/patch/msgid/02b03f74cf941f52a941a36bdc8dabb4a69fd87e.1569852588.git.jsarha@ti.com Acked-by: Sam Ravnborg --- drivers/gpu/drm/tilcdc/tilcdc_tfp410.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c b/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c index 525dc1c0f1c1..530edb3b51cc 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include From f90ec6cdf674248dcad85bf9af6e064bf472b841 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 30 Sep 2019 11:54:50 +0300 Subject: [PATCH 1442/2880] ARM: dts: am4372: Set memory bandwidth limit for DISPC Set memory bandwidth limit to filter out resolutions above 720p@60Hz to avoid underflow errors due to the bandwidth needs of higher resolutions. am43xx can not provide enough bandwidth to DISPC to correctly handle 'high' resolutions. Signed-off-by: Peter Ujfalusi Signed-off-by: Tomi Valkeinen Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am4372.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/dts/am4372.dtsi b/arch/arm/boot/dts/am4372.dtsi index 848e2a8884e2..14bbc438055f 100644 --- a/arch/arm/boot/dts/am4372.dtsi +++ b/arch/arm/boot/dts/am4372.dtsi @@ -337,6 +337,8 @@ ti,hwmods = "dss_dispc"; clocks = <&disp_clk>; clock-names = "fck"; + + max-memory-bandwidth = <230000000>; }; rfbi: rfbi@4832a800 { From 833b45de69a6016c4b0cebe6765d526a31a81580 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 30 Sep 2019 18:48:44 +0200 Subject: [PATCH 1443/2880] kvm: x86, powerpc: do not allow clearing largepages debugfs entry The largepages debugfs entry is incremented/decremented as shadow pages are created or destroyed. Clearing it will result in an underflow, which is harmless to KVM but ugly (and could be misinterpreted by tools that use debugfs information), so make this particular statistic read-only. Cc: kvm-ppc@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s.c | 8 ++++---- arch/x86/kvm/x86.c | 6 +++--- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 10 +++++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index d7fcdfa7fee4..ec2547cc5ecb 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -36,8 +36,8 @@ #include "book3s.h" #include "trace.h" -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ /* #define EXIT_DEBUG */ @@ -69,8 +69,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "pthru_all", VCPU_STAT(pthru_all) }, { "pthru_host", VCPU_STAT(pthru_host) }, { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, - { "largepages_2M", VM_STAT(num_2M_pages) }, - { "largepages_1G", VM_STAT(num_1G_pages) }, + { "largepages_2M", VM_STAT(num_2M_pages, .mode = 0444) }, + { "largepages_1G", VM_STAT(num_1G_pages, .mode = 0444) }, { NULL } }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 180c7e88577a..8072acaaf028 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -212,7 +212,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fcb46b3374c6..719fc3e15ea4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1090,6 +1090,7 @@ enum kvm_stat_kind { struct kvm_stat_data { int offset; + int mode; struct kvm *kvm; }; @@ -1097,6 +1098,7 @@ struct kvm_stats_debugfs_item { const char *name; int offset; enum kvm_stat_kind kind; + int mode; }; extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e6de3159e682..fd68fbe0a75d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -617,8 +617,9 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) stat_data->kvm = kvm; stat_data->offset = p->offset; + stat_data->mode = p->mode ? p->mode : 0644; kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, + debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, stat_data, stat_fops_per_vm[p->kind]); } return 0; @@ -3929,7 +3930,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) return -ENOENT; - if (simple_attr_open(inode, file, get, set, fmt)) { + if (simple_attr_open(inode, file, get, + stat_data->mode & S_IWUGO ? set : NULL, + fmt)) { kvm_put_kvm(stat_data->kvm); return -ENOMEM; } @@ -4177,7 +4180,8 @@ static void kvm_init_debug(void) kvm_debugfs_num_entries = 0; for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - debugfs_create_file(p->name, 0644, kvm_debugfs_dir, + int mode = p->mode ? p->mode : 0644; + debugfs_create_file(p->name, mode, kvm_debugfs_dir, (void *)(long)p->offset, stat_fops[p->kind]); } From 54ecb8f7028c5eb3d740bb82b0f1d90f2df63c5c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 30 Sep 2019 10:35:40 -0700 Subject: [PATCH 1444/2880] Linux 5.4-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d456746da347..6f54f2f95743 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 -PATCHLEVEL = 3 +PATCHLEVEL = 4 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Bobtail Squid # *DOCUMENTATION* From 7ae6d93c8f052b7a77ba56ed0f654e22a2876739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Vok=C3=A1=C4=8D?= Date: Thu, 26 Sep 2019 10:59:17 +0200 Subject: [PATCH 1445/2880] net: dsa: qca8k: Use up to 7 ports for all operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The QCA8K family supports up to 7 ports. So use the existing QCA8K_NUM_PORTS define to allocate the switch structure and limit all operations with the switch ports. This was not an issue until commit 0394a63acfe2 ("net: dsa: enable and disable all ports") disabled all unused ports. Since the unused ports 7-11 are outside of the correct register range on this switch some registers were rewritten with invalid content. Fixes: 6b93fb46480a ("net-next: dsa: add new driver for qca8xxx family") Fixes: a0c02161ecfc ("net: dsa: variable number of ports") Fixes: 0394a63acfe2 ("net: dsa: enable and disable all ports") Signed-off-by: Michal Vokáč Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/qca8k.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 684aa51684db..b00274caae4f 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -705,7 +705,7 @@ qca8k_setup(struct dsa_switch *ds) BIT(0) << QCA8K_GLOBAL_FW_CTRL1_UC_DP_S); /* Setup connection between CPU port & user ports */ - for (i = 0; i < DSA_MAX_PORTS; i++) { + for (i = 0; i < QCA8K_NUM_PORTS; i++) { /* CPU port gets connected to all user ports of the switch */ if (dsa_is_cpu_port(ds, i)) { qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(QCA8K_CPU_PORT), @@ -1077,7 +1077,7 @@ qca8k_sw_probe(struct mdio_device *mdiodev) if (id != QCA8K_ID_QCA8337) return -ENODEV; - priv->ds = dsa_switch_alloc(&mdiodev->dev, DSA_MAX_PORTS); + priv->ds = dsa_switch_alloc(&mdiodev->dev, QCA8K_NUM_PORTS); if (!priv->ds) return -ENOMEM; From e9789c7cc182484fc031fd88097eb14cb26c4596 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Sep 2019 18:24:43 -0700 Subject: [PATCH 1446/2880] sch_cbq: validate TCA_CBQ_WRROPT to avoid crash syzbot reported a crash in cbq_normalize_quanta() caused by an out of range cl->priority. iproute2 enforces this check, but malicious users do not. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN PTI Modules linked in: CPU: 1 PID: 26447 Comm: syz-executor.1 Not tainted 5.3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:cbq_normalize_quanta.part.0+0x1fd/0x430 net/sched/sch_cbq.c:902 RSP: 0018:ffff8801a5c333b0 EFLAGS: 00010206 RAX: 0000000020000003 RBX: 00000000fffffff8 RCX: ffffc9000712f000 RDX: 00000000000043bf RSI: ffffffff83be8962 RDI: 0000000100000018 RBP: ffff8801a5c33420 R08: 000000000000003a R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000002ef R13: ffff88018da95188 R14: dffffc0000000000 R15: 0000000000000015 FS: 00007f37d26b1700(0000) GS:ffff8801dad00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004c7cec CR3: 00000001bcd0a006 CR4: 00000000001626f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [] cbq_normalize_quanta include/net/pkt_sched.h:27 [inline] [] cbq_addprio net/sched/sch_cbq.c:1097 [inline] [] cbq_set_wrr+0x2d7/0x450 net/sched/sch_cbq.c:1115 [] cbq_change_class+0x987/0x225b net/sched/sch_cbq.c:1537 [] tc_ctl_tclass+0x555/0xcd0 net/sched/sch_api.c:2329 [] rtnetlink_rcv_msg+0x485/0xc10 net/core/rtnetlink.c:5248 [] netlink_rcv_skb+0x17a/0x460 net/netlink/af_netlink.c:2510 [] rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5266 [] netlink_unicast_kernel net/netlink/af_netlink.c:1324 [inline] [] netlink_unicast+0x536/0x720 net/netlink/af_netlink.c:1350 [] netlink_sendmsg+0x89a/0xd50 net/netlink/af_netlink.c:1939 [] sock_sendmsg_nosec net/socket.c:673 [inline] [] sock_sendmsg+0x12e/0x170 net/socket.c:684 [] ___sys_sendmsg+0x81d/0x960 net/socket.c:2359 [] __sys_sendmsg+0x105/0x1d0 net/socket.c:2397 [] SYSC_sendmsg net/socket.c:2406 [inline] [] SyS_sendmsg+0x29/0x30 net/socket.c:2404 [] do_syscall_64+0x528/0x770 arch/x86/entry/common.c:305 [] entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 06c7a2da21bc..39b427dc7512 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1127,6 +1127,33 @@ static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, }; +static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], + struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, + cbq_policy, extack); + if (err < 0) + return err; + + if (tb[TCA_CBQ_WRROPT]) { + const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]); + + if (wrr->priority > TC_CBQ_MAXPRIO) { + NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO"); + err = -EINVAL; + } + } + return err; +} + static int cbq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1139,13 +1166,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt, hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); q->delay_timer.function = cbq_undelay; - if (!opt) { - NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, - extack); + err = cbq_opt_parse(tb, opt, extack); if (err < 0) return err; @@ -1464,13 +1485,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t struct cbq_class *parent; struct qdisc_rate_table *rtab = NULL; - if (!opt) { - NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, - extack); + err = cbq_opt_parse(tb, opt, extack); if (err < 0) return err; From 0e141f757b2c78c983df893e9993313e2dc21e38 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Fri, 27 Sep 2019 14:58:20 +0800 Subject: [PATCH 1447/2880] erspan: remove the incorrect mtu limit for erspan erspan driver calls ether_setup(), after commit 61e84623ace3 ("net: centralize net_device min/max MTU checking"), the range of mtu is [min_mtu, max_mtu], which is [68, 1500] by default. It causes the dev mtu of the erspan device to not be greater than 1500, this limit value is not correct for ipgre tap device. Tested: Before patch: # ip link set erspan0 mtu 1600 Error: mtu greater than device maximum. After patch: # ip link set erspan0 mtu 1600 # ip -d link show erspan0 21: erspan0@NONE: mtu 1600 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 0 Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a53a543fe055..52690bb3e40f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1446,6 +1446,7 @@ static void erspan_setup(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &erspan_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; From a304c0a60252fde9daccd9f8a07a598021f6956a Mon Sep 17 00:00:00 2001 From: Keerthy Date: Fri, 20 Sep 2019 13:29:46 +0530 Subject: [PATCH 1448/2880] arm64/ARM: configs: Change CONFIG_REMOTEPROC from m to y Commit 6334150e9a36 ("remoteproc: don't allow modular build") changes CONFIG_REMOTEPROC to a boolean from a tristate config option which inhibits all defconfigs marking CONFIG_REMOTEPROC as a module in compiling the remoteproc and dependent config options. So fix the configs to have CONFIG_REMOTEPROC built in. Link: https://lore.kernel.org/r/20190920075946.13282-5-j-keerthy@ti.com Fixes: 6334150e9a36 ("remoteproc: don't allow modular build") Signed-off-by: Keerthy Acked-by: Will Deacon [olof: Fixed up all 4 occurrances in this one commit] Signed-off-by: Olof Johansson --- arch/arm/configs/davinci_all_defconfig | 2 +- arch/arm/configs/multi_v7_defconfig | 2 +- arch/arm/configs/omap2plus_defconfig | 2 +- arch/arm64/configs/defconfig | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/configs/davinci_all_defconfig b/arch/arm/configs/davinci_all_defconfig index b34970ce6b31..01e3c0f4be92 100644 --- a/arch/arm/configs/davinci_all_defconfig +++ b/arch/arm/configs/davinci_all_defconfig @@ -228,7 +228,7 @@ CONFIG_RTC_DRV_OMAP=m CONFIG_DMADEVICES=y CONFIG_TI_EDMA=y CONFIG_COMMON_CLK_PWM=m -CONFIG_REMOTEPROC=m +CONFIG_REMOTEPROC=y CONFIG_DA8XX_REMOTEPROC=m CONFIG_MEMORY=y CONFIG_TI_AEMIF=m diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 13ba53286901..198de8e36d92 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -933,7 +933,7 @@ CONFIG_BCM2835_MBOX=y CONFIG_ROCKCHIP_IOMMU=y CONFIG_TEGRA_IOMMU_GART=y CONFIG_TEGRA_IOMMU_SMMU=y -CONFIG_REMOTEPROC=m +CONFIG_REMOTEPROC=y CONFIG_ST_REMOTEPROC=m CONFIG_RPMSG_VIRTIO=m CONFIG_ASPEED_LPC_CTRL=m diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 64eb896907bf..af4069476582 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -481,7 +481,7 @@ CONFIG_RTC_DRV_OMAP=m CONFIG_RTC_DRV_CPCAP=m CONFIG_DMADEVICES=y CONFIG_OMAP_IOMMU=y -CONFIG_REMOTEPROC=m +CONFIG_REMOTEPROC=y CONFIG_OMAP_REMOTEPROC=m CONFIG_WKUP_M3_RPROC=m CONFIG_SOC_TI=y diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 8e05c39eab08..c9a867ac32d4 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -723,7 +723,7 @@ CONFIG_TEGRA_IOMMU_SMMU=y CONFIG_ARM_SMMU=y CONFIG_ARM_SMMU_V3=y CONFIG_QCOM_IOMMU=y -CONFIG_REMOTEPROC=m +CONFIG_REMOTEPROC=y CONFIG_QCOM_Q6V5_MSS=m CONFIG_QCOM_Q6V5_PAS=m CONFIG_QCOM_SYSMON=m From 2511366797fa6ab4a404b4b000ef7cd262aaafe8 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Mon, 9 Sep 2019 20:42:35 +0200 Subject: [PATCH 1449/2880] arm64: dts: allwinner: a64: pine64-plus: Add PHY regulator delay Depending on kernel and bootloader configuration, it's possible that Realtek ethernet PHY isn't powered on properly. According to the datasheet, it needs 30ms to power up and then some more time before it can be used. Fix that by adding 100ms ramp delay to regulator responsible for powering PHY. Fixes: 94dcfdc77fc5 ("arm64: allwinner: pine64-plus: Enable dwmac-sun8i") Suggested-by: Ondrej Jirman Signed-off-by: Jernej Skrabec Signed-off-by: Maxime Ripard --- arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts index 24f1aac366d6..d5b6e8159a33 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts @@ -63,3 +63,12 @@ reg = <1>; }; }; + +®_dc1sw { + /* + * Ethernet PHY needs 30ms to properly power up and some more + * to initialize. 100ms should be plenty of time to finish + * whole process. + */ + regulator-enable-ramp-delay = <100000>; +}; From ed3e9406bcbc32f84dc4aa4cb4767852e5ab086c Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Tue, 6 Aug 2019 07:01:35 -0700 Subject: [PATCH 1450/2880] arm64: dts: allwinner: a64: Drop PMU node Looks like PMU in A64 is broken, it generates no interrupts at all and as result 'perf top' shows no events. Tested on Pine64-LTS. Fixes: 34a97fcc71c2 ("arm64: dts: allwinner: a64: Add PMU node") Cc: Harald Geyer Cc: Jared D. McNeill Signed-off-by: Vasily Khoruzhick Reviewed-by: Emmanuel Vadot Signed-off-by: Maxime Ripard --- arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi index 69128a6dfc46..d0028934e11c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi @@ -142,15 +142,6 @@ clock-output-names = "ext-osc32k"; }; - pmu { - compatible = "arm,cortex-a53-pmu"; - interrupts = , - , - , - ; - interrupt-affinity = <&cpu0>, <&cpu1>, <&cpu2>, <&cpu3>; - }; - psci { compatible = "arm,psci-0.2"; method = "smc"; From ccdf3aaa27ded6db9a93eed3ca7468bb2353b8fe Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Sun, 29 Sep 2019 10:52:59 +0200 Subject: [PATCH 1451/2880] arm64: dts: allwinner: a64: sopine-baseboard: Add PHY regulator delay It turns out that sopine-baseboard needs same fix as pine64-plus for ethernet PHY. Here too Realtek ethernet PHY chip needs additional power on delay to properly initialize. Datasheet mentions that chip needs 30 ms to be properly powered on and that it needs some more time to be initialized. Fix that by adding 100ms ramp delay to regulator responsible for powering PHY. Note that issue was found out and fix tested on pine64-lts, but it's basically the same as sopine-baseboard, only layout and connectors differ. Fixes: bdfe4cebea11 ("arm64: allwinner: a64: add Ethernet PHY regulator for several boards") Signed-off-by: Jernej Skrabec Signed-off-by: Maxime Ripard --- .../boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts index e6fb9683f213..25099202c52c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts @@ -159,6 +159,12 @@ }; ®_dc1sw { + /* + * Ethernet PHY needs 30ms to properly power up and some more + * to initialize. 100ms should be plenty of time to finish + * whole process. + */ + regulator-enable-ramp-delay = <100000>; regulator-name = "vcc-phy"; }; From b1ba55cf1cfb9f3e0e00d743534684a25bf66d28 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 27 Sep 2019 11:30:30 -0300 Subject: [PATCH 1452/2880] tools headers uapi: Sync asm-generic/mman-common.h with the kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To pick the changes from: 1a4e58cce84e ("mm: introduce MADV_PAGEOUT") 9c276cc65a58 ("mm: introduce MADV_COLD") That result in these changes in the tools: $ tools/perf/trace/beauty/madvise_behavior.sh > before $ cp include/uapi/asm-generic/mman-common.h tools/include/uapi/asm-generic/mman-common.h $ git diff diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 63b1f506ea67..c160a5354eb6 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -67,6 +67,9 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ + /* compatibility flags */ #define MAP_FILE 0 $ tools/perf/trace/beauty/madvise_behavior.sh > after $ diff -u before after --- before 2019-09-27 11:29:43.346320100 -0300 +++ after 2019-09-27 11:30:03.838570439 -0300 @@ -16,6 +16,8 @@ [17] = "DODUMP", [18] = "WIPEONFORK", [19] = "KEEPONFORK", + [20] = "COLD", + [21] = "PAGEOUT", [100] = "HWPOISON", [101] = "SOFT_OFFLINE", }; $ I.e. now when madvise gets those behaviours as args, it will be able to translate from the number to a human readable string. This addresses the following perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/mman-common.h' differs from latest version at 'include/uapi/asm-generic/mman-common.h' diff -u tools/include/uapi/asm-generic/mman-common.h include/uapi/asm-generic/mman-common.h Cc: Adrian Hunter Cc: Brendan Gregg Cc: Jiri Olsa Cc: Luis Cláudio Gonçalves Cc: Minchan Kim Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-n40y6c4sa49p29q6sl8w3ufx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/mman-common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 63b1f506ea67..c160a5354eb6 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -67,6 +67,9 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ + /* compatibility flags */ #define MAP_FILE 0 From 05f371f8c55d69e4c04db4473085303291e4e734 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 27 Sep 2019 11:42:26 -0300 Subject: [PATCH 1453/2880] tools headers uapi: Sync linux/usbdevice_fs.h with the kernel sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To pick up the changes from: 4ed3350539aa ("USB: usbfs: Add a capability flag for runtime suspend") 7794f486ed0b ("usbfs: Add ioctls for runtime power management") This triggers these changes in the kernel sources, automagically supporting these new ioctls in the 'perf trace' beautifiers. Soon this will be used in things like filter expressions for tracepoints in 'perf record', 'perf trace', 'perf top', i.e. filter expressions will do a lookup to turn things like USBDEVFS_WAIT_FOR_RESUME into _IO('U', 35) before associating the tracepoint expression to tracepoint perf event. $ tools/perf/trace/beauty/usbdevfs_ioctl.sh > before $ cp include/uapi/linux/usbdevice_fs.h tools/include/uapi/linux/usbdevice_fs.h $ git diff diff --git a/tools/include/uapi/linux/usbdevice_fs.h b/tools/include/uapi/linux/usbdevice_fs.h index 78efe870c2b7..cf525cddeb94 100644 --- a/tools/include/uapi/linux/usbdevice_fs.h +++ b/tools/include/uapi/linux/usbdevice_fs.h @@ -158,6 +158,7 @@ struct usbdevfs_hub_portinfo { #define USBDEVFS_CAP_MMAP 0x20 #define USBDEVFS_CAP_DROP_PRIVILEGES 0x40 #define USBDEVFS_CAP_CONNINFO_EX 0x80 +#define USBDEVFS_CAP_SUSPEND 0x100 /* USBDEVFS_DISCONNECT_CLAIM flags & struct */ @@ -223,5 +224,8 @@ struct usbdevfs_streams { * extending size of the data returned. */ #define USBDEVFS_CONNINFO_EX(len) _IOC(_IOC_READ, 'U', 32, len) +#define USBDEVFS_FORBID_SUSPEND _IO('U', 33) +#define USBDEVFS_ALLOW_SUSPEND _IO('U', 34) +#define USBDEVFS_WAIT_FOR_RESUME _IO('U', 35) #endif /* _UAPI_LINUX_USBDEVICE_FS_H */ $ tools/perf/trace/beauty/usbdevfs_ioctl.sh > after $ diff -u before after --- before 2019-09-27 11:41:50.634867620 -0300 +++ after 2019-09-27 11:42:07.453102978 -0300 @@ -24,6 +24,9 @@ [30] = "DROP_PRIVILEGES", [31] = "GET_SPEED", [32] = "CONNINFO_EX", + [33] = "FORBID_SUSPEND", + [34] = "ALLOW_SUSPEND", + [35] = "WAIT_FOR_RESUME", [3] = "RESETEP", [4] = "SETINTERFACE", [5] = "SETCONFIGURATION", $ This addresses the following perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/usbdevice_fs.h' differs from latest version at 'include/uapi/linux/usbdevice_fs.h' diff -u tools/include/uapi/linux/usbdevice_fs.h include/uapi/linux/usbdevice_fs.h Cc: Alan Stern Cc: Adrian Hunter Cc: Brendan Gregg Cc: Greg Kroah-Hartman Cc: Jiri Olsa Cc: Luis Cláudio Gonçalves Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-x1rb109b9nfi7pukota82xhj@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/usbdevice_fs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/uapi/linux/usbdevice_fs.h b/tools/include/uapi/linux/usbdevice_fs.h index 78efe870c2b7..cf525cddeb94 100644 --- a/tools/include/uapi/linux/usbdevice_fs.h +++ b/tools/include/uapi/linux/usbdevice_fs.h @@ -158,6 +158,7 @@ struct usbdevfs_hub_portinfo { #define USBDEVFS_CAP_MMAP 0x20 #define USBDEVFS_CAP_DROP_PRIVILEGES 0x40 #define USBDEVFS_CAP_CONNINFO_EX 0x80 +#define USBDEVFS_CAP_SUSPEND 0x100 /* USBDEVFS_DISCONNECT_CLAIM flags & struct */ @@ -223,5 +224,8 @@ struct usbdevfs_streams { * extending size of the data returned. */ #define USBDEVFS_CONNINFO_EX(len) _IOC(_IOC_READ, 'U', 32, len) +#define USBDEVFS_FORBID_SUSPEND _IO('U', 33) +#define USBDEVFS_ALLOW_SUSPEND _IO('U', 34) +#define USBDEVFS_WAIT_FOR_RESUME _IO('U', 35) #endif /* _UAPI_LINUX_USBDEVICE_FS_H */ From 0ae4061223a3d097222cbec6599370e54db17731 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 27 Sep 2019 12:01:26 -0300 Subject: [PATCH 1454/2880] tools headers uapi: Sync linux/fs.h with the kernel sources To pick the changes from: 78a1b96bcf7a ("fscrypt: add FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS ioctl") 23c688b54016 ("fscrypt: allow unprivileged users to add/remove keys for v2 policies") 5dae460c2292 ("fscrypt: v2 encryption policy support") 5a7e29924dac ("fscrypt: add FS_IOC_GET_ENCRYPTION_KEY_STATUS ioctl") b1c0ec3599f4 ("fscrypt: add FS_IOC_REMOVE_ENCRYPTION_KEY ioctl") 22d94f493bfb ("fscrypt: add FS_IOC_ADD_ENCRYPTION_KEY ioctl") 3b6df59bc4d2 ("fscrypt: use FSCRYPT_* definitions, not FS_*") 2336d0deb2d4 ("fscrypt: use FSCRYPT_ prefix for uapi constants") 7af0ab0d3aab ("fs, fscrypt: move uapi definitions to new header ") That don't trigger any changes in tooling, as it so far is used only for: $ grep -l 'fs\.h' tools/perf/trace/beauty/*.sh | xargs grep regex= tools/perf/trace/beauty/rename_flags.sh:regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+RENAME_([[:alnum:]_]+)[[:space:]]+\(1[[:space:]]*<<[[:space:]]*([[:xdigit:]]+)[[:space:]]*\)[[:space:]]*.*' tools/perf/trace/beauty/sync_file_range.sh:regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+SYNC_FILE_RANGE_([[:alnum:]_]+)[[:space:]]+([[:xdigit:]]+)[[:space:]]*.*' tools/perf/trace/beauty/usbdevfs_ioctl.sh:regex="^#[[:space:]]*define[[:space:]]+USBDEVFS_(\w+)(\(\w+\))?[[:space:]]+_IO[CWR]{0,2}\([[:space:]]*(_IOC_\w+,[[:space:]]*)?'U'[[:space:]]*,[[:space:]]*([[:digit:]]+).*" tools/perf/trace/beauty/usbdevfs_ioctl.sh:regex="^#[[:space:]]*define[[:space:]]+USBDEVFS_(\w+)[[:space:]]+_IO[WR]{0,2}\([[:space:]]*'U'[[:space:]]*,[[:space:]]*([[:digit:]]+).*" $ This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/fs.h' differs from latest version at 'include/uapi/linux/fs.h' diff -u tools/include/uapi/linux/fs.h include/uapi/linux/fs.h Cc: Adrian Hunter Cc: Eric Biggers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-44g48exl9br9ba0t64chqb4i@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fs.h | 55 +-------- tools/include/uapi/linux/fscrypt.h | 181 +++++++++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + 3 files changed, 186 insertions(+), 51 deletions(-) create mode 100644 tools/include/uapi/linux/fscrypt.h diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index 2a616aa3f686..379a612f8f1d 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -13,6 +13,9 @@ #include #include #include +#ifndef __KERNEL__ +#include +#endif /* Use of MS_* flags within the kernel is restricted to core mount(2) code. */ #if !defined(__KERNEL__) @@ -212,57 +215,6 @@ struct fsxattr { #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) -/* - * File system encryption support - */ -/* Policy provided via an ioctl on the topmost directory */ -#define FS_KEY_DESCRIPTOR_SIZE 8 - -#define FS_POLICY_FLAGS_PAD_4 0x00 -#define FS_POLICY_FLAGS_PAD_8 0x01 -#define FS_POLICY_FLAGS_PAD_16 0x02 -#define FS_POLICY_FLAGS_PAD_32 0x03 -#define FS_POLICY_FLAGS_PAD_MASK 0x03 -#define FS_POLICY_FLAG_DIRECT_KEY 0x04 /* use master key directly */ -#define FS_POLICY_FLAGS_VALID 0x07 - -/* Encryption algorithms */ -#define FS_ENCRYPTION_MODE_INVALID 0 -#define FS_ENCRYPTION_MODE_AES_256_XTS 1 -#define FS_ENCRYPTION_MODE_AES_256_GCM 2 -#define FS_ENCRYPTION_MODE_AES_256_CBC 3 -#define FS_ENCRYPTION_MODE_AES_256_CTS 4 -#define FS_ENCRYPTION_MODE_AES_128_CBC 5 -#define FS_ENCRYPTION_MODE_AES_128_CTS 6 -#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* Removed, do not use. */ -#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* Removed, do not use. */ -#define FS_ENCRYPTION_MODE_ADIANTUM 9 - -struct fscrypt_policy { - __u8 version; - __u8 contents_encryption_mode; - __u8 filenames_encryption_mode; - __u8 flags; - __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; -}; - -#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) -#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) -#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) - -/* Parameters for passing an encryption key into the kernel keyring */ -#define FS_KEY_DESC_PREFIX "fscrypt:" -#define FS_KEY_DESC_PREFIX_SIZE 8 - -/* Structure that userspace passes to the kernel keyring */ -#define FS_MAX_KEY_SIZE 64 - -struct fscrypt_key { - __u32 mode; - __u8 raw[FS_MAX_KEY_SIZE]; - __u32 size; -}; - /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) * @@ -306,6 +258,7 @@ struct fscrypt_key { #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define FS_HUGE_FILE_FL 0x00040000 /* Reserved for ext4 */ #define FS_EXTENT_FL 0x00080000 /* Extents */ +#define FS_VERITY_FL 0x00100000 /* Verity protected inode */ #define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ diff --git a/tools/include/uapi/linux/fscrypt.h b/tools/include/uapi/linux/fscrypt.h new file mode 100644 index 000000000000..39ccfe9311c3 --- /dev/null +++ b/tools/include/uapi/linux/fscrypt.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * fscrypt user API + * + * These ioctls can be used on filesystems that support fscrypt. See the + * "User API" section of Documentation/filesystems/fscrypt.rst. + */ +#ifndef _UAPI_LINUX_FSCRYPT_H +#define _UAPI_LINUX_FSCRYPT_H + +#include + +/* Encryption policy flags */ +#define FSCRYPT_POLICY_FLAGS_PAD_4 0x00 +#define FSCRYPT_POLICY_FLAGS_PAD_8 0x01 +#define FSCRYPT_POLICY_FLAGS_PAD_16 0x02 +#define FSCRYPT_POLICY_FLAGS_PAD_32 0x03 +#define FSCRYPT_POLICY_FLAGS_PAD_MASK 0x03 +#define FSCRYPT_POLICY_FLAG_DIRECT_KEY 0x04 +#define FSCRYPT_POLICY_FLAGS_VALID 0x07 + +/* Encryption algorithms */ +#define FSCRYPT_MODE_AES_256_XTS 1 +#define FSCRYPT_MODE_AES_256_CTS 4 +#define FSCRYPT_MODE_AES_128_CBC 5 +#define FSCRYPT_MODE_AES_128_CTS 6 +#define FSCRYPT_MODE_ADIANTUM 9 +#define __FSCRYPT_MODE_MAX 9 + +/* + * Legacy policy version; ad-hoc KDF and no key verification. + * For new encrypted directories, use fscrypt_policy_v2 instead. + * + * Careful: the .version field for this is actually 0, not 1. + */ +#define FSCRYPT_POLICY_V1 0 +#define FSCRYPT_KEY_DESCRIPTOR_SIZE 8 +struct fscrypt_policy_v1 { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; +}; +#define fscrypt_policy fscrypt_policy_v1 + +/* + * Process-subscribed "logon" key description prefix and payload format. + * Deprecated; prefer FS_IOC_ADD_ENCRYPTION_KEY instead. + */ +#define FSCRYPT_KEY_DESC_PREFIX "fscrypt:" +#define FSCRYPT_KEY_DESC_PREFIX_SIZE 8 +#define FSCRYPT_MAX_KEY_SIZE 64 +struct fscrypt_key { + __u32 mode; + __u8 raw[FSCRYPT_MAX_KEY_SIZE]; + __u32 size; +}; + +/* + * New policy version with HKDF and key verification (recommended). + */ +#define FSCRYPT_POLICY_V2 2 +#define FSCRYPT_KEY_IDENTIFIER_SIZE 16 +struct fscrypt_policy_v2 { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 __reserved[4]; + __u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; +}; + +/* Struct passed to FS_IOC_GET_ENCRYPTION_POLICY_EX */ +struct fscrypt_get_policy_ex_arg { + __u64 policy_size; /* input/output */ + union { + __u8 version; + struct fscrypt_policy_v1 v1; + struct fscrypt_policy_v2 v2; + } policy; /* output */ +}; + +/* + * v1 policy keys are specified by an arbitrary 8-byte key "descriptor", + * matching fscrypt_policy_v1::master_key_descriptor. + */ +#define FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR 1 + +/* + * v2 policy keys are specified by a 16-byte key "identifier" which the kernel + * calculates as a cryptographic hash of the key itself, + * matching fscrypt_policy_v2::master_key_identifier. + */ +#define FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER 2 + +/* + * Specifies a key, either for v1 or v2 policies. This doesn't contain the + * actual key itself; this is just the "name" of the key. + */ +struct fscrypt_key_specifier { + __u32 type; /* one of FSCRYPT_KEY_SPEC_TYPE_* */ + __u32 __reserved; + union { + __u8 __reserved[32]; /* reserve some extra space */ + __u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; + __u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; + } u; +}; + +/* Struct passed to FS_IOC_ADD_ENCRYPTION_KEY */ +struct fscrypt_add_key_arg { + struct fscrypt_key_specifier key_spec; + __u32 raw_size; + __u32 __reserved[9]; + __u8 raw[]; +}; + +/* Struct passed to FS_IOC_REMOVE_ENCRYPTION_KEY */ +struct fscrypt_remove_key_arg { + struct fscrypt_key_specifier key_spec; +#define FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY 0x00000001 +#define FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS 0x00000002 + __u32 removal_status_flags; /* output */ + __u32 __reserved[5]; +}; + +/* Struct passed to FS_IOC_GET_ENCRYPTION_KEY_STATUS */ +struct fscrypt_get_key_status_arg { + /* input */ + struct fscrypt_key_specifier key_spec; + __u32 __reserved[6]; + + /* output */ +#define FSCRYPT_KEY_STATUS_ABSENT 1 +#define FSCRYPT_KEY_STATUS_PRESENT 2 +#define FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED 3 + __u32 status; +#define FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF 0x00000001 + __u32 status_flags; + __u32 user_count; + __u32 __out_reserved[13]; +}; + +#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) +#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) +#define FS_IOC_GET_ENCRYPTION_POLICY_EX _IOWR('f', 22, __u8[9]) /* size + version */ +#define FS_IOC_ADD_ENCRYPTION_KEY _IOWR('f', 23, struct fscrypt_add_key_arg) +#define FS_IOC_REMOVE_ENCRYPTION_KEY _IOWR('f', 24, struct fscrypt_remove_key_arg) +#define FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS _IOWR('f', 25, struct fscrypt_remove_key_arg) +#define FS_IOC_GET_ENCRYPTION_KEY_STATUS _IOWR('f', 26, struct fscrypt_get_key_status_arg) + +/**********************************************************************/ + +/* old names; don't add anything new here! */ +#ifndef __KERNEL__ +#define FS_KEY_DESCRIPTOR_SIZE FSCRYPT_KEY_DESCRIPTOR_SIZE +#define FS_POLICY_FLAGS_PAD_4 FSCRYPT_POLICY_FLAGS_PAD_4 +#define FS_POLICY_FLAGS_PAD_8 FSCRYPT_POLICY_FLAGS_PAD_8 +#define FS_POLICY_FLAGS_PAD_16 FSCRYPT_POLICY_FLAGS_PAD_16 +#define FS_POLICY_FLAGS_PAD_32 FSCRYPT_POLICY_FLAGS_PAD_32 +#define FS_POLICY_FLAGS_PAD_MASK FSCRYPT_POLICY_FLAGS_PAD_MASK +#define FS_POLICY_FLAG_DIRECT_KEY FSCRYPT_POLICY_FLAG_DIRECT_KEY +#define FS_POLICY_FLAGS_VALID FSCRYPT_POLICY_FLAGS_VALID +#define FS_ENCRYPTION_MODE_INVALID 0 /* never used */ +#define FS_ENCRYPTION_MODE_AES_256_XTS FSCRYPT_MODE_AES_256_XTS +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 /* never used */ +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 /* never used */ +#define FS_ENCRYPTION_MODE_AES_256_CTS FSCRYPT_MODE_AES_256_CTS +#define FS_ENCRYPTION_MODE_AES_128_CBC FSCRYPT_MODE_AES_128_CBC +#define FS_ENCRYPTION_MODE_AES_128_CTS FSCRYPT_MODE_AES_128_CTS +#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* removed */ +#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* removed */ +#define FS_ENCRYPTION_MODE_ADIANTUM FSCRYPT_MODE_ADIANTUM +#define FS_KEY_DESC_PREFIX FSCRYPT_KEY_DESC_PREFIX +#define FS_KEY_DESC_PREFIX_SIZE FSCRYPT_KEY_DESC_PREFIX_SIZE +#define FS_MAX_KEY_SIZE FSCRYPT_MAX_KEY_SIZE +#endif /* !__KERNEL__ */ + +#endif /* _UAPI_LINUX_FSCRYPT_H */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index e2e0f06c97d0..cea13cb987d0 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -8,6 +8,7 @@ include/uapi/drm/i915_drm.h include/uapi/linux/fadvise.h include/uapi/linux/fcntl.h include/uapi/linux/fs.h +include/uapi/linux/fscrypt.h include/uapi/linux/kcmp.h include/uapi/linux/kvm.h include/uapi/linux/in.h From b7ad6108484221f431372b94763b74e550d16c93 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 27 Sep 2019 12:18:21 -0300 Subject: [PATCH 1455/2880] tools headers kvm: Sync kvm headers with the kernel sources To pick the changes in: 200824f55eef ("KVM: s390: Disallow invalid bits in kvm_valid_regs and kvm_dirty_regs") 4a53d99dd0c2 ("KVM: VMX: Introduce exit reason for receiving INIT signal on guest-mode") 7396d337cfad ("KVM: x86: Return to userspace with internal error on unexpected exit reason") 92f35b751c71 ("KVM: arm/arm64: vgic: Allow more than 256 vcpus for KVM_IRQ_LINE") None of them trigger any changes in tooling, this time this is just to silence these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/vmx.h' differs from latest version at 'arch/x86/include/uapi/asm/vmx.h' diff -u tools/arch/x86/include/uapi/asm/vmx.h arch/x86/include/uapi/asm/vmx.h Warning: Kernel ABI header at 'tools/arch/s390/include/uapi/asm/kvm.h' differs from latest version at 'arch/s390/include/uapi/asm/kvm.h' diff -u tools/arch/s390/include/uapi/asm/kvm.h arch/s390/include/uapi/asm/kvm.h Warning: Kernel ABI header at 'tools/arch/arm/include/uapi/asm/kvm.h' differs from latest version at 'arch/arm/include/uapi/asm/kvm.h' diff -u tools/arch/arm/include/uapi/asm/kvm.h arch/arm/include/uapi/asm/kvm.h Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/kvm.h' differs from latest version at 'arch/arm64/include/uapi/asm/kvm.h' diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Cc: Adrian Hunter Cc: Janosch Frank Cc: Jiri Olsa Cc: Liran Alon Cc: Marc Zyngier Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Thomas Huth Link: https://lkml.kernel.org/n/tip-akuugvvjxte26kzv23zp5d2z@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm/include/uapi/asm/kvm.h | 4 +++- tools/arch/arm64/include/uapi/asm/kvm.h | 4 +++- tools/arch/s390/include/uapi/asm/kvm.h | 6 ++++++ tools/arch/x86/include/uapi/asm/vmx.h | 2 ++ tools/include/uapi/linux/kvm.h | 3 +++ 5 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h index a4217c1a5d01..2769360f195c 100644 --- a/tools/arch/arm/include/uapi/asm/kvm.h +++ b/tools/arch/arm/include/uapi/asm/kvm.h @@ -266,8 +266,10 @@ struct kvm_vcpu_events { #define KVM_DEV_ARM_ITS_CTRL_RESET 4 /* KVM_IRQ_LINE irq field index values */ +#define KVM_ARM_IRQ_VCPU2_SHIFT 28 +#define KVM_ARM_IRQ_VCPU2_MASK 0xf #define KVM_ARM_IRQ_TYPE_SHIFT 24 -#define KVM_ARM_IRQ_TYPE_MASK 0xff +#define KVM_ARM_IRQ_TYPE_MASK 0xf #define KVM_ARM_IRQ_VCPU_SHIFT 16 #define KVM_ARM_IRQ_VCPU_MASK 0xff #define KVM_ARM_IRQ_NUM_SHIFT 0 diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 9a507716ae2f..67c21f9bdbad 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -325,8 +325,10 @@ struct kvm_vcpu_events { #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 /* KVM_IRQ_LINE irq field index values */ +#define KVM_ARM_IRQ_VCPU2_SHIFT 28 +#define KVM_ARM_IRQ_VCPU2_MASK 0xf #define KVM_ARM_IRQ_TYPE_SHIFT 24 -#define KVM_ARM_IRQ_TYPE_MASK 0xff +#define KVM_ARM_IRQ_TYPE_MASK 0xf #define KVM_ARM_IRQ_VCPU_SHIFT 16 #define KVM_ARM_IRQ_VCPU_MASK 0xff #define KVM_ARM_IRQ_NUM_SHIFT 0 diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 47104e5b47fd..436ec7636927 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -231,6 +231,12 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) #define KVM_SYNC_ETOKEN (1UL << 11) + +#define KVM_SYNC_S390_VALID_FIELDS \ + (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ + KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) + /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 #define SDNXL (1UL << SDNXC) diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index f0b0c90dd398..f01950aa7fae 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -31,6 +31,7 @@ #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT_SIGNAL 3 #define EXIT_REASON_PENDING_INTERRUPT 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -90,6 +91,7 @@ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 5e3f12d5359e..233efbb1c81c 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -243,6 +243,8 @@ struct kvm_hyperv_exit { #define KVM_INTERNAL_ERROR_SIMUL_EX 2 /* Encounter unexpected vm-exit due to delivery event. */ #define KVM_INTERNAL_ERROR_DELIVERY_EV 3 +/* Encounter unexpected vm-exit reason */ +#define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON 4 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { @@ -996,6 +998,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_PTRAUTH_ADDRESS 171 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172 #define KVM_CAP_PMU_EVENT_FILTER 173 +#define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #ifdef KVM_CAP_IRQ_ROUTING From 7d4c85b7035eb2f9ab217ce649dcd1bfaf0cacd3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 26 Sep 2019 15:00:18 -0700 Subject: [PATCH 1456/2880] perf llvm: Don't access out-of-scope array The 'test_dir' variable is assigned to the 'release' array which is out-of-scope 3 lines later. Extend the scope of the 'release' array so that an out-of-scope array isn't accessed. Bug detected by clang's address sanitizer. Fixes: 07bc5c699a3d ("perf tools: Make fetch_kernel_version() publicly available") Cc: stable@vger.kernel.org # v4.4+ Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lore.kernel.org/lkml/20190926220018.25402-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/llvm-utils.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index 8d04e3d070b1..8b14e4a7f1dc 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -233,14 +233,14 @@ static int detect_kbuild_dir(char **kbuild_dir) const char *prefix_dir = ""; const char *suffix_dir = ""; + /* _UTSNAME_LENGTH is 65 */ + char release[128]; + char *autoconf_path; int err; if (!test_dir) { - /* _UTSNAME_LENGTH is 65 */ - char release[128]; - err = fetch_kernel_version(NULL, release, sizeof(release)); if (err) From 02d084792273e8a5f1813dcad988229a45be96ea Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 27 Sep 2019 10:11:46 +0200 Subject: [PATCH 1457/2880] perf vendor events s390: Add JSON transaction for machine type 8561 Add s390 transaction counter definition for machine 8561. This is the same file as for the predecessor machine. Fixes: 6e67d77d673d ("perf vendor events s390: Add JSON files for machine type 8561") Signed-off-by: Thomas Richter Cc: Heiko Carstens Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20190927081147.18345-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/s390/cf_m8561/transaction.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tools/perf/pmu-events/arch/s390/cf_m8561/transaction.json diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/transaction.json b/tools/perf/pmu-events/arch/s390/cf_m8561/transaction.json new file mode 100644 index 000000000000..1a0034f79f73 --- /dev/null +++ b/tools/perf/pmu-events/arch/s390/cf_m8561/transaction.json @@ -0,0 +1,7 @@ +[ + { + "BriefDescription": "Transaction count", + "MetricName": "transaction", + "MetricExpr": "TX_C_TEND + TX_NC_TEND + TX_NC_TABORT + TX_C_TABORT_SPECIAL + TX_C_TABORT_NO_SPECIAL" + } +] From 0d0e5ecec6116db6031829299e74cc71240c9ff3 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 27 Sep 2019 10:11:47 +0200 Subject: [PATCH 1458/2880] perf vendor events s390: Use s390 machine name instead of type 8561 In the pmu-events directory for JSON file definitions use the official machine name IBM z15 instead of machine type number 8561. This is consistent with previous machines. Signed-off-by: Thomas Richter Cc: Heiko Carstens Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20190927081147.18345-2-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/s390/{cf_m8561 => cf_z15}/basic.json | 0 .../perf/pmu-events/arch/s390/{cf_m8561 => cf_z15}/crypto.json | 0 .../perf/pmu-events/arch/s390/{cf_m8561 => cf_z15}/crypto6.json | 0 .../pmu-events/arch/s390/{cf_m8561 => cf_z15}/extended.json | 0 .../pmu-events/arch/s390/{cf_m8561 => cf_z15}/transaction.json | 0 tools/perf/pmu-events/arch/s390/mapfile.csv | 2 +- 6 files changed, 1 insertion(+), 1 deletion(-) rename tools/perf/pmu-events/arch/s390/{cf_m8561 => cf_z15}/basic.json (100%) rename tools/perf/pmu-events/arch/s390/{cf_m8561 => cf_z15}/crypto.json (100%) rename tools/perf/pmu-events/arch/s390/{cf_m8561 => cf_z15}/crypto6.json (100%) rename tools/perf/pmu-events/arch/s390/{cf_m8561 => cf_z15}/extended.json (100%) rename tools/perf/pmu-events/arch/s390/{cf_m8561 => cf_z15}/transaction.json (100%) diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/basic.json b/tools/perf/pmu-events/arch/s390/cf_z15/basic.json similarity index 100% rename from tools/perf/pmu-events/arch/s390/cf_m8561/basic.json rename to tools/perf/pmu-events/arch/s390/cf_z15/basic.json diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/crypto.json b/tools/perf/pmu-events/arch/s390/cf_z15/crypto.json similarity index 100% rename from tools/perf/pmu-events/arch/s390/cf_m8561/crypto.json rename to tools/perf/pmu-events/arch/s390/cf_z15/crypto.json diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/crypto6.json b/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json similarity index 100% rename from tools/perf/pmu-events/arch/s390/cf_m8561/crypto6.json rename to tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/extended.json b/tools/perf/pmu-events/arch/s390/cf_z15/extended.json similarity index 100% rename from tools/perf/pmu-events/arch/s390/cf_m8561/extended.json rename to tools/perf/pmu-events/arch/s390/cf_z15/extended.json diff --git a/tools/perf/pmu-events/arch/s390/cf_m8561/transaction.json b/tools/perf/pmu-events/arch/s390/cf_z15/transaction.json similarity index 100% rename from tools/perf/pmu-events/arch/s390/cf_m8561/transaction.json rename to tools/perf/pmu-events/arch/s390/cf_z15/transaction.json diff --git a/tools/perf/pmu-events/arch/s390/mapfile.csv b/tools/perf/pmu-events/arch/s390/mapfile.csv index bd3fc577139c..61641a3480e0 100644 --- a/tools/perf/pmu-events/arch/s390/mapfile.csv +++ b/tools/perf/pmu-events/arch/s390/mapfile.csv @@ -4,4 +4,4 @@ Family-model,Version,Filename,EventType ^IBM.282[78].*[13]\.[1-5].[[:xdigit:]]+$,1,cf_zec12,core ^IBM.296[45].*[13]\.[1-5].[[:xdigit:]]+$,1,cf_z13,core ^IBM.390[67].*[13]\.[1-5].[[:xdigit:]]+$,3,cf_z14,core -^IBM.856[12].*3\.6.[[:xdigit:]]+$,3,cf_m8561,core +^IBM.856[12].*3\.6.[[:xdigit:]]+$,3,cf_z15,core From ee212d6ea20887c0ef352be8563ca13dbf965906 Mon Sep 17 00:00:00 2001 From: Steve MacLean Date: Sat, 28 Sep 2019 01:39:00 +0000 Subject: [PATCH 1459/2880] perf map: Fix overlapped map handling Whenever an mmap/mmap2 event occurs, the map tree must be updated to add a new entry. If a new map overlaps a previous map, the overlapped section of the previous map is effectively unmapped, but the non-overlapping sections are still valid. maps__fixup_overlappings() is responsible for creating any new map entries from the previously overlapped map. It optionally creates a before and an after map. When creating the after map the existing code failed to adjust the map.pgoff. This meant the new after map would incorrectly calculate the file offset for the ip. This results in incorrect symbol name resolution for any ip in the after region. Make maps__fixup_overlappings() correctly populate map.pgoff. Add an assert that new mapping matches old mapping at the beginning of the after map. Committer-testing: Validated correct parsing of libcoreclr.so symbols from .NET Core 3.0 preview9 (which didn't strip symbols). Preparation: ~/dotnet3.0-preview9/dotnet new webapi -o perfSymbol cd perfSymbol ~/dotnet3.0-preview9/dotnet publish perf record ~/dotnet3.0-preview9/dotnet \ bin/Debug/netcoreapp3.0/publish/perfSymbol.dll ^C Before: perf script --show-mmap-events 2>&1 | grep -e MMAP -e unknown |\ grep libcoreclr.so | head -n 4 dotnet 1907 373352.698780: PERF_RECORD_MMAP2 1907/1907: \ [0x7fe615726000(0x768000) @ 0 08:02 5510620 765057155]: \ r-xp .../3.0.0-preview9-19423-09/libcoreclr.so dotnet 1907 373352.701091: PERF_RECORD_MMAP2 1907/1907: \ [0x7fe615974000(0x1000) @ 0x24e000 08:02 5510620 765057155]: \ rwxp .../3.0.0-preview9-19423-09/libcoreclr.so dotnet 1907 373352.701241: PERF_RECORD_MMAP2 1907/1907: \ [0x7fe615c42000(0x1000) @ 0x51c000 08:02 5510620 765057155]: \ rwxp .../3.0.0-preview9-19423-09/libcoreclr.so dotnet 1907 373352.705249: 250000 cpu-clock: \ 7fe6159a1f99 [unknown] \ (.../3.0.0-preview9-19423-09/libcoreclr.so) After: perf script --show-mmap-events 2>&1 | grep -e MMAP -e unknown |\ grep libcoreclr.so | head -n 4 dotnet 1907 373352.698780: PERF_RECORD_MMAP2 1907/1907: \ [0x7fe615726000(0x768000) @ 0 08:02 5510620 765057155]: \ r-xp .../3.0.0-preview9-19423-09/libcoreclr.so dotnet 1907 373352.701091: PERF_RECORD_MMAP2 1907/1907: \ [0x7fe615974000(0x1000) @ 0x24e000 08:02 5510620 765057155]: \ rwxp .../3.0.0-preview9-19423-09/libcoreclr.so dotnet 1907 373352.701241: PERF_RECORD_MMAP2 1907/1907: \ [0x7fe615c42000(0x1000) @ 0x51c000 08:02 5510620 765057155]: \ rwxp .../3.0.0-preview9-19423-09/libcoreclr.so All the [unknown] symbols were resolved. Signed-off-by: Steve MacLean Tested-by: Brian Robbins Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Davidlohr Bueso Cc: Eric Saint-Etienne Cc: John Keeping Cc: John Salem Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Tom McDonald Link: http://lore.kernel.org/lkml/BN8PR21MB136270949F22A6A02335C238F7800@BN8PR21MB1362.namprd21.prod.outlook.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 5b83ed1ebbd6..eec9b282c047 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "symbol.h" +#include #include #include #include @@ -850,6 +851,8 @@ static int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp } after->start = map->end; + after->pgoff += map->end - pos->start; + assert(pos->map_ip(pos, map->end) == after->map_ip(after, map->end)); __map_groups__insert(pos->groups, after); if (verbose >= 2 && !use_browser) map__fprintf(after, fp); From b59711e9b0d22fd47abfa00602fd8c365cdd3ab7 Mon Sep 17 00:00:00 2001 From: Steve MacLean Date: Sat, 28 Sep 2019 01:41:18 +0000 Subject: [PATCH 1460/2880] perf inject jit: Fix JIT_CODE_MOVE filename During perf inject --jit, JIT_CODE_MOVE records were injecting MMAP records with an incorrect filename. Specifically it was missing the ".so" suffix. Further the JIT_CODE_LOAD record were silently truncating the jr->load.code_index field to 32 bits before generating the filename. Make both records emit the same filename based on the full 64 bit code_index field. Fixes: 9b07e27f88b9 ("perf inject: Add jitdump mmap injection support") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Steve MacLean Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Brian Robbins Cc: Davidlohr Bueso Cc: Eric Saint-Etienne Cc: John Keeping Cc: John Salem Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Tom McDonald Link: http://lore.kernel.org/lkml/BN8PR21MB1362FF8F127B31DBF4121528F7800@BN8PR21MB1362.namprd21.prod.outlook.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/jitdump.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 1bdf4c6ea3e5..e3ccb0ce1938 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -395,7 +395,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) size_t size; u16 idr_size; const char *sym; - uint32_t count; + uint64_t count; int ret, csize, usize; pid_t pid, tid; struct { @@ -418,7 +418,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) return -1; filename = event->mmap2.filename; - size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%u.so", + size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%" PRIu64 ".so", jd->dir, pid, count); @@ -529,7 +529,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) return -1; filename = event->mmap2.filename; - size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%"PRIu64, + size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%" PRIu64 ".so", jd->dir, pid, jr->move.code_index); From 2657983b4c0d81632c6a73bae469951b0d341251 Mon Sep 17 00:00:00 2001 From: Steve MacLean Date: Sat, 28 Sep 2019 01:53:08 +0000 Subject: [PATCH 1461/2880] perf docs: Correct and clarify jitdump spec Specification claims latest version of jitdump file format is 2. Current jit dump reading code treats 1 as the latest version. Correct spec to match code. The original language made it unclear the value to be written in the magic field. Revise language that the writer always writes the same value. Specify that the reader uses the value to detect endian mismatches. Signed-off-by: Steve MacLean Acked-by: Stephane Eranian Cc: Alexander Shishkin Cc: Andi Kleen Cc: Brian Robbins Cc: Davidlohr Bueso Cc: Eric Saint-Etienne Cc: Jiri Olsa Cc: John Keeping Cc: John Salem Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Tom McDonald Link: http://lore.kernel.org/lkml/BN8PR21MB1362F63CDE7AC69736FC7F9EF7800@BN8PR21MB1362.namprd21.prod.outlook.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/jitdump-specification.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/jitdump-specification.txt b/tools/perf/Documentation/jitdump-specification.txt index 4c62b0713651..52152d156ad9 100644 --- a/tools/perf/Documentation/jitdump-specification.txt +++ b/tools/perf/Documentation/jitdump-specification.txt @@ -36,8 +36,8 @@ III/ Jitdump file header format Each jitdump file starts with a fixed size header containing the following fields in order: -* uint32_t magic : a magic number tagging the file type. The value is 4-byte long and represents the string "JiTD" in ASCII form. It is 0x4A695444 or 0x4454694a depending on the endianness. The field can be used to detect the endianness of the file -* uint32_t version : a 4-byte value representing the format version. It is currently set to 2 +* uint32_t magic : a magic number tagging the file type. The value is 4-byte long and represents the string "JiTD" in ASCII form. It written is as 0x4A695444. The reader will detect an endian mismatch when it reads 0x4454694a. +* uint32_t version : a 4-byte value representing the format version. It is currently set to 1 * uint32_t total_size: size in bytes of file header * uint32_t elf_mach : ELF architecture encoding (ELF e_machine value as specified in /usr/include/elf.h) * uint32_t pad1 : padding. Reserved for future use From e98df280bc2a499fd41d7f9e2d6733884de69902 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 27 Sep 2019 16:35:44 -0700 Subject: [PATCH 1462/2880] perf script brstackinsn: Fix recovery from LBR/binary mismatch When the LBR data and the instructions in a binary do not match the loop printing instructions could get confused and print a long stream of bogus instructions. The problem was that if the instruction decoder cannot decode an instruction it ilen wasn't initialized, so the loop going through the basic block would continue with the previous value. Harden the code to avoid such problems: - Make sure ilen is always freshly initialized and is 0 for bad instructions. - Do not overrun the code buffer while printing instructions - Print a warning message if the final jump is not on an instruction boundary. Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20190927233546.11533-1-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 286fc70d7402..67be8d31afab 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1063,7 +1063,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, continue; insn = 0; - for (off = 0;; off += ilen) { + for (off = 0; off < (unsigned)len; off += ilen) { uint64_t ip = start + off; printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp); @@ -1074,6 +1074,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += print_srccode(thread, x.cpumode, ip); break; } else { + ilen = 0; printed += fprintf(fp, "\t%016" PRIx64 "\t%s\n", ip, dump_insn(&x, ip, buffer + off, len - off, &ilen)); if (ilen == 0) @@ -1083,6 +1084,8 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, insn++; } } + if (off != (unsigned)len) + printed += fprintf(fp, "\tmismatch of LBR data and executable\n"); } /* @@ -1123,6 +1126,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, goto out; } for (off = 0; off <= end - start; off += ilen) { + ilen = 0; printed += fprintf(fp, "\t%016" PRIx64 "\t%s\n", start + off, dump_insn(&x, start + off, buffer + off, len - off, &ilen)); if (ilen == 0) From 6bdfd9f118bd59cf0f85d3bf4b72b586adea17c1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 27 Sep 2019 16:35:45 -0700 Subject: [PATCH 1463/2880] perf jevents: Fix period for Intel fixed counters The Intel fixed counters use a special table to override the JSON information. During this override the period information from the JSON file got dropped, which results in inst_retired.any and similar running with frequency mode instead of a period. Just specify the expected period in the table. Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20190927233546.11533-2-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index 9e37287da924..e2837260ca4d 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -450,12 +450,12 @@ static struct fixed { const char *name; const char *event; } fixed[] = { - { "inst_retired.any", "event=0xc0" }, - { "inst_retired.any_p", "event=0xc0" }, - { "cpu_clk_unhalted.ref", "event=0x0,umask=0x03" }, - { "cpu_clk_unhalted.thread", "event=0x3c" }, - { "cpu_clk_unhalted.core", "event=0x3c" }, - { "cpu_clk_unhalted.thread_any", "event=0x3c,any=1" }, + { "inst_retired.any", "event=0xc0,period=2000003" }, + { "inst_retired.any_p", "event=0xc0,period=2000003" }, + { "cpu_clk_unhalted.ref", "event=0x0,umask=0x03,period=2000003" }, + { "cpu_clk_unhalted.thread", "event=0x3c,period=2000003" }, + { "cpu_clk_unhalted.core", "event=0x3c,period=2000003" }, + { "cpu_clk_unhalted.thread_any", "event=0x3c,any=1,period=2000003" }, { NULL, NULL}, }; From f67001a4a08eb124197ed4376941e1da9cf94b42 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2019 10:55:34 -0300 Subject: [PATCH 1464/2880] perf tools: Propagate get_cpuid() error For consistency, propagate the exact cause for get_cpuid() to have failed. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-9ig269f7ktnhh99g4l15vpu2@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/header.c | 3 ++- tools/perf/arch/s390/util/header.c | 9 +++++---- tools/perf/arch/x86/util/header.c | 3 ++- tools/perf/builtin-kvm.c | 7 ++++--- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/tools/perf/arch/powerpc/util/header.c b/tools/perf/arch/powerpc/util/header.c index b6b7bc7e31a1..3b4cdfc5efd6 100644 --- a/tools/perf/arch/powerpc/util/header.c +++ b/tools/perf/arch/powerpc/util/header.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -30,7 +31,7 @@ get_cpuid(char *buffer, size_t sz) buffer[nb-1] = '\0'; return 0; } - return -1; + return ENOBUFS; } char * diff --git a/tools/perf/arch/s390/util/header.c b/tools/perf/arch/s390/util/header.c index 8b0b018d896a..7933f6871c81 100644 --- a/tools/perf/arch/s390/util/header.c +++ b/tools/perf/arch/s390/util/header.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -54,7 +55,7 @@ int get_cpuid(char *buffer, size_t sz) sysinfo = fopen(SYSINFO, "r"); if (sysinfo == NULL) - return -1; + return errno; while ((read = getline(&line, &line_sz, sysinfo)) != -1) { if (!strncmp(line, SYSINFO_MANU, strlen(SYSINFO_MANU))) { @@ -89,7 +90,7 @@ int get_cpuid(char *buffer, size_t sz) /* Missing manufacturer, type or model information should not happen */ if (!manufacturer[0] || !type[0] || !model[0]) - return -1; + return EINVAL; /* * Scan /proc/service_levels and return the CPU-MF counter facility @@ -133,14 +134,14 @@ skip_sysinfo: else nbytes = snprintf(buffer, sz, "%s,%s,%s", manufacturer, type, model); - return (nbytes >= sz) ? -1 : 0; + return (nbytes >= sz) ? ENOBUFS : 0; } char *get_cpuid_str(struct perf_pmu *pmu __maybe_unused) { char *buf = malloc(128); - if (buf && get_cpuid(buf, 128) < 0) + if (buf && get_cpuid(buf, 128)) zfree(&buf); return buf; } diff --git a/tools/perf/arch/x86/util/header.c b/tools/perf/arch/x86/util/header.c index 662ecf84a421..aa6deb463bf3 100644 --- a/tools/perf/arch/x86/util/header.c +++ b/tools/perf/arch/x86/util/header.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -58,7 +59,7 @@ __get_cpuid(char *buffer, size_t sz, const char *fmt) buffer[nb-1] = '\0'; return 0; } - return -1; + return ENOBUFS; } int diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 2227e2f42c09..58a9e0989491 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -705,14 +705,15 @@ static int process_sample_event(struct perf_tool *tool, static int cpu_isa_config(struct perf_kvm_stat *kvm) { - char buf[64], *cpuid; + char buf[128], *cpuid; int err; if (kvm->live) { err = get_cpuid(buf, sizeof(buf)); if (err != 0) { - pr_err("Failed to look up CPU type\n"); - return err; + pr_err("Failed to look up CPU type: %s\n", + str_error_r(err, buf, sizeof(buf))); + return -err; } cpuid = buf; } else From 9db0e3635fb35b6695275774ab909c51221b66ad Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2019 11:48:32 -0300 Subject: [PATCH 1465/2880] perf evsel: Fall back to global 'perf_env' in perf_evsel__env() I.e. if evsel->evlist or evsel->evlist->env isn't set, return the environment for the running machine, as that would be set if reading from a perf.data file. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-uqq4grmhbi12rwb0lfpo6lfu@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 3 ++- tools/perf/util/python.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 5591af81a070..abc7fda4a0fe 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -30,6 +30,7 @@ #include "counts.h" #include "event.h" #include "evsel.h" +#include "util/env.h" #include "util/evsel_config.h" #include "util/evsel_fprintf.h" #include "evlist.h" @@ -2512,7 +2513,7 @@ struct perf_env *perf_evsel__env(struct evsel *evsel) { if (evsel && evsel->evlist) return evsel->evlist->env; - return NULL; + return &perf_env; } static int store_evsel_ids(struct evsel *evsel, struct evlist *evlist) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 53f31053a27a..02460362256d 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -14,6 +14,7 @@ #include "thread_map.h" #include "trace-event.h" #include "mmap.h" +#include "util/env.h" #include #include "../perf-sys.h" @@ -53,6 +54,11 @@ int parse_callchain_record(const char *arg __maybe_unused, return 0; } +/* + * Add this one here not to drag util/env.c + */ +struct perf_env perf_env; + /* * Support debug printing even though util/debug.c is not linked. That means * implementing 'verbose' and 'eprintf'. From a66fa0619a0ae3585ef09e9c33ecfb5c7c6cb72b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2019 15:06:01 -0300 Subject: [PATCH 1466/2880] perf annotate: Propagate perf_env__arch() error The callers of symbol__annotate2() use symbol__strerror_disassemble() to convert its failure returns into a human readable string, so propagate error values from functions it calls, starting with perf_env__arch() that when fails the right thing to do is to look at 'errno' to see why its possible call to uname() failed. Reported-by: Russell King - ARM Linux admin Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra , Cc: Will Deacon Link: https://lkml.kernel.org/n/tip-it5d83kyusfhb1q1b0l4pxzs@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index e830eadfca2a..9b7b9176e713 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2071,7 +2071,7 @@ int symbol__annotate(struct symbol *sym, struct map *map, int err; if (!arch_name) - return -1; + return errno; args.arch = arch = arch__find(arch_name); if (arch == NULL) From 28f4417c3333940b242af03d90214f713bbef232 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2019 15:11:47 -0300 Subject: [PATCH 1467/2880] perf annotate: Fix the signedness of failure returns Callers of symbol__annotate() expect a errno value or some other extended error value range in symbol__strerror_disassemble() to convert to a proper error string, fix it when propagating a failure to find the arch specific annotation routines via arch__find(arch_name). Reported-by: Russell King - ARM Linux admin Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra , Cc: Will Deacon Link: https://lkml.kernel.org/n/tip-o0k6dw7cas0vvmjjvgsyvu1i@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 9b7b9176e713..95109acf4808 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2075,7 +2075,7 @@ int symbol__annotate(struct symbol *sym, struct map *map, args.arch = arch = arch__find(arch_name); if (arch == NULL) - return -ENOTSUP; + return ENOTSUP; if (parch) *parch = arch; From 211f493b611eef012841f795166c38ec7528738d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2019 15:44:13 -0300 Subject: [PATCH 1468/2880] perf annotate: Propagate the symbol__annotate() error return We were just returning -1 in symbol__annotate() when symbol__annotate() failed, propagate its error as it is used later to pass to symbol__strerror_disassemble() to present a error message to the user, that in some cases were getting: "Invalid -1 error code" Fix it to propagate the error. Reported-by: Russell King - ARM Linux admin Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra , Cc: Will Deacon Link: https://lkml.kernel.org/n/tip-0tj89rs9g7nbcyd5skadlvuu@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 95109acf4808..1de1a7091c48 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2997,7 +2997,7 @@ int symbol__annotate2(struct symbol *sym, struct map *map, struct evsel *evsel, out_free_offsets: zfree(¬es->offsets); - return -1; + return err; } #define ANNOTATION__CFG(n) \ From 42d7a9107d83223a5fcecc6732d626a6c074cbc2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2019 15:48:12 -0300 Subject: [PATCH 1469/2880] perf annotate: Fix arch specific ->init() failure errors They are called from symbol__annotate() and to propagate errors that can help understand the problem make them return what symbol__strerror_disassemble() known, i.e. errno codes and other annotation specific errors in a special, out of errnos, range. Reported-by: Russell King - ARM Linux admin Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra , Cc: Will Deacon Link: https://lkml.kernel.org/n/tip-pqx7srcv7tixgid251aeboj6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/annotate/instructions.c | 4 ++-- tools/perf/arch/arm64/annotate/instructions.c | 4 ++-- tools/perf/arch/s390/annotate/instructions.c | 6 ++++-- tools/perf/arch/x86/annotate/instructions.c | 6 ++++-- tools/perf/util/annotate.c | 6 ++++++ tools/perf/util/annotate.h | 2 ++ 6 files changed, 20 insertions(+), 8 deletions(-) diff --git a/tools/perf/arch/arm/annotate/instructions.c b/tools/perf/arch/arm/annotate/instructions.c index e1d4b484cc4b..2ff6cedeb9c5 100644 --- a/tools/perf/arch/arm/annotate/instructions.c +++ b/tools/perf/arch/arm/annotate/instructions.c @@ -37,7 +37,7 @@ static int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused) arm = zalloc(sizeof(*arm)); if (!arm) - return -1; + return ENOMEM; #define ARM_CONDS "(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl|vc|vs)" err = regcomp(&arm->call_insn, "^blx?" ARM_CONDS "?$", REG_EXTENDED); @@ -59,5 +59,5 @@ out_free_call: regfree(&arm->call_insn); out_free_arm: free(arm); - return -1; + return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; } diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c index 43aa93ed8414..037e292ecd8e 100644 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ b/tools/perf/arch/arm64/annotate/instructions.c @@ -95,7 +95,7 @@ static int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused) arm = zalloc(sizeof(*arm)); if (!arm) - return -1; + return ENOMEM; /* bl, blr */ err = regcomp(&arm->call_insn, "^blr?$", REG_EXTENDED); @@ -118,5 +118,5 @@ out_free_call: regfree(&arm->call_insn); out_free_arm: free(arm); - return -1; + return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP; } diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c index 89bb8f2c54ce..a50e70baf918 100644 --- a/tools/perf/arch/s390/annotate/instructions.c +++ b/tools/perf/arch/s390/annotate/instructions.c @@ -164,8 +164,10 @@ static int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused) if (!arch->initialized) { arch->initialized = true; arch->associate_instruction_ops = s390__associate_ins_ops; - if (cpuid) - err = s390__cpuid_parse(arch, cpuid); + if (cpuid) { + if (s390__cpuid_parse(arch, cpuid)) + err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; + } } return err; diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 44f5aba78210..7eb5621c021d 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -196,8 +196,10 @@ static int x86__annotate_init(struct arch *arch, char *cpuid) if (arch->initialized) return 0; - if (cpuid) - err = x86__cpuid_parse(arch, cpuid); + if (cpuid) { + if (x86__cpuid_parse(arch, cpuid)) + err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING; + } arch->initialized = true; return err; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 1de1a7091c48..dc15352924f9 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1631,6 +1631,12 @@ int symbol__strerror_disassemble(struct symbol *sym __maybe_unused, struct map * case SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF: scnprintf(buf, buflen, "Please link with binutils's libopcode to enable BPF annotation"); break; + case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP: + scnprintf(buf, buflen, "Problems with arch specific instruction name regular expressions."); + break; + case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING: + scnprintf(buf, buflen, "Problems while parsing the CPUID in the arch specific initialization."); + break; default: scnprintf(buf, buflen, "Internal error: Invalid %d error code\n", errnum); break; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index d94be9140e31..116e21f49da6 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -370,6 +370,8 @@ enum symbol_disassemble_errno { SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX = __SYMBOL_ANNOTATE_ERRNO__START, SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF, + SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING, + SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP, __SYMBOL_ANNOTATE_ERRNO__END, }; From 16ed3c1e91159e28b02f11f71ff4ce4cbc6f99e4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2019 15:53:33 -0300 Subject: [PATCH 1470/2880] perf annotate: Return appropriate error code for allocation failures We should return errno or the annotation extra range understood by symbol__strerror_disassemble() instead of -1, fix it, returning ENOMEM instead. Reported-by: Russell King - ARM Linux admin Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra , Cc: Will Deacon Link: https://lkml.kernel.org/n/tip-8of1cmj3rz0mppfcshc9bbqq@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index dc15352924f9..b49ecdd51188 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1668,7 +1668,7 @@ static int dso__disassemble_filename(struct dso *dso, char *filename, size_t fil build_id_path = strdup(filename); if (!build_id_path) - return -1; + return ENOMEM; /* * old style build-id cache has name of XX/XXXXXXX.. while @@ -2977,7 +2977,7 @@ int symbol__annotate2(struct symbol *sym, struct map *map, struct evsel *evsel, notes->offsets = zalloc(size * sizeof(struct annotation_line *)); if (notes->offsets == NULL) - return -1; + return ENOMEM; if (perf_evsel__is_group_event(evsel)) nr_pcnt = evsel->core.nr_members; From 11aad897f6d1a28eae3b7e5b293647c522d65819 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2019 16:04:21 -0300 Subject: [PATCH 1471/2880] perf annotate: Don't return -1 for error when doing BPF disassembly Return errno when open_memstream() fails and add two new speciall error codes for when an invalid, non BPF file or one without BTF is passed to symbol__disassemble_bpf(), so that its callers can rely on symbol__strerror_disassemble() to convert that to a human readable error message that can help figure out what is wrong, with hints even. Cc: Russell King - ARM Linux admin Cc: Song Liu Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra , Cc: Will Deacon Link: https://lkml.kernel.org/n/tip-usevw9r2gcipfcrbpaueurw0@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 19 +++++++++++++++---- tools/perf/util/annotate.h | 2 ++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index b49ecdd51188..4036c7f7b0fb 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1637,6 +1637,13 @@ int symbol__strerror_disassemble(struct symbol *sym __maybe_unused, struct map * case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING: scnprintf(buf, buflen, "Problems while parsing the CPUID in the arch specific initialization."); break; + case SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE: + scnprintf(buf, buflen, "Invalid BPF file: %s.", dso->long_name); + break; + case SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF: + scnprintf(buf, buflen, "The %s BPF file has no BTF section, compile with -g or use pahole -J.", + dso->long_name); + break; default: scnprintf(buf, buflen, "Internal error: Invalid %d error code\n", errnum); break; @@ -1719,13 +1726,13 @@ static int symbol__disassemble_bpf(struct symbol *sym, char tpath[PATH_MAX]; size_t buf_size; int nr_skip = 0; - int ret = -1; char *buf; bfd *bfdf; + int ret; FILE *s; if (dso->binary_type != DSO_BINARY_TYPE__BPF_PROG_INFO) - return -1; + return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, sym->name, sym->start, sym->end - sym->start); @@ -1738,8 +1745,10 @@ static int symbol__disassemble_bpf(struct symbol *sym, assert(bfd_check_format(bfdf, bfd_object)); s = open_memstream(&buf, &buf_size); - if (!s) + if (!s) { + ret = errno; goto out; + } init_disassemble_info(&info, s, (fprintf_ftype) fprintf); @@ -1748,8 +1757,10 @@ static int symbol__disassemble_bpf(struct symbol *sym, info_node = perf_env__find_bpf_prog_info(dso->bpf_prog.env, dso->bpf_prog.id); - if (!info_node) + if (!info_node) { + return SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; goto out; + } info_linear = info_node->info_linear; sub_id = dso->bpf_prog.sub_id; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 116e21f49da6..d76fd0e81f46 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -372,6 +372,8 @@ enum symbol_disassemble_errno { SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF, SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING, SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP, + SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE, + SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF, __SYMBOL_ANNOTATE_ERRNO__END, }; From 61129dd29f7962f278b618a2a3e8fdb986a66dc8 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 17 Sep 2019 09:18:53 +0200 Subject: [PATCH 1472/2880] sched: Add __ASSEMBLY__ guards around struct clone_args The addition of struct clone_args to uapi/linux/sched.h is not protected by __ASSEMBLY__ guards, causing a failure to build from source for glibc on RISC-V. Add the guards to fix this. Fixes: 7f192e3cd316 ("fork: add clone3") Signed-off-by: Seth Forshee Cc: Acked-by: Ingo Molnar Link: https://lore.kernel.org/r/20190917071853.12385-1-seth.forshee@canonical.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index b3105ac1381a..851ff1feadd5 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -33,6 +33,7 @@ #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */ +#ifndef __ASSEMBLY__ /* * Arguments for the clone3 syscall */ @@ -46,6 +47,7 @@ struct clone_args { __aligned_u64 stack_size; __aligned_u64 tls; }; +#endif /* * Scheduling policies From 3969e76909d3aa06715997896184ee684f68d164 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 24 Sep 2019 13:52:37 -0600 Subject: [PATCH 1473/2880] selftests: pidfd: Fix undefined reference to pthread_create() Fix build failure: undefined reference to `pthread_create' collect2: error: ld returned 1 exit status Fix CFLAGS to include pthread correctly. Fixes: 740378dc7834 ("pidfd: add polling selftests") Signed-off-by: Shuah Khan Reviewed-by: Christian Brauner Cc: Link: https://lore.kernel.org/r/20190924195237.30519-1-skhan@linuxfoundation.org Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 464c9b76148f..7550f08822a3 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -g -I../../../../usr/include/ -lpthread +CFLAGS += -g -I../../../../usr/include/ -pthread TEST_GEN_PROGS := pidfd_test pidfd_open_test pidfd_poll_test pidfd_wait From 517d6b9c6f71bee15c729155445c42cc4ef77dda Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 18 Sep 2019 08:30:33 +0000 Subject: [PATCH 1474/2880] erofs: fix return value check in erofs_read_superblock() In case of error, the function read_mapping_page() returns ERR_PTR() not NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: fe7c2423570d ("erofs: use read_mapping_page instead of sb_bread") Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Wei Yongjun Link: https://lore.kernel.org/r/20190918083033.47780-1-weiyongjun1@huawei.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index caf9a95173b0..0e369494f2f2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -105,9 +105,9 @@ static int erofs_read_superblock(struct super_block *sb) int ret; page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL); - if (!page) { + if (IS_ERR(page)) { erofs_err(sb, "cannot read erofs superblock"); - return -EIO; + return PTR_ERR(page); } sbi = EROFS_SB(sb); From 6927cc05c2b067d4ca8ab6aeaa3b4544bea26ef8 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 19 Sep 2019 14:28:38 +0800 Subject: [PATCH 1475/2880] MAINTAINERS: erofs: complete sub-entries for erofs Add a formal git tree and missing files for erofs after moving out of staging for everyone to blame in order for better improvement. Acked-by: Chao Yu Link: https://lore.kernel.org/r/20190919062838.106423-1-gaoxiang25@huawei.com Signed-off-by: Gao Xiang --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 296de2b51c83..30a5b4028d2f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6112,7 +6112,10 @@ M: Gao Xiang M: Chao Yu L: linux-erofs@lists.ozlabs.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git +F: Documentation/filesystems/erofs.txt F: fs/erofs/ +F: include/trace/events/erofs.h ERRSEQ ERROR TRACKING INFRASTRUCTURE M: Jeff Layton From 55252ab72b774119afedfdc6d1f142ffa2a9b818 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 22 Sep 2019 02:43:55 +0800 Subject: [PATCH 1476/2880] erofs: fix erofs_get_meta_page locking due to a cleanup After doing more drop_caches stress test on our products, I found the mistake introduced by a very recent cleanup [1]. The current rule is that "erofs_get_meta_page" should be returned with page locked (although it's mostly unnecessary for read-only fs after pages are PG_uptodate), but a fix should be done for this. [1] https://lore.kernel.org/r/20190904020912.63925-26-gaoxiang25@huawei.com Fixes: 618f40ea026b ("erofs: use read_cache_page_gfp for erofs_get_meta_page") Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20190921184355.149928-1-gaoxiang25@huawei.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8a9fcbd0e8ac..fc3a8d8064f8 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -34,11 +34,15 @@ static void erofs_readendio(struct bio *bio) struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) { - struct inode *const bd_inode = sb->s_bdev->bd_inode; - struct address_space *const mapping = bd_inode->i_mapping; + struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; + struct page *page; - return read_cache_page_gfp(mapping, blkaddr, + page = read_cache_page_gfp(mapping, blkaddr, mapping_gfp_constraint(mapping, ~__GFP_FS)); + /* should already be PageUptodate */ + if (!IS_ERR(page)) + lock_page(page); + return page; } static int erofs_map_blocks_flatmode(struct inode *inode, From dc76ea8c1087b5c44235566ed4be2202d21a8504 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 22 Sep 2019 18:04:34 +0800 Subject: [PATCH 1477/2880] erofs: fix mis-inplace determination related with noio chain Fix a recent cleanup patch. noio (bypass) chain is handled asynchronously against submit chain, therefore inplace I/O or pagevec cannot be applied to such pages. Add detailed comment for this as well. Fixes: 97e86a858bc3 ("staging: erofs: tidy up decompression frontend") Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20190922100434.229340-1-gaoxiang25@huawei.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 96e34c90f814..fad80c97d247 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -575,7 +575,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; struct z_erofs_collector *const clt = &fe->clt; const loff_t offset = page_offset(page); - bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED); + bool tight = true; enum z_erofs_cache_alloctype cache_strategy; enum z_erofs_page_type page_type; @@ -628,8 +628,16 @@ restart_now: preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy, pagepool); - tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED); hitted: + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or pagevec (should be processed in strict order.) + */ + tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED && + clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); From 2b6509f42dc6c96ef0b625888d4aa56d2e140330 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 27 Sep 2019 18:27:42 +0800 Subject: [PATCH 1478/2880] MIPS: Loongson64: Fix boot failure after dropping boot_mem_map From commit a94e4f24ec83 ("MIPS: init: Drop boot_mem_map") onwards, add_memory_region() is handled by memblock_add()/memblock_reserve() directly and all bootmem API should be converted to memblock API. Otherwise it will lead to boot failure, especially in the NUMA case because add_memory_region lose the node_id information. Fixes: a94e4f24ec836c8984f83959 ("MIPS: init: Drop boot_mem_map") Signed-off-by: Huacai Chen Signed-off-by: Jiaxun Yang [paul.burton@mips.com: - Invert node_id check to de-indent the switch statement & avoid lines over 80 characters. - Fixup commit reference in commit message.] Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-mips@vger.kernel.org Cc: Fuxin Zhang Cc: Zhangjin Wu Cc: Huacai Chen --- arch/mips/loongson64/common/mem.c | 35 +++++++++++++------------- arch/mips/loongson64/loongson-3/numa.c | 11 +------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/arch/mips/loongson64/common/mem.c b/arch/mips/loongson64/common/mem.c index 4abb92e0fc39..4254ac4ec616 100644 --- a/arch/mips/loongson64/common/mem.c +++ b/arch/mips/loongson64/common/mem.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include @@ -64,24 +65,22 @@ void __init prom_init_memory(void) node_id = loongson_memmap->map[i].node_id; mem_type = loongson_memmap->map[i].mem_type; - if (node_id == 0) { - switch (mem_type) { - case SYSTEM_RAM_LOW: - add_memory_region(loongson_memmap->map[i].mem_start, - (u64)loongson_memmap->map[i].mem_size << 20, - BOOT_MEM_RAM); - break; - case SYSTEM_RAM_HIGH: - add_memory_region(loongson_memmap->map[i].mem_start, - (u64)loongson_memmap->map[i].mem_size << 20, - BOOT_MEM_RAM); - break; - case SYSTEM_RAM_RESERVED: - add_memory_region(loongson_memmap->map[i].mem_start, - (u64)loongson_memmap->map[i].mem_size << 20, - BOOT_MEM_RESERVED); - break; - } + if (node_id != 0) + continue; + + switch (mem_type) { + case SYSTEM_RAM_LOW: + memblock_add(loongson_memmap->map[i].mem_start, + (u64)loongson_memmap->map[i].mem_size << 20); + break; + case SYSTEM_RAM_HIGH: + memblock_add(loongson_memmap->map[i].mem_start, + (u64)loongson_memmap->map[i].mem_size << 20); + break; + case SYSTEM_RAM_RESERVED: + memblock_reserve(loongson_memmap->map[i].mem_start, + (u64)loongson_memmap->map[i].mem_size << 20); + break; } } } diff --git a/arch/mips/loongson64/loongson-3/numa.c b/arch/mips/loongson64/loongson-3/numa.c index 414e97de5dc0..8f20d2cb3767 100644 --- a/arch/mips/loongson64/loongson-3/numa.c +++ b/arch/mips/loongson64/loongson-3/numa.c @@ -142,8 +142,6 @@ static void __init szmem(unsigned int node) (u32)node_id, mem_type, mem_start, mem_size); pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", start_pfn, end_pfn, num_physpages); - add_memory_region((node_id << 44) + mem_start, - (u64)mem_size << 20, BOOT_MEM_RAM); memblock_add_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), node); break; @@ -156,16 +154,12 @@ static void __init szmem(unsigned int node) (u32)node_id, mem_type, mem_start, mem_size); pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", start_pfn, end_pfn, num_physpages); - add_memory_region((node_id << 44) + mem_start, - (u64)mem_size << 20, BOOT_MEM_RAM); memblock_add_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), node); break; case SYSTEM_RAM_RESERVED: pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx MB\n", (u32)node_id, mem_type, mem_start, mem_size); - add_memory_region((node_id << 44) + mem_start, - (u64)mem_size << 20, BOOT_MEM_RESERVED); memblock_reserve(((node_id << 44) + mem_start), mem_size << 20); break; @@ -191,8 +185,6 @@ static void __init node_mem_init(unsigned int node) NODE_DATA(node)->node_start_pfn = start_pfn; NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn; - free_bootmem_with_active_regions(node, end_pfn); - if (node == 0) { /* kernel end address */ unsigned long kernel_end_pfn = PFN_UP(__pa_symbol(&_end)); @@ -209,8 +201,6 @@ static void __init node_mem_init(unsigned int node) memblock_reserve((node_addrspace_offset | 0xfe000000), 32 << 20); } - - sparse_memory_present_with_active_regions(node); } static __init void prom_meminit(void) @@ -227,6 +217,7 @@ static __init void prom_meminit(void) cpumask_clear(&__node_data[(node)]->cpumask); } } + memblocks_present(); max_low_pfn = PHYS_PFN(memblock_end_of_DRAM()); for (cpu = 0; cpu < loongson_sysconf.nr_cpus; cpu++) { From 1bd63524593b964934a33afd442df16b8f90e2b5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 30 Sep 2019 14:02:03 -0700 Subject: [PATCH 1479/2880] libbpf: handle symbol versioning properly for libbpf.a bcc uses libbpf repo as a submodule. It brings in libbpf source code and builds everything together to produce shared libraries. With latest libbpf, I got the following errors: /bin/ld: libbcc_bpf.so.0.10.0: version node not found for symbol xsk_umem__create@LIBBPF_0.0.2 /bin/ld: failed to set dynamic section sizes: Bad value collect2: error: ld returned 1 exit status make[2]: *** [src/cc/libbcc_bpf.so.0.10.0] Error 1 In xsk.c, we have asm(".symver xsk_umem__create_v0_0_2, xsk_umem__create@LIBBPF_0.0.2"); asm(".symver xsk_umem__create_v0_0_4, xsk_umem__create@@LIBBPF_0.0.4"); The linker thinks the built is for LIBBPF but cannot find proper version LIBBPF_0.0.2/4, so emit errors. I also confirmed that using libbpf.a to produce a shared library also has issues: -bash-4.4$ cat t.c extern void *xsk_umem__create; void * test() { return xsk_umem__create; } -bash-4.4$ gcc -c -fPIC t.c -bash-4.4$ gcc -shared t.o libbpf.a -o t.so /bin/ld: t.so: version node not found for symbol xsk_umem__create@LIBBPF_0.0.2 /bin/ld: failed to set dynamic section sizes: Bad value collect2: error: ld returned 1 exit status -bash-4.4$ Symbol versioning does happens in commonly used libraries, e.g., elfutils and glibc. For static libraries, for a versioned symbol, the old definitions will be ignored, and the symbol will be an alias to the latest definition. For example, glibc sched_setaffinity is versioned. -bash-4.4$ readelf -s /usr/lib64/libc.so.6 | grep sched_setaffinity 756: 000000000013d3d0 13 FUNC GLOBAL DEFAULT 13 sched_setaffinity@GLIBC_2.3.3 757: 00000000000e2e70 455 FUNC GLOBAL DEFAULT 13 sched_setaffinity@@GLIBC_2.3.4 1800: 0000000000000000 0 FILE LOCAL DEFAULT ABS sched_setaffinity.c 4228: 00000000000e2e70 455 FUNC LOCAL DEFAULT 13 __sched_setaffinity_new 4648: 000000000013d3d0 13 FUNC LOCAL DEFAULT 13 __sched_setaffinity_old 7338: 000000000013d3d0 13 FUNC GLOBAL DEFAULT 13 sched_setaffinity@GLIBC_2 7380: 00000000000e2e70 455 FUNC GLOBAL DEFAULT 13 sched_setaffinity@@GLIBC_ -bash-4.4$ For static library, the definition of sched_setaffinity aliases to the new definition. -bash-4.4$ readelf -s /usr/lib64/libc.a | grep sched_setaffinity File: /usr/lib64/libc.a(sched_setaffinity.o) 8: 0000000000000000 455 FUNC GLOBAL DEFAULT 1 __sched_setaffinity_new 12: 0000000000000000 455 FUNC WEAK DEFAULT 1 sched_setaffinity For both elfutils and glibc, additional macros are used to control different handling of symbol versioning w.r.t static and shared libraries. For elfutils, the macro is SYMBOL_VERSIONING (https://sourceware.org/git/?p=elfutils.git;a=blob;f=lib/eu-config.h). For glibc, the macro is SHARED (https://sourceware.org/git/?p=glibc.git;a=blob;f=include/shlib-compat.h;hb=refs/heads/master) This patch used SHARED as the macro name. After this patch, the libbpf.a has -bash-4.4$ readelf -s libbpf.a | grep xsk_umem__create 372: 0000000000017145 1190 FUNC GLOBAL DEFAULT 1 xsk_umem__create_v0_0_4 405: 0000000000017145 1190 FUNC GLOBAL DEFAULT 1 xsk_umem__create 499: 00000000000175eb 103 FUNC GLOBAL DEFAULT 1 xsk_umem__create_v0_0_2 -bash-4.4$ No versioned symbols for xsk_umem__create. The libbpf.a can be used to build a shared library succesfully. -bash-4.4$ cat t.c extern void *xsk_umem__create; void * test() { return xsk_umem__create; } -bash-4.4$ gcc -c -fPIC t.c -bash-4.4$ gcc -shared t.o libbpf.a -o t.so -bash-4.4$ Fixes: 10d30e301732 ("libbpf: add flags to umem config") Cc: Kevin Laatz Cc: Arnaldo Carvalho de Melo Cc: Andrii Nakryiko Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Makefile | 27 ++++++++++++++++++--------- tools/lib/bpf/libbpf_internal.h | 16 ++++++++++++++++ tools/lib/bpf/xsk.c | 4 ++-- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 20772663d3e1..56ce6292071b 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -114,6 +114,9 @@ override CFLAGS += $(INCLUDES) override CFLAGS += -fvisibility=hidden override CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 +# flags specific for shared library +SHLIB_FLAGS := -DSHARED + ifeq ($(VERBOSE),1) Q = else @@ -130,14 +133,17 @@ all: export srctree OUTPUT CC LD CFLAGS V include $(srctree)/tools/build/Makefile.include -BPF_IN := $(OUTPUT)libbpf-in.o +SHARED_OBJDIR := $(OUTPUT)sharedobjs/ +STATIC_OBJDIR := $(OUTPUT)staticobjs/ +BPF_IN_SHARED := $(SHARED_OBJDIR)libbpf-in.o +BPF_IN_STATIC := $(STATIC_OBJDIR)libbpf-in.o VERSION_SCRIPT := libbpf.map LIB_TARGET := $(addprefix $(OUTPUT),$(LIB_TARGET)) LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) PC_FILE := $(addprefix $(OUTPUT),$(PC_FILE)) -GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN) | \ +GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN_SHARED) | \ cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}' | \ sort -u | wc -l) @@ -159,7 +165,7 @@ all: fixdep all_cmd: $(CMD_TARGETS) check -$(BPF_IN): force elfdep bpfdep +$(BPF_IN_SHARED): force elfdep bpfdep @(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \ (diff -B ../../include/uapi/linux/bpf.h ../../../include/uapi/linux/bpf.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'" >&2 )) || true @@ -175,17 +181,20 @@ $(BPF_IN): force elfdep bpfdep @(test -f ../../include/uapi/linux/if_xdp.h -a -f ../../../include/uapi/linux/if_xdp.h && ( \ (diff -B ../../include/uapi/linux/if_xdp.h ../../../include/uapi/linux/if_xdp.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true - $(Q)$(MAKE) $(build)=libbpf + $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(SHARED_OBJDIR) CFLAGS="$(CFLAGS) $(SHLIB_FLAGS)" + +$(BPF_IN_STATIC): force elfdep bpfdep + $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) -$(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN) +$(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN_SHARED) $(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \ -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -o $@ @ln -sf $(@F) $(OUTPUT)libbpf.so @ln -sf $(@F) $(OUTPUT)libbpf.so.$(LIBBPF_MAJOR_VERSION) -$(OUTPUT)libbpf.a: $(BPF_IN) +$(OUTPUT)libbpf.a: $(BPF_IN_STATIC) $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^ $(OUTPUT)test_libbpf: test_libbpf.cpp $(OUTPUT)libbpf.a @@ -201,7 +210,7 @@ check: check_abi check_abi: $(OUTPUT)libbpf.so @if [ "$(GLOBAL_SYM_COUNT)" != "$(VERSIONED_SYM_COUNT)" ]; then \ - echo "Warning: Num of global symbols in $(BPF_IN)" \ + echo "Warning: Num of global symbols in $(BPF_IN_SHARED)" \ "($(GLOBAL_SYM_COUNT)) does NOT match with num of" \ "versioned symbols in $^ ($(VERSIONED_SYM_COUNT))." \ "Please make sure all LIBBPF_API symbols are" \ @@ -259,9 +268,9 @@ config-clean: $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null clean: - $(call QUIET_CLEAN, libbpf) $(RM) $(TARGETS) $(CXX_TEST_TARGET) \ + $(call QUIET_CLEAN, libbpf) $(RM) -rf $(TARGETS) $(CXX_TEST_TARGET) \ *.o *~ *.a *.so *.so.$(LIBBPF_MAJOR_VERSION) .*.d .*.cmd \ - *.pc LIBBPF-CFLAGS + *.pc LIBBPF-CFLAGS $(SHARED_OBJDIR) $(STATIC_OBJDIR) $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 2e83a34f8c79..98216a69c32f 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -34,6 +34,22 @@ (offsetof(TYPE, FIELD) + sizeof(((TYPE *)0)->FIELD)) #endif +/* Symbol versioning is different between static and shared library. + * Properly versioned symbols are needed for shared library, but + * only the symbol of the new version is needed for static library. + */ +#ifdef SHARED +# define COMPAT_VERSION(internal_name, api_name, version) \ + asm(".symver " #internal_name "," #api_name "@" #version); +# define DEFAULT_VERSION(internal_name, api_name, version) \ + asm(".symver " #internal_name "," #api_name "@@" #version); +#else +# define COMPAT_VERSION(internal_name, api_name, version) +# define DEFAULT_VERSION(internal_name, api_name, version) \ + extern typeof(internal_name) api_name \ + __attribute__((alias(#internal_name))); +#endif + extern void libbpf_print(enum libbpf_print_level level, const char *format, ...) __attribute__((format(printf, 2, 3))); diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 24fa313524fb..a902838f9fcc 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -261,8 +261,8 @@ int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area, return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp, &config); } -asm(".symver xsk_umem__create_v0_0_2, xsk_umem__create@LIBBPF_0.0.2"); -asm(".symver xsk_umem__create_v0_0_4, xsk_umem__create@@LIBBPF_0.0.4"); +COMPAT_VERSION(xsk_umem__create_v0_0_2, xsk_umem__create, LIBBPF_0.0.2) +DEFAULT_VERSION(xsk_umem__create_v0_0_4, xsk_umem__create, LIBBPF_0.0.4) static int xsk_load_xdp_prog(struct xsk_socket *xsk) { From 0889d07f3e4b171c453b2aaf2b257f9074cdf624 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 30 Sep 2019 11:39:52 +0200 Subject: [PATCH 1480/2880] MIPS: dts: ar9331: fix interrupt-controller size It is two registers each of 4 byte. Signed-off-by: Oleksij Rempel Signed-off-by: Paul Burton Cc: Rob Herring Cc: Mark Rutland Cc: Pengutronix Kernel Team Cc: Ralf Baechle Cc: James Hogan Cc: devicetree@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/boot/dts/qca/ar9331.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/boot/dts/qca/ar9331.dtsi b/arch/mips/boot/dts/qca/ar9331.dtsi index 63a9f33aa43e..5cfc9d347826 100644 --- a/arch/mips/boot/dts/qca/ar9331.dtsi +++ b/arch/mips/boot/dts/qca/ar9331.dtsi @@ -99,7 +99,7 @@ miscintc: interrupt-controller@18060010 { compatible = "qca,ar7240-misc-intc"; - reg = <0x18060010 0x4>; + reg = <0x18060010 0x8>; interrupt-parent = <&cpuintc>; interrupts = <6>; From 965f6603e3335a953f4f876792074cb36bf65f7f Mon Sep 17 00:00:00 2001 From: Rayagonda Kokatanur Date: Mon, 9 Sep 2019 14:05:27 +0530 Subject: [PATCH 1481/2880] arm64: dts: Fix gpio to pinmux mapping There are total of 151 non-secure gpio (0-150) and four pins of pinmux (91, 92, 93 and 94) are not mapped to any gpio pin, hence update same in DT. Fixes: 8aa428cc1e2e ("arm64: dts: Add pinctrl DT nodes for Stingray SOC") Signed-off-by: Rayagonda Kokatanur Reviewed-by: Ray Jui Signed-off-by: Florian Fainelli --- arch/arm64/boot/dts/broadcom/stingray/stingray-pinctrl.dtsi | 5 +++-- arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/broadcom/stingray/stingray-pinctrl.dtsi b/arch/arm64/boot/dts/broadcom/stingray/stingray-pinctrl.dtsi index 8a3a770e8f2c..56789ccf9454 100644 --- a/arch/arm64/boot/dts/broadcom/stingray/stingray-pinctrl.dtsi +++ b/arch/arm64/boot/dts/broadcom/stingray/stingray-pinctrl.dtsi @@ -42,13 +42,14 @@ pinmux: pinmux@14029c { compatible = "pinctrl-single"; - reg = <0x0014029c 0x250>; + reg = <0x0014029c 0x26c>; #address-cells = <1>; #size-cells = <1>; pinctrl-single,register-width = <32>; pinctrl-single,function-mask = <0xf>; pinctrl-single,gpio-range = < - &range 0 154 MODE_GPIO + &range 0 91 MODE_GPIO + &range 95 60 MODE_GPIO >; range: gpio-range { #pinctrl-single,gpio-range-cells = <3>; diff --git a/arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi b/arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi index 71e2e34400d4..0098dfdef96c 100644 --- a/arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi +++ b/arch/arm64/boot/dts/broadcom/stingray/stingray.dtsi @@ -464,8 +464,7 @@ <&pinmux 108 16 27>, <&pinmux 135 77 6>, <&pinmux 141 67 4>, - <&pinmux 145 149 6>, - <&pinmux 151 91 4>; + <&pinmux 145 149 6>; }; i2c1: i2c@e0000 { From 8c7138b33e5c690c308b2a7085f6313fdcb3f616 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 27 Sep 2019 16:00:31 -0700 Subject: [PATCH 1482/2880] net: Unpublish sk from sk_reuseport_cb before call_rcu The "reuse->sock[]" array is shared by multiple sockets. The going away sk must unpublish itself from "reuse->sock[]" before making call_rcu() call. However, this unpublish-action is currently done after a grace period and it may cause use-after-free. The fix is to move reuseport_detach_sock() to sk_destruct(). Due to the above reason, any socket with sk_reuseport_cb has to go through the rcu grace period before freeing it. It is a rather old bug (~3 yrs). The Fixes tag is not necessary the right commit but it is the one that introduced the SOCK_RCU_FREE logic and this fix is depending on it. Fixes: a4298e4522d6 ("net: add SOCK_RCU_FREE socket flag") Cc: Eric Dumazet Suggested-by: Eric Dumazet Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/core/sock.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 07863edbe6fc..e23ec80a67bf 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1700,8 +1700,6 @@ static void __sk_destruct(struct rcu_head *head) sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } - if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); @@ -1728,7 +1726,14 @@ static void __sk_destruct(struct rcu_head *head) void sk_destruct(struct sock *sk) { - if (sock_flag(sk, SOCK_RCU_FREE)) + bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); + + if (rcu_access_pointer(sk->sk_reuseport_cb)) { + reuseport_detach_sock(sk); + use_call_rcu = true; + } + + if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); From 13dc8c029cabf52ba95f60c56eb104d4d95d5889 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 21 Sep 2019 15:49:54 +0900 Subject: [PATCH 1483/2880] kbuild: remove ar-option and KBUILD_ARFLAGS Commit 40df759e2b9e ("kbuild: Fix build with binutils <= 2.19") introduced ar-option and KBUILD_ARFLAGS to deal with old binutils. According to Documentation/process/changes.rst, the current minimal supported version of binutils is 2.21 so you can assume the 'D' option is always supported. Not only GNU ar but also llvm-ar supports it. With the 'D' option hard-coded, there is no more user of ar-option or KBUILD_ARFLAGS. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers --- Documentation/kbuild/makefiles.rst | 5 ----- Makefile | 4 ---- arch/powerpc/boot/Makefile | 2 +- scripts/Kbuild.include | 5 ----- scripts/Makefile.build | 2 +- scripts/Makefile.lib | 2 +- 6 files changed, 3 insertions(+), 17 deletions(-) diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 6ba9d5365ff3..b89c88168d6a 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -954,11 +954,6 @@ When kbuild executes, the following steps are followed (roughly): From commandline LDFLAGS_MODULE shall be used (see kbuild.txt). - KBUILD_ARFLAGS Options for $(AR) when creating archives - - $(KBUILD_ARFLAGS) set by the top level Makefile to "D" (deterministic - mode) if this option is supported by $(AR). - KBUILD_LDS The linker script with full path. Assigned by the top-level Makefile. diff --git a/Makefile b/Makefile index 6f54f2f95743..7452174f5b21 100644 --- a/Makefile +++ b/Makefile @@ -498,7 +498,6 @@ export CFLAGS_KASAN CFLAGS_KASAN_NOSANITIZE CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL -export KBUILD_ARFLAGS # Files to ignore in find ... statements @@ -914,9 +913,6 @@ ifdef CONFIG_RETPOLINE KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none) endif -# use the deterministic mode of AR if available -KBUILD_ARFLAGS := $(call ar-option,D) - include scripts/Makefile.kasan include scripts/Makefile.extrawarn include scripts/Makefile.ubsan diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 6841bd52738b..dfbd7f22eef5 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -50,7 +50,7 @@ endif BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAGS) -nostdinc -BOOTARFLAGS := -cr$(KBUILD_ARFLAGS) +BOOTARFLAGS := -crD ifdef CONFIG_CC_IS_CLANG BOOTCFLAGS += $(CLANG_FLAGS) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 4b0432e095ae..10ba926ae292 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -143,11 +143,6 @@ cc-ifversion = $(shell [ $(CONFIG_GCC_VERSION)0 $(1) $(2)000 ] && echo $(3) || e # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y) ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3)) -# ar-option -# Usage: KBUILD_ARFLAGS := $(call ar-option,D) -# Important: no spaces around options -ar-option = $(call try-run, $(AR) rc$(1) "$$TMP",$(1),$(2)) - # ld-version # Note this is mainly for HJ Lu's 3 number binutil versions ld-version = $(shell $(LD) --version | $(srctree)/scripts/ld-version.sh) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index f72aba64d611..a9e47953ca53 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -389,7 +389,7 @@ $(sort $(subdir-obj-y)): $(subdir-ym) ; ifdef builtin-target quiet_cmd_ar_builtin = AR $@ - cmd_ar_builtin = rm -f $@; $(AR) rcSTP$(KBUILD_ARFLAGS) $@ $(real-prereqs) + cmd_ar_builtin = rm -f $@; $(AR) cDPrST $@ $(real-prereqs) $(builtin-target): $(real-obj-y) FORCE $(call if_changed,ar_builtin) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 4a0cdd6f5909..179d55af5852 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -232,7 +232,7 @@ quiet_cmd_ld = LD $@ # --------------------------------------------------------------------------- quiet_cmd_ar = AR $@ - cmd_ar = rm -f $@; $(AR) rcsTP$(KBUILD_ARFLAGS) $@ $(real-prereqs) + cmd_ar = rm -f $@; $(AR) cDPrsT $@ $(real-prereqs) # Objcopy # --------------------------------------------------------------------------- From b6f2494d311a19b33b19708543e7ef6dea1de459 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Sep 2019 01:08:17 +0300 Subject: [PATCH 1484/2880] net: dsa: sja1105: Ensure PTP time for rxtstamp reconstruction is not in the past Sometimes the PTP synchronization on the switch 'jumps': ptp4l[11241.155]: rms 8 max 16 freq -21732 +/- 11 delay 742 +/- 0 ptp4l[11243.157]: rms 7 max 17 freq -21731 +/- 10 delay 744 +/- 0 ptp4l[11245.160]: rms 33592410 max 134217731 freq +192422 +/- 8530253 delay 743 +/- 0 ptp4l[11247.163]: rms 811631 max 964131 freq +10326 +/- 557785 delay 743 +/- 0 ptp4l[11249.166]: rms 261936 max 533876 freq -304323 +/- 126371 delay 744 +/- 0 ptp4l[11251.169]: rms 48700 max 57740 freq -20218 +/- 30532 delay 744 +/- 0 ptp4l[11253.171]: rms 14570 max 30163 freq -5568 +/- 7563 delay 742 +/- 0 ptp4l[11255.174]: rms 2914 max 3440 freq -22001 +/- 1667 delay 744 +/- 1 ptp4l[11257.177]: rms 811 max 1710 freq -22653 +/- 451 delay 744 +/- 1 ptp4l[11259.180]: rms 177 max 218 freq -21695 +/- 89 delay 741 +/- 0 ptp4l[11261.182]: rms 45 max 92 freq -21677 +/- 32 delay 742 +/- 0 ptp4l[11263.186]: rms 14 max 32 freq -21733 +/- 11 delay 742 +/- 0 ptp4l[11265.188]: rms 9 max 14 freq -21725 +/- 12 delay 742 +/- 0 ptp4l[11267.191]: rms 9 max 16 freq -21727 +/- 13 delay 742 +/- 0 ptp4l[11269.194]: rms 6 max 15 freq -21726 +/- 9 delay 743 +/- 0 ptp4l[11271.197]: rms 8 max 15 freq -21728 +/- 11 delay 743 +/- 0 ptp4l[11273.200]: rms 6 max 12 freq -21727 +/- 8 delay 743 +/- 0 ptp4l[11275.202]: rms 9 max 17 freq -21720 +/- 11 delay 742 +/- 0 ptp4l[11277.205]: rms 9 max 18 freq -21725 +/- 12 delay 742 +/- 0 Background: the switch only offers partial RX timestamps (24 bits) and it is up to the driver to read the PTP clock to fill those timestamps up to 64 bits. But the PTP clock readout needs to happen quickly enough (in 0.135 seconds, in fact), otherwise the PTP clock will wrap around 24 bits, condition which cannot be detected. Looking at the 'max 134217731' value on output line 3, one can see that in hex it is 0x8000003. Because the PTP clock resolution is 8 ns, that means 0x1000000 in ticks, which is exactly 2^24. So indeed this is a PTP clock wraparound, but the reason might be surprising. What is going on is that sja1105_tstamp_reconstruct(priv, now, ts) expects a "now" time that is later than the "ts" was snapshotted at. This, of course, is obvious: we read the PTP time _after_ the partial RX timestamp was received. However, the workqueue is processing frames from a skb queue and reuses the same PTP time, read once at the beginning. Normally the skb queue only contains one frame and all goes well. But when the skb queue contains two frames, the second frame that gets dequeued might have been partially timestamped by the RX MAC _after_ we had read our PTP time initially. The code was originally like that due to concerns that SPI access for PTP time readout is a slow process, and we are time-constrained anyway (aka: premature optimization). But some timing analysis reveals that the time spent until the RX timestamp is completely reconstructed is 1 order of magnitude lower than the 0.135 s deadline even under worst-case conditions. So we can afford to read the PTP time for each frame in the RX timestamping queue, which of course ensures that the full PTP time is in the partial timestamp's future. Fixes: f3097be21bf1 ("net: dsa: sja1105: Add a state machine for RX timestamping") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index b9def744bcb3..f3d38ff144c4 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2005,12 +2005,12 @@ static void sja1105_rxtstamp_work(struct work_struct *work) mutex_lock(&priv->ptp_lock); - now = priv->tstamp_cc.read(&priv->tstamp_cc); - while ((skb = skb_dequeue(&data->skb_rxtstamp_queue)) != NULL) { struct skb_shared_hwtstamps *shwt = skb_hwtstamps(skb); u64 ts; + now = priv->tstamp_cc.read(&priv->tstamp_cc); + *shwt = (struct skb_shared_hwtstamps) {0}; ts = SJA1105_SKB_CB(skb)->meta_tstamp; From 7e35b42591c058b91282f95ce3b2cf0c05ffe93d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 21 Sep 2019 16:05:41 +0900 Subject: [PATCH 1485/2880] kbuild: remove SUBDIRS support Linux 5.3 is out. Remove the SUBDIRS support. Signed-off-by: Masahiro Yamada --- Makefile | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index 7452174f5b21..779c9c9b9820 100644 --- a/Makefile +++ b/Makefile @@ -206,24 +206,8 @@ ifndef KBUILD_CHECKSRC KBUILD_CHECKSRC = 0 endif -# Use make M=dir to specify directory of external module to build -# Old syntax make ... SUBDIRS=$PWD is still supported -# Setting the environment variable KBUILD_EXTMOD take precedence -ifdef SUBDIRS - $(warning ================= WARNING ================) - $(warning 'SUBDIRS' will be removed after Linux 5.3) - $(warning ) - $(warning If you are building an individual subdirectory) - $(warning in the kernel tree, you can do like this:) - $(warning $$ make path/to/dir/you/want/to/build/) - $(warning (Do not forget the trailing slash)) - $(warning ) - $(warning If you are building an external module,) - $(warning Please use 'M=' or 'KBUILD_EXTMOD' instead) - $(warning ==========================================) - KBUILD_EXTMOD ?= $(SUBDIRS) -endif - +# Use make M=dir or set the environment variable KBUILD_EXTMOD to specify the +# directory of external module to build. Setting M= takes precedence. ifeq ("$(origin M)", "command line") KBUILD_EXTMOD := $(M) endif From 807f2105b87af20c4a582fb62111b16689f83bb8 Mon Sep 17 00:00:00 2001 From: Alex Gaynor Date: Sat, 21 Sep 2019 15:23:04 -0700 Subject: [PATCH 1486/2880] kbuild: correct formatting of header in kbuild module docs Minor formatting fixup. Fixes: cd238effefa2 ("docs: kbuild: convert docs to ReST and rename to *.rst") Signed-off-by: Alex Gaynor Signed-off-by: Matthew Garrett Signed-off-by: Masahiro Yamada --- Documentation/kbuild/modules.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/kbuild/modules.rst b/Documentation/kbuild/modules.rst index d2ae799237fd..33a17121d11d 100644 --- a/Documentation/kbuild/modules.rst +++ b/Documentation/kbuild/modules.rst @@ -498,7 +498,8 @@ build. will be written containing all exported symbols that were not defined in the kernel. ---- 6.3 Symbols From Another External Module +6.3 Symbols From Another External Module +---------------------------------------- Sometimes, an external module uses exported symbols from another external module. kbuild needs to have full knowledge of From 47346e96f004eca07720e1e2b24fc7f0b0df4092 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 24 Sep 2019 21:07:40 +0900 Subject: [PATCH 1487/2880] modpost: fix static EXPORT_SYMBOL warnings for UML build Johannes Berg reports lots of modpost warnings on ARCH=um builds: WARNING: "rename" [vmlinux] is a static EXPORT_SYMBOL WARNING: "lseek" [vmlinux] is a static EXPORT_SYMBOL WARNING: "ftruncate64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "getuid" [vmlinux] is a static EXPORT_SYMBOL WARNING: "lseek64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "unlink" [vmlinux] is a static EXPORT_SYMBOL WARNING: "pwrite64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "close" [vmlinux] is a static EXPORT_SYMBOL WARNING: "opendir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "pread64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "syscall" [vmlinux] is a static EXPORT_SYMBOL WARNING: "readdir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "readdir64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "futimes" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__lxstat" [vmlinux] is a static EXPORT_SYMBOL WARNING: "write" [vmlinux] is a static EXPORT_SYMBOL WARNING: "closedir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__xstat" [vmlinux] is a static EXPORT_SYMBOL WARNING: "fsync" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__lxstat64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__fxstat64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "telldir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "printf" [vmlinux] is a static EXPORT_SYMBOL WARNING: "readlink" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__sprintf_chk" [vmlinux] is a static EXPORT_SYMBOL WARNING: "link" [vmlinux] is a static EXPORT_SYMBOL WARNING: "rmdir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "fdatasync" [vmlinux] is a static EXPORT_SYMBOL WARNING: "truncate" [vmlinux] is a static EXPORT_SYMBOL WARNING: "statfs" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__errno_location" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__xmknod" [vmlinux] is a static EXPORT_SYMBOL WARNING: "open64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "truncate64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "open" [vmlinux] is a static EXPORT_SYMBOL WARNING: "read" [vmlinux] is a static EXPORT_SYMBOL WARNING: "chown" [vmlinux] is a static EXPORT_SYMBOL WARNING: "chmod" [vmlinux] is a static EXPORT_SYMBOL WARNING: "utime" [vmlinux] is a static EXPORT_SYMBOL WARNING: "fchmod" [vmlinux] is a static EXPORT_SYMBOL WARNING: "seekdir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "ioctl" [vmlinux] is a static EXPORT_SYMBOL WARNING: "dup2" [vmlinux] is a static EXPORT_SYMBOL WARNING: "statfs64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "utimes" [vmlinux] is a static EXPORT_SYMBOL WARNING: "mkdir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "fchown" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__guard" [vmlinux] is a static EXPORT_SYMBOL WARNING: "symlink" [vmlinux] is a static EXPORT_SYMBOL WARNING: "access" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__stack_smash_handler" [vmlinux] is a static EXPORT_SYMBOL When you run "make", the modpost is run twice; before linking vmlinux, and before building modules. All the warnings above are from the second modpost. The offending symbols are defined not in vmlinux, but in the C library. The first modpost is run against the relocatable vmlinux.o, and those warnings are nicely suppressed because the SH_UNDEF entries from the symbol table clear the ->is_static flag. The second modpost is run against the executable vmlinux (+ modules), where those symbols have been resolved, but the definitions do not exist. This commit fixes it in a straightforward way; suppress the static EXPORT_SYMBOL warnings from "vmlinux". Without this commit, we see valid warnings twice anyway. For example, ARCH=arm64 defconfig shows the following warning twice: WARNING: "HYPERVISOR_platform_op" [vmlinux] is a static EXPORT_SYMBOL_GPL So, it is reasonable to suppress the second one. Fixes: 15bfc2348d54 ("modpost: check for static EXPORT_SYMBOL* functions") Reported-by: Johannes Berg Signed-off-by: Masahiro Yamada Tested-by: Johannes Berg Tested-by: Denis Efremov --- scripts/mod/modpost.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 3961941e8e7a..442d5e2ad688 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2652,15 +2652,20 @@ int main(int argc, char **argv) fatal("modpost: Section mismatches detected.\n" "Set CONFIG_SECTION_MISMATCH_WARN_ONLY=y to allow them.\n"); for (n = 0; n < SYMBOL_HASH_SIZE; n++) { - struct symbol *s = symbolhash[n]; + struct symbol *s; + + for (s = symbolhash[n]; s; s = s->next) { + /* + * Do not check "vmlinux". This avoids the same warnings + * shown twice, and false-positives for ARCH=um. + */ + if (is_vmlinux(s->module->name) && !s->module->is_dot_o) + continue; - while (s) { if (s->is_static) warn("\"%s\" [%s] is a static %s\n", s->name, s->module->name, export_str(s->export)); - - s = s->next; } } From 68501df92d116b760777a2cfda314789f926476f Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Sun, 29 Sep 2019 01:43:39 +0300 Subject: [PATCH 1488/2880] net: dsa: sja1105: Prevent leaking memory In sja1105_static_config_upload, in two cases memory is leaked: when static_config_buf_prepare_for_upload fails and when sja1105_inhibit_tx fails. In both cases config_buf should be released. Fixes: 8aa9ebccae87 ("net: dsa: Introduce driver for NXP SJA1105 5-port L2 switch") Fixes: 1a4c69406cc1 ("net: dsa: sja1105: Prevent PHY jabbering during switch reset") Signed-off-by: Navid Emamdoost Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_spi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index 84dc603138cf..58dd37ecde17 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -409,7 +409,8 @@ int sja1105_static_config_upload(struct sja1105_private *priv) rc = static_config_buf_prepare_for_upload(priv, config_buf, buf_len); if (rc < 0) { dev_err(dev, "Invalid config, cannot upload\n"); - return -EINVAL; + rc = -EINVAL; + goto out; } /* Prevent PHY jabbering during switch reset by inhibiting * Tx on all ports and waiting for current packet to drain. @@ -418,7 +419,8 @@ int sja1105_static_config_upload(struct sja1105_private *priv) rc = sja1105_inhibit_tx(priv, port_bitmap, true); if (rc < 0) { dev_err(dev, "Failed to inhibit Tx on ports\n"); - return -ENXIO; + rc = -ENXIO; + goto out; } /* Wait for an eventual egress packet to finish transmission * (reach IFG). It is guaranteed that a second one will not From 68ce6688a5baefde30914fc07fc27292dbbe8320 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Sep 2019 02:01:39 +0300 Subject: [PATCH 1489/2880] net: sched: taprio: Fix potential integer overflow in taprio_set_picos_per_byte The speed divisor is used in a context expecting an s64, but it is evaluated using 32-bit arithmetic. To avoid that happening, instead of multiplying by 1,000,000 in the first place, simplify the fraction and do a standard 32 bit division instead. Fixes: f04b514c0ce2 ("taprio: Set default link speed to 10 Mbps in taprio_set_picos_per_byte") Reported-by: Gustavo A. R. Silva Suggested-by: Eric Dumazet Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 2f7b34205c82..2aab46ada94f 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1048,8 +1048,7 @@ static void taprio_set_picos_per_byte(struct net_device *dev, speed = ecmd.base.speed; skip: - picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, - speed * 1000 * 1000); + picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", From 21e3d6c81179bbdfa279efc8de456c34b814cfd2 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 3 Sep 2019 12:18:39 +0200 Subject: [PATCH 1490/2880] scsi: sd: Ignore a failure to sync cache due to lack of authorization I've got a report about a UAS drive enclosure reporting back Sense: Logical unit access not authorized if the drive it holds is password protected. While the drive is obviously unusable in that state as a mass storage device, it still exists as a sd device and when the system is asked to perform a suspend of the drive, it will be sent a SYNCHRONIZE CACHE. If that fails due to password protection, the error must be ignored. Cc: Link: https://lore.kernel.org/r/20190903101840.16483-1-oneukum@suse.com Signed-off-by: Oliver Neukum Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 91af598f2f53..0f96eb0ddbfa 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1655,7 +1655,8 @@ static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) /* we need to evaluate the error return */ if (scsi_sense_valid(sshdr) && (sshdr->asc == 0x3a || /* medium not present */ - sshdr->asc == 0x20)) /* invalid command */ + sshdr->asc == 0x20 || /* invalid command */ + (sshdr->asc == 0x74 && sshdr->ascq == 0x71))) /* drive is password locked */ /* this is no error here */ return 0; From 9bc6157f5fd0e898c94f3018d088a3419bde0d8f Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 27 Sep 2019 09:30:31 +0200 Subject: [PATCH 1491/2880] scsi: qla2xxx: Remove WARN_ON_ONCE in qla2x00_status_cont_entry() Commit 88263208dd23 ("scsi: qla2xxx: Complain if sp->done() is not called from the completion path") introduced the WARN_ON_ONCE in qla2x00_status_cont_entry(). The assumption was that there is only one status continuations element. According to the firmware documentation it is possible that multiple status continuations are emitted by the firmware. Fixes: 88263208dd23 ("scsi: qla2xxx: Complain if sp->done() is not called from the completion path") Link: https://lore.kernel.org/r/20190927073031.62296-1-dwagner@suse.de Cc: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Signed-off-by: Daniel Wagner Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_isr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index 4c26630c1c3e..009fd5a33fcd 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -2837,8 +2837,6 @@ qla2x00_status_cont_entry(struct rsp_que *rsp, sts_cont_entry_t *pkt) if (sense_len == 0) { rsp->status_srb = NULL; sp->done(sp, cp->result); - } else { - WARN_ON_ONCE(true); } } From ec1db1be106101c24f6f24efbb5eb3176f66cb50 Mon Sep 17 00:00:00 2001 From: Michael Straube Date: Tue, 17 Sep 2019 21:07:55 +0200 Subject: [PATCH 1492/2880] staging: exfat: add missing SPDX line to Kconfig Add SPDX identifier to Kconfig. Signed-off-by: Michael Straube Link: https://lore.kernel.org/r/20190917190755.21723-1-straube.linux@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/exfat/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/exfat/Kconfig b/drivers/staging/exfat/Kconfig index 290dbfc7ace1..59e07afe249c 100644 --- a/drivers/staging/exfat/Kconfig +++ b/drivers/staging/exfat/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 config EXFAT_FS tristate "exFAT fs support" depends on BLOCK From a358eea07c78d9d0c246738ed1c6349d72d41920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Wed, 18 Sep 2019 20:38:08 -0400 Subject: [PATCH 1493/2880] staging: exfat - fix SPDX tags.. The copyright notices as I got them said "GPLv2 or later", which I screwed up when putting in the SPDX tags. Fix them to match the license I got the code under. Signed-off-by: Valdis Kletnieks Link: https://lore.kernel.org/r/122590.1568853488@turing-police Signed-off-by: Greg Kroah-Hartman --- drivers/staging/exfat/Makefile | 2 +- drivers/staging/exfat/exfat.h | 2 +- drivers/staging/exfat/exfat_blkdev.c | 2 +- drivers/staging/exfat/exfat_cache.c | 2 +- drivers/staging/exfat/exfat_core.c | 2 +- drivers/staging/exfat/exfat_nls.c | 2 +- drivers/staging/exfat/exfat_super.c | 2 +- drivers/staging/exfat/exfat_upcase.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/staging/exfat/Makefile b/drivers/staging/exfat/Makefile index 84944dfbae28..6c90aec83feb 100644 --- a/drivers/staging/exfat/Makefile +++ b/drivers/staging/exfat/Makefile @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 +# SPDX-License-Identifier: GPL-2.0-or-later obj-$(CONFIG_EXFAT_FS) += exfat.o diff --git a/drivers/staging/exfat/exfat.h b/drivers/staging/exfat/exfat.h index 6c12f2d79f4d..3abab33e932c 100644 --- a/drivers/staging/exfat/exfat.h +++ b/drivers/staging/exfat/exfat.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ diff --git a/drivers/staging/exfat/exfat_blkdev.c b/drivers/staging/exfat/exfat_blkdev.c index f086c75e7076..81d20e6241c6 100644 --- a/drivers/staging/exfat/exfat_blkdev.c +++ b/drivers/staging/exfat/exfat_blkdev.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ diff --git a/drivers/staging/exfat/exfat_cache.c b/drivers/staging/exfat/exfat_cache.c index 1565ce65d39f..e1b001718709 100644 --- a/drivers/staging/exfat/exfat_cache.c +++ b/drivers/staging/exfat/exfat_cache.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ diff --git a/drivers/staging/exfat/exfat_core.c b/drivers/staging/exfat/exfat_core.c index b3e9cf725cf5..79174e5c4145 100644 --- a/drivers/staging/exfat/exfat_core.c +++ b/drivers/staging/exfat/exfat_core.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ diff --git a/drivers/staging/exfat/exfat_nls.c b/drivers/staging/exfat/exfat_nls.c index 03cb8290b5d2..a5c4b68925fb 100644 --- a/drivers/staging/exfat/exfat_nls.c +++ b/drivers/staging/exfat/exfat_nls.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ diff --git a/drivers/staging/exfat/exfat_super.c b/drivers/staging/exfat/exfat_super.c index 5f6caee819a6..229ecabe7a93 100644 --- a/drivers/staging/exfat/exfat_super.c +++ b/drivers/staging/exfat/exfat_super.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ diff --git a/drivers/staging/exfat/exfat_upcase.c b/drivers/staging/exfat/exfat_upcase.c index 366082fb3dab..b91a1faa0e50 100644 --- a/drivers/staging/exfat/exfat_upcase.c +++ b/drivers/staging/exfat/exfat_upcase.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ From 89d5f78fab48844d750f6f3a419e26f7f828bac8 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 30 Sep 2019 22:05:04 +0900 Subject: [PATCH 1494/2880] staging: exfat: Fix a typo in Kconfig This patch fix a spelling typo in Kconfig. Signed-off-by: Masanari Iida Link: https://lore.kernel.org/r/20190930130504.21994-1-standby24x7@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/exfat/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/exfat/Kconfig b/drivers/staging/exfat/Kconfig index 59e07afe249c..ce32dfe33bec 100644 --- a/drivers/staging/exfat/Kconfig +++ b/drivers/staging/exfat/Kconfig @@ -7,7 +7,7 @@ config EXFAT_FS This adds support for the exFAT file system. config EXFAT_DONT_MOUNT_VFAT - bool "Prohibit mounting of fat/vfat filesysems by exFAT" + bool "Prohibit mounting of fat/vfat filesystems by exFAT" depends on EXFAT_FS default y help From 7d4dea95f8281fc7f14766a6040e3504d3f658fc Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 19 Sep 2019 11:50:22 +0200 Subject: [PATCH 1495/2880] staging: octeon: Use "(uintptr_t)" to cast from pointer to int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 32-bit: In file included from drivers/staging/octeon/octeon-ethernet.h:41, from drivers/staging/octeon/ethernet-tx.c:25: drivers/staging/octeon/octeon-stubs.h: In function ‘cvmx_phys_to_ptr’: drivers/staging/octeon/octeon-stubs.h:1205:9: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] return (void *)(physical_address); ^ drivers/staging/octeon/ethernet-tx.c: In function ‘cvm_oct_xmit’: drivers/staging/octeon/ethernet-tx.c:264:37: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] hw_buffer.s.addr = XKPHYS_TO_PHYS((u64)skb->data); ^ drivers/staging/octeon/octeon-stubs.h:2:30: note: in definition of macro ‘XKPHYS_TO_PHYS’ #define XKPHYS_TO_PHYS(p) (p) ^ drivers/staging/octeon/ethernet-tx.c:268:37: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] hw_buffer.s.addr = XKPHYS_TO_PHYS((u64)skb->data); ^ drivers/staging/octeon/octeon-stubs.h:2:30: note: in definition of macro ‘XKPHYS_TO_PHYS’ #define XKPHYS_TO_PHYS(p) (p) ^ drivers/staging/octeon/ethernet-tx.c:276:20: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] XKPHYS_TO_PHYS((u64)skb_frag_address(fs)); ^ drivers/staging/octeon/octeon-stubs.h:2:30: note: in definition of macro ‘XKPHYS_TO_PHYS’ #define XKPHYS_TO_PHYS(p) (p) ^ drivers/staging/octeon/ethernet-tx.c:280:37: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] hw_buffer.s.addr = XKPHYS_TO_PHYS((u64)CVM_OCT_SKB_CB(skb)); ^ drivers/staging/octeon/octeon-stubs.h:2:30: note: in definition of macro ‘XKPHYS_TO_PHYS’ #define XKPHYS_TO_PHYS(p) (p) ^ Fix this by replacing casts to "u64" by casts to "uintptr_t", which is either 32-bit or 64-bit, and adding an intermediate cast to "uintptr_t" where needed. Exposed by commit 171a9bae68c72f2d ("staging/octeon: Allow test build on !MIPS"). Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20190919095022.29099-1-geert@linux-m68k.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/octeon/ethernet-tx.c | 9 +++++---- drivers/staging/octeon/octeon-stubs.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c index c64728fc21f2..7021ff07ba2a 100644 --- a/drivers/staging/octeon/ethernet-tx.c +++ b/drivers/staging/octeon/ethernet-tx.c @@ -261,11 +261,11 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev) /* Build the PKO buffer pointer */ hw_buffer.u64 = 0; if (skb_shinfo(skb)->nr_frags == 0) { - hw_buffer.s.addr = XKPHYS_TO_PHYS((u64)skb->data); + hw_buffer.s.addr = XKPHYS_TO_PHYS((uintptr_t)skb->data); hw_buffer.s.pool = 0; hw_buffer.s.size = skb->len; } else { - hw_buffer.s.addr = XKPHYS_TO_PHYS((u64)skb->data); + hw_buffer.s.addr = XKPHYS_TO_PHYS((uintptr_t)skb->data); hw_buffer.s.pool = 0; hw_buffer.s.size = skb_headlen(skb); CVM_OCT_SKB_CB(skb)[0] = hw_buffer.u64; @@ -273,11 +273,12 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev) skb_frag_t *fs = skb_shinfo(skb)->frags + i; hw_buffer.s.addr = - XKPHYS_TO_PHYS((u64)skb_frag_address(fs)); + XKPHYS_TO_PHYS((uintptr_t)skb_frag_address(fs)); hw_buffer.s.size = skb_frag_size(fs); CVM_OCT_SKB_CB(skb)[i + 1] = hw_buffer.u64; } - hw_buffer.s.addr = XKPHYS_TO_PHYS((u64)CVM_OCT_SKB_CB(skb)); + hw_buffer.s.addr = + XKPHYS_TO_PHYS((uintptr_t)CVM_OCT_SKB_CB(skb)); hw_buffer.s.size = skb_shinfo(skb)->nr_frags + 1; pko_command.s.segs = skb_shinfo(skb)->nr_frags + 1; pko_command.s.gather = 1; diff --git a/drivers/staging/octeon/octeon-stubs.h b/drivers/staging/octeon/octeon-stubs.h index a4ac3bfb62a8..b78ce9eaab85 100644 --- a/drivers/staging/octeon/octeon-stubs.h +++ b/drivers/staging/octeon/octeon-stubs.h @@ -1202,7 +1202,7 @@ static inline int cvmx_wqe_get_grp(cvmx_wqe_t *work) static inline void *cvmx_phys_to_ptr(uint64_t physical_address) { - return (void *)(physical_address); + return (void *)(uintptr_t)(physical_address); } static inline uint64_t cvmx_ptr_to_phys(void *ptr) From 63f2b1677fba11c5bd02089f25c13421948905f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Noralf=20Tr=C3=B8nnes?= Date: Tue, 17 Sep 2019 19:18:41 +0200 Subject: [PATCH 1496/2880] staging/fbtft: Depend on OF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c440eee1a7a1 ("Staging: fbtft: Switch to the gpio descriptor interface") removed setting gpios via platform data. This means that fbtft will now only work with Device Tree so set the dependency. This also prevents a NULL pointer deref on non-DT platform because fbtftops.request_gpios is not set in that case anymore. Fixes: c440eee1a7a1 ("Staging: fbtft: Switch to the gpio descriptor interface") Cc: stable Signed-off-by: Noralf Trønnes Link: https://lore.kernel.org/r/20190917171843.10334-1-noralf@tronnes.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/fbtft/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/fbtft/Kconfig b/drivers/staging/fbtft/Kconfig index 8ec524a95ec8..4e5d860fd788 100644 --- a/drivers/staging/fbtft/Kconfig +++ b/drivers/staging/fbtft/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 menuconfig FB_TFT tristate "Support for small TFT LCD display modules" - depends on FB && SPI + depends on FB && SPI && OF depends on GPIOLIB || COMPILE_TEST select FB_SYS_FILLRECT select FB_SYS_COPYAREA From 2962db71c7030868b6859d09b2138b55297df95e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Noralf=20Tr=C3=B8nnes?= Date: Tue, 17 Sep 2019 19:18:42 +0200 Subject: [PATCH 1497/2880] staging/fbtft: Remove fbtft_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c440eee1a7a1 ("Staging: fbtft: Switch to the gpio descriptor interface") removed the gpio code from fbtft_device rendering it useless. fbtft_device is a module that was used on the Raspberry Pi to dynamically add fbtft devices when the Pi didn't have Device Tree support. Just remove the module since it's the responsibility of Device Tree, ACPI or platform code to add devices. Fixes: c440eee1a7a1 ("Staging: fbtft: Switch to the gpio descriptor interface") Signed-off-by: Noralf Trønnes Link: https://lore.kernel.org/r/20190917171843.10334-2-noralf@tronnes.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/fbtft/Kconfig | 4 - drivers/staging/fbtft/Makefile | 3 - drivers/staging/fbtft/fbtft_device.c | 1261 -------------------------- 3 files changed, 1268 deletions(-) delete mode 100644 drivers/staging/fbtft/fbtft_device.c diff --git a/drivers/staging/fbtft/Kconfig b/drivers/staging/fbtft/Kconfig index 4e5d860fd788..4c37d087e25c 100644 --- a/drivers/staging/fbtft/Kconfig +++ b/drivers/staging/fbtft/Kconfig @@ -205,7 +205,3 @@ config FB_FLEX depends on FB_TFT help Generic Framebuffer support for TFT LCD displays. - -config FB_TFT_FBTFT_DEVICE - tristate "Module to for adding FBTFT devices" - depends on FB_TFT diff --git a/drivers/staging/fbtft/Makefile b/drivers/staging/fbtft/Makefile index 6bc03311c9c7..925ec17e318d 100644 --- a/drivers/staging/fbtft/Makefile +++ b/drivers/staging/fbtft/Makefile @@ -37,6 +37,3 @@ obj-$(CONFIG_FB_TFT_UC1701) += fb_uc1701.o obj-$(CONFIG_FB_TFT_UPD161704) += fb_upd161704.o obj-$(CONFIG_FB_TFT_WATTEROTT) += fb_watterott.o obj-$(CONFIG_FB_FLEX) += flexfb.o - -# Device modules -obj-$(CONFIG_FB_TFT_FBTFT_DEVICE) += fbtft_device.o diff --git a/drivers/staging/fbtft/fbtft_device.c b/drivers/staging/fbtft/fbtft_device.c deleted file mode 100644 index 44e1410eb3fe..000000000000 --- a/drivers/staging/fbtft/fbtft_device.c +++ /dev/null @@ -1,1261 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * - * Copyright (C) 2013, Noralf Tronnes - */ - -#define pr_fmt(fmt) "fbtft_device: " fmt -#include -#include -#include -#include -#include -#include

; if - * found, dentry is grabbed and passed to caller via *. - * If no such element exists, the anchor of list is returned - * and * is set to NULL. + * found, dentry is grabbed and returned to caller. + * If no such element exists, NULL is returned. */ -static struct list_head *scan_positives(struct dentry *cursor, +static struct dentry *scan_positives(struct dentry *cursor, struct list_head *p, loff_t count, - struct dentry **res) + struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; @@ -127,9 +126,8 @@ static struct list_head *scan_positives(struct dentry *cursor, } } spin_unlock(&dentry->d_lock); - dput(*res); - *res = found; - return p; + dput(last); + return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) @@ -149,25 +147,22 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { struct dentry *cursor = file->private_data; struct dentry *to = NULL; - struct list_head *p; - file->f_pos = offset; inode_lock_shared(dentry->d_inode); - if (file->f_pos > 2) { - p = scan_positives(cursor, &dentry->d_subdirs, - file->f_pos - 2, &to); - spin_lock(&dentry->d_lock); - list_move(&cursor->d_child, p); - spin_unlock(&dentry->d_lock); - } else { - spin_lock(&dentry->d_lock); + if (offset > 2) + to = scan_positives(cursor, &dentry->d_subdirs, + offset - 2, NULL); + spin_lock(&dentry->d_lock); + if (to) + list_move(&cursor->d_child, &to->d_child); + else list_del_init(&cursor->d_child); - spin_unlock(&dentry->d_lock); - } - + spin_unlock(&dentry->d_lock); dput(to); + file->f_pos = offset; + inode_unlock_shared(dentry->d_inode); } return offset; @@ -199,17 +194,23 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) if (ctx->pos == 2) p = anchor; - else + else if (!list_empty(&cursor->d_child)) p = &cursor->d_child; + else + return 0; - while ((p = scan_positives(cursor, p, 1, &next)) != anchor) { + while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) break; ctx->pos++; + p = &next->d_child; } spin_lock(&dentry->d_lock); - list_move_tail(&cursor->d_child, p); + if (next) + list_move_tail(&cursor->d_child, &next->d_child); + else + list_del_init(&cursor->d_child); spin_unlock(&dentry->d_lock); dput(next); From e0ae2c578d3909e60e9448207f5d83f785f1129f Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Wed, 9 Oct 2019 11:07:18 +0200 Subject: [PATCH 2053/2880] net: usb: qmi_wwan: add Telit 0x1050 composition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for Telit FN980 0x1050 composition 0x1050: tty, adb, rmnet, tty, tty, tty, tty Signed-off-by: Daniele Palmas Acked-by: Bjørn Mork Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 3d77cd402ba9..596428ec71df 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1327,6 +1327,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ {QMI_FIXED_INTF(0x2357, 0x9000, 4)}, /* TP-LINK MA260 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */ + {QMI_QUIRK_SET_DTR(0x1bc7, 0x1050, 2)}, /* Telit FN980 */ {QMI_FIXED_INTF(0x1bc7, 0x1100, 3)}, /* Telit ME910 */ {QMI_FIXED_INTF(0x1bc7, 0x1101, 3)}, /* Telit ME910 dual modem */ {QMI_FIXED_INTF(0x1bc7, 0x1200, 5)}, /* Telit LE920 */ From 79a85e214d62da9a750cc63ef49483e62abbda81 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 10 Oct 2019 00:38:13 +0900 Subject: [PATCH 2054/2880] null_blk: Fix zoned command return code The return code from null_handle_zoned() sets the cmd->error value. Returning OK status when an error occured overwrites the intended cmd->error. Return the appropriate error code instead of setting the error in the cmd. Fixes: fceb5d1b19cbe626 ("null_blk: create a helper for zoned devices") Cc: Chaitanya Kulkarni Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index eabc116832a7..3d7fdea872f8 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -142,8 +142,7 @@ static blk_status_t null_zone_reset(struct nullb_cmd *cmd, sector_t sector) zone->wp = zone->start; break; default: - cmd->error = BLK_STS_NOTSUPP; - break; + return BLK_STS_NOTSUPP; } return BLK_STS_OK; } From 35a79a63517981a8aea395497c548776347deda8 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Wed, 18 Sep 2019 22:06:58 +0530 Subject: [PATCH 2055/2880] scsi: qla2xxx: fix a potential NULL pointer dereference alloc_workqueue is not checked for errors and as a result a potential NULL dereference could occur. Link: https://lore.kernel.org/r/1568824618-4366-1-git-send-email-allen.pais@oracle.com Signed-off-by: Allen Pais Reviewed-by: Martin Wilck Acked-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_os.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index ee47de9fbc05..e4d765fc03ea 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -3224,6 +3224,10 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) req->req_q_in, req->req_q_out, rsp->rsp_q_in, rsp->rsp_q_out); ha->wq = alloc_workqueue("qla2xxx_wq", 0, 0); + if (unlikely(!ha->wq)) { + ret = -ENOMEM; + goto probe_failed; + } if (ha->isp_ops->initialize_adapter(base_vha)) { ql_log(ql_log_fatal, base_vha, 0x00d6, From b6ce6fb121a655aefe41dccc077141c102145a37 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 7 Oct 2019 15:57:01 +0200 Subject: [PATCH 2056/2880] scsi: scsi_dh_alua: handle RTPG sense code correctly during state transitions Some arrays are not capable of returning RTPG data during state transitioning, but rather return an 'LUN not accessible, asymmetric access state transition' sense code. In these cases we can set the state to 'transitioning' directly and don't need to evaluate the RTPG data (which we won't have anyway). Link: https://lore.kernel.org/r/20191007135701.32389-1-hare@suse.de Reviewed-by: Laurence Oberman Reviewed-by: Ewan D. Milne Reviewed-by: Bart Van Assche Signed-off-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/device_handler/scsi_dh_alua.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index f0066f8a1786..c6437a442ffc 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -511,6 +511,7 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg) unsigned int tpg_desc_tbl_off; unsigned char orig_transition_tmo; unsigned long flags; + bool transitioning_sense = false; if (!pg->expiry) { unsigned long transition_tmo = ALUA_FAILOVER_TIMEOUT * HZ; @@ -571,13 +572,19 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg) goto retry; } /* - * Retry on ALUA state transition or if any - * UNIT ATTENTION occurred. + * If the array returns with 'ALUA state transition' + * sense code here it cannot return RTPG data during + * transition. So set the state to 'transitioning' directly. */ if (sense_hdr.sense_key == NOT_READY && - sense_hdr.asc == 0x04 && sense_hdr.ascq == 0x0a) - err = SCSI_DH_RETRY; - else if (sense_hdr.sense_key == UNIT_ATTENTION) + sense_hdr.asc == 0x04 && sense_hdr.ascq == 0x0a) { + transitioning_sense = true; + goto skip_rtpg; + } + /* + * Retry on any other UNIT ATTENTION occurred. + */ + if (sense_hdr.sense_key == UNIT_ATTENTION) err = SCSI_DH_RETRY; if (err == SCSI_DH_RETRY && pg->expiry != 0 && time_before(jiffies, pg->expiry)) { @@ -665,7 +672,11 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg) off = 8 + (desc[7] * 4); } + skip_rtpg: spin_lock_irqsave(&pg->lock, flags); + if (transitioning_sense) + pg->state = SCSI_ACCESS_STATE_TRANSITIONING; + sdev_printk(KERN_INFO, sdev, "%s: port group %02x state %c %s supports %c%c%c%c%c%c%c\n", ALUA_DH_NAME, pg->group_id, print_alua_state(pg->state), From 0ee6211408a8e939428f662833c7301394125b80 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Wed, 9 Oct 2019 17:11:18 +0200 Subject: [PATCH 2057/2880] scsi: sni_53c710: fix compilation error Drop out memory dev_printk() with wrong device pointer argument. [mkp: typo] Link: https://lore.kernel.org/r/20191009151118.32350-1-tbogendoerfer@suse.de Signed-off-by: Thomas Bogendoerfer Signed-off-by: Martin K. Petersen --- drivers/scsi/sni_53c710.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/scsi/sni_53c710.c b/drivers/scsi/sni_53c710.c index aef4881d8e21..a85d52b5dc32 100644 --- a/drivers/scsi/sni_53c710.c +++ b/drivers/scsi/sni_53c710.c @@ -66,10 +66,8 @@ static int snirm710_probe(struct platform_device *dev) base = res->start; hostdata = kzalloc(sizeof(*hostdata), GFP_KERNEL); - if (!hostdata) { - dev_printk(KERN_ERR, dev, "Failed to allocate host data\n"); + if (!hostdata) return -ENOMEM; - } hostdata->dev = &dev->dev; dma_set_mask(&dev->dev, DMA_BIT_MASK(32)); From 8cbf0c173aa096dda526d1ccd66fc751c31da346 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Wed, 9 Oct 2019 17:11:28 +0200 Subject: [PATCH 2058/2880] scsi: fix kconfig dependency warning related to 53C700_LE_ON_BE When building a kernel with SCSI_SNI_53C710 enabled, Kconfig warns: WARNING: unmet direct dependencies detected for 53C700_LE_ON_BE Depends on [n]: SCSI_LOWLEVEL [=y] && SCSI [=y] && SCSI_LASI700 [=n] Selected by [y]: - SCSI_SNI_53C710 [=y] && SCSI_LOWLEVEL [=y] && SNI_RM [=y] && SCSI [=y] Add the missing depends SCSI_SNI_53C710 to 53C700_LE_ON_BE to fix it. Link: https://lore.kernel.org/r/20191009151128.32411-1-tbogendoerfer@suse.de Signed-off-by: Thomas Bogendoerfer Signed-off-by: Martin K. Petersen --- drivers/scsi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 75f66f8ad3ea..d00e1ee09af3 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -898,7 +898,7 @@ config SCSI_SNI_53C710 config 53C700_LE_ON_BE bool - depends on SCSI_LASI700 + depends on SCSI_LASI700 || SCSI_SNI_53C710 default y config SCSI_STEX From 6a0990eaa768dfb7064f06777743acc6d392084b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 9 Oct 2019 10:35:36 -0700 Subject: [PATCH 2059/2880] scsi: ch: Make it possible to open a ch device multiple times again Clearing ch->device in ch_release() is wrong because that pointer must remain valid until ch_remove() is called. This patch fixes the following crash the second time a ch device is opened: BUG: kernel NULL pointer dereference, address: 0000000000000790 RIP: 0010:scsi_device_get+0x5/0x60 Call Trace: ch_open+0x4c/0xa0 [ch] chrdev_open+0xa2/0x1c0 do_dentry_open+0x13a/0x380 path_openat+0x591/0x1470 do_filp_open+0x91/0x100 do_sys_open+0x184/0x220 do_syscall_64+0x5f/0x1a0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 085e56766f74 ("scsi: ch: add refcounting") Cc: Hannes Reinecke Cc: Link: https://lore.kernel.org/r/20191009173536.247889-1-bvanassche@acm.org Reported-by: Rob Turk Suggested-by: Rob Turk Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/ch.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c index 5f8153c37f77..76751d6c7f0d 100644 --- a/drivers/scsi/ch.c +++ b/drivers/scsi/ch.c @@ -579,7 +579,6 @@ ch_release(struct inode *inode, struct file *file) scsi_changer *ch = file->private_data; scsi_device_put(ch->device); - ch->device = NULL; file->private_data = NULL; kref_put(&ch->ref, ch_destroy); return 0; From 993e4c929a073595d22c85f59082f0c387e31c21 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 9 Oct 2019 11:19:10 +0200 Subject: [PATCH 2060/2880] netns: fix NLM_F_ECHO mechanism for RTM_NEWNSID The flag NLM_F_ECHO aims to reply to the user the message notified to all listeners. It was not the case with the command RTM_NEWNSID, let's fix this. Fixes: 0c7aecd4bde4 ("netns: add rtnl cmd to add and get peer netns ids") Reported-by: Guillaume Nault Signed-off-by: Nicolas Dichtel Acked-by: Guillaume Nault Tested-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/core/net_namespace.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a0e0d298c991..6d3e4821b02d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -245,7 +245,8 @@ static int __peernet2id(struct net *net, struct net *peer) return __peernet2id_alloc(net, peer, &no); } -static void rtnl_net_notifyid(struct net *net, int cmd, int id); +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ @@ -268,7 +269,7 @@ int peernet2id_alloc(struct net *net, struct net *peer) id = __peernet2id_alloc(net, peer, &alloc); spin_unlock_bh(&net->nsid_lock); if (alloc && id >= 0) - rtnl_net_notifyid(net, RTM_NEWNSID, id); + rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL); if (alive) put_net(peer); return id; @@ -532,7 +533,7 @@ static void unhash_nsid(struct net *net, struct net *last) idr_remove(&tmp->netns_ids, id); spin_unlock_bh(&tmp->nsid_lock); if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id); + rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL); if (tmp == last) break; } @@ -764,7 +765,8 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, err = alloc_netid(net, peer, nsid); spin_unlock_bh(&net->nsid_lock); if (err >= 0) { - rtnl_net_notifyid(net, RTM_NEWNSID, err); + rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, + nlh); err = 0; } else if (err == -ENOSPC && nsid >= 0) { err = -EEXIST; @@ -1051,9 +1053,12 @@ end: return err < 0 ? err : skb->len; } -static void rtnl_net_notifyid(struct net *net, int cmd, int id) +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh) { struct net_fill_args fillargs = { + .portid = portid, + .seq = nlh ? nlh->nlmsg_seq : 0, .cmd = cmd, .nsid = id, }; @@ -1068,7 +1073,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (err < 0) goto err_out; - rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0); + rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, 0); return; err_out: From e37542ba111f3974dc622ae0a21c1787318de500 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 09:19:13 -0700 Subject: [PATCH 2061/2880] netfilter: conntrack: avoid possible false sharing As hinted by KCSAN, we need at least one READ_ONCE() to prevent a compiler optimization. More details on : https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE#it-may-improve-performance sysbot report : BUG: KCSAN: data-race in __nf_ct_refresh_acct / __nf_ct_refresh_acct read to 0xffff888123eb4f08 of 4 bytes by interrupt on cpu 0: __nf_ct_refresh_acct+0xd4/0x1b0 net/netfilter/nf_conntrack_core.c:1796 nf_ct_refresh_acct include/net/netfilter/nf_conntrack.h:201 [inline] nf_conntrack_tcp_packet+0xd40/0x3390 net/netfilter/nf_conntrack_proto_tcp.c:1161 nf_conntrack_handle_packet net/netfilter/nf_conntrack_core.c:1633 [inline] nf_conntrack_in+0x410/0xaa0 net/netfilter/nf_conntrack_core.c:1727 ipv4_conntrack_in+0x27/0x40 net/netfilter/nf_conntrack_proto.c:178 nf_hook_entry_hookfn include/linux/netfilter.h:135 [inline] nf_hook_slow+0x83/0x160 net/netfilter/core.c:512 nf_hook include/linux/netfilter.h:260 [inline] NF_HOOK include/linux/netfilter.h:303 [inline] ip_rcv+0x12f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6352 [inline] net_rx_action+0x3ae/0xa50 net/core/dev.c:6418 __do_softirq+0x115/0x33f kernel/softirq.c:292 write to 0xffff888123eb4f08 of 4 bytes by task 7191 on cpu 1: __nf_ct_refresh_acct+0xfb/0x1b0 net/netfilter/nf_conntrack_core.c:1797 nf_ct_refresh_acct include/net/netfilter/nf_conntrack.h:201 [inline] nf_conntrack_tcp_packet+0xd40/0x3390 net/netfilter/nf_conntrack_proto_tcp.c:1161 nf_conntrack_handle_packet net/netfilter/nf_conntrack_core.c:1633 [inline] nf_conntrack_in+0x410/0xaa0 net/netfilter/nf_conntrack_core.c:1727 ipv4_conntrack_local+0xbe/0x130 net/netfilter/nf_conntrack_proto.c:200 nf_hook_entry_hookfn include/linux/netfilter.h:135 [inline] nf_hook_slow+0x83/0x160 net/netfilter/core.c:512 nf_hook include/linux/netfilter.h:260 [inline] __ip_local_out+0x1f7/0x2b0 net/ipv4/ip_output.c:114 ip_local_out+0x31/0x90 net/ipv4/ip_output.c:123 __ip_queue_xmit+0x3a8/0xa40 net/ipv4/ip_output.c:532 ip_queue_xmit+0x45/0x60 include/net/ip.h:236 __tcp_transmit_skb+0xdeb/0x1cd0 net/ipv4/tcp_output.c:1158 __tcp_send_ack+0x246/0x300 net/ipv4/tcp_output.c:3685 tcp_send_ack+0x34/0x40 net/ipv4/tcp_output.c:3691 tcp_cleanup_rbuf+0x130/0x360 net/ipv4/tcp.c:1575 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 7191 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: cc16921351d8 ("netfilter: conntrack: avoid same-timeout update") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jozsef Kadlecsik Cc: Florian Westphal Acked-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- net/netfilter/nf_conntrack_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0c63120b2db2..5cd610b547e0 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1792,8 +1792,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (nf_ct_is_confirmed(ct)) extra_jiffies += nfct_time_stamp; - if (ct->timeout != extra_jiffies) - ct->timeout = extra_jiffies; + if (READ_ONCE(ct->timeout) != extra_jiffies) + WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); From 4ffdd22e49f47db543906d75453a0048a53071ab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 09:20:02 -0700 Subject: [PATCH 2062/2880] tun: remove possible false sharing in tun_flow_update() As mentioned in https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE#it-may-improve-performance a C compiler can legally transform if (e->queue_index != queue_index) e->queue_index = queue_index; to : e->queue_index = queue_index; Note that the code using jiffies has no issue, since jiffies has volatile attribute. if (e->updated != jiffies) e->updated = jiffies; Fixes: 83b1bc122cab ("tun: align write-heavy flow entry members to a cache line") Signed-off-by: Eric Dumazet Cc: Zhang Yu Cc: Wang Li Cc: Li RongQing Acked-by: Jason Wang Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 812dc3a65efb..a8d3141582a5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -526,8 +526,8 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash, e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? */ - if (e->queue_index != queue_index) - e->queue_index = queue_index; + if (READ_ONCE(e->queue_index) != queue_index) + WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies) e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); From 503978aca46124cd714703e180b9c8292ba50ba7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 12:55:53 -0700 Subject: [PATCH 2063/2880] net: avoid possible false sharing in sk_leave_memory_pressure() As mentioned in https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE#it-may-improve-performance a C compiler can legally transform : if (memory_pressure && *memory_pressure) *memory_pressure = 0; to : if (memory_pressure) *memory_pressure = 0; Fixes: 0604475119de ("tcp: add TCPMemoryPressuresChrono counter") Fixes: 180d8cd942ce ("foundations of per-cgroup memory pressure controlling.") Fixes: 3ab224be6d69 ("[NET] CORE: Introducing new memory accounting interface.") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index fac2b4d80de5..50647a10fdb7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2334,8 +2334,8 @@ static void sk_leave_memory_pressure(struct sock *sk) } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; - if (memory_pressure && *memory_pressure) - *memory_pressure = 0; + if (memory_pressure && READ_ONCE(*memory_pressure)) + WRITE_ONCE(*memory_pressure, 0); } } From 60b173ca3d1cd1782bd0096dc17298ec242f6fb1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 14:51:20 -0700 Subject: [PATCH 2064/2880] net: add {READ|WRITE}_ONCE() annotations on ->rskq_accept_head reqsk_queue_empty() is called from inet_csk_listen_poll() while other cpus might write ->rskq_accept_head value. Use {READ|WRITE}_ONCE() to avoid compiler tricks and potential KCSAN splats. Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- drivers/xen/pvcalls-back.c | 2 +- include/net/request_sock.h | 4 ++-- net/ipv4/inet_connection_sock.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index 69a626b0e594..c57c71b7d53d 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -775,7 +775,7 @@ static int pvcalls_back_poll(struct xenbus_device *dev, mappass->reqcopy = *req; icsk = inet_csk(mappass->sock->sk); queue = &icsk->icsk_accept_queue; - data = queue->rskq_accept_head != NULL; + data = READ_ONCE(queue->rskq_accept_head) != NULL; if (data) { mappass->reqcopy.cmd = 0; ret = 0; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index fd178d58fa84..cf8b33213bbc 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -185,7 +185,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, static inline bool reqsk_queue_empty(const struct request_sock_queue *queue) { - return queue->rskq_accept_head == NULL; + return READ_ONCE(queue->rskq_accept_head) == NULL; } static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue, @@ -197,7 +197,7 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue req = queue->rskq_accept_head; if (req) { sk_acceptq_removed(parent); - queue->rskq_accept_head = req->dl_next; + WRITE_ONCE(queue->rskq_accept_head, req->dl_next); if (queue->rskq_accept_head == NULL) queue->rskq_accept_tail = NULL; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a9183543ca30..dbcf34ec8dd2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -934,7 +934,7 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) - queue->rskq_accept_head = req; + WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; From 1f142c17d19a5618d5a633195a46f2c8be9bf232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:10:15 -0700 Subject: [PATCH 2065/2880] tcp: annotate lockless access to tcp_memory_pressure tcp_memory_pressure is read without holding any lock, and its value could be changed on other cpus. Use READ_ONCE() to annotate these lockless reads. The write side is already using atomic ops. Fixes: b8da51ebb1aa ("tcp: introduce tcp_under_memory_pressure()") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index c9a3f9688223..88e63d64c698 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -258,7 +258,7 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk) mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; - return tcp_memory_pressure; + return READ_ONCE(tcp_memory_pressure); } /* * The next routines deal with comparing 32 bit unsigned ints diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f98a1882e537..888c92b63f5a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -326,7 +326,7 @@ void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; - if (tcp_memory_pressure) + if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; @@ -341,7 +341,7 @@ void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; - if (!tcp_memory_pressure) + if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) From 8265792bf8871acc2d00fd03883d830e2249d395 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:21:13 -0700 Subject: [PATCH 2066/2880] net: silence KCSAN warnings around sk_add_backlog() calls sk_add_backlog() callers usually read sk->sk_rcvbuf without owning the socket lock. This means sk_rcvbuf value can be changed by other cpus, and KCSAN complains. Add READ_ONCE() annotations to document the lockless nature of these reads. Note that writes over sk_rcvbuf should also use WRITE_ONCE(), but this will be done in separate patches to ease stable backports (if we decide this is relevant for stable trees). BUG: KCSAN: data-race in tcp_add_backlog / tcp_recvmsg write to 0xffff88812ab369f8 of 8 bytes by interrupt on cpu 1: __sk_add_backlog include/net/sock.h:902 [inline] sk_add_backlog include/net/sock.h:933 [inline] tcp_add_backlog+0x45a/0xcc0 net/ipv4/tcp_ipv4.c:1737 tcp_v4_rcv+0x1aba/0x1bf0 net/ipv4/tcp_ipv4.c:1925 ip_protocol_deliver_rcu+0x51/0x470 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6352 [inline] net_rx_action+0x3ae/0xa50 net/core/dev.c:6418 read to 0xffff88812ab369f8 of 8 bytes by task 7271 on cpu 0: tcp_recvmsg+0x470/0x1a30 net/ipv4/tcp.c:2047 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1864 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 __vfs_read+0xb1/0xc0 fs/read_write.c:427 vfs_read fs/read_write.c:461 [inline] vfs_read+0x143/0x2c0 fs/read_write.c:446 ksys_read+0xd5/0x1b0 fs/read_write.c:587 __do_sys_read fs/read_write.c:597 [inline] __se_sys_read fs/read_write.c:595 [inline] __x64_sys_read+0x4c/0x60 fs/read_write.c:595 do_syscall_64+0xcf/0x2f0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 7271 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Jakub Kicinski --- net/core/sock.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/llc/llc_conn.c | 2 +- net/sctp/input.c | 6 +++--- net/tipc/socket.c | 6 +++--- net/x25/x25_dev.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 50647a10fdb7..1cf06934da50 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -522,7 +522,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { + } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bf124b1742df..492bf6a6b023 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1644,7 +1644,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { - u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; + u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index a79b739eb223..7b620acaca9e 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -813,7 +813,7 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) else { dprintk("%s: adding to backlog...\n", __func__); llc_set_backlog_type(skb, LLC_PACKET); - if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) goto drop_unlock; } out: diff --git a/net/sctp/input.c b/net/sctp/input.c index f2771375bfc0..2277981559d0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -322,7 +322,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) bh_lock_sock(sk); if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { - if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) sctp_chunk_free(chunk); else backloged = 1; @@ -337,7 +337,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) return 0; } else { if (!sctp_newsk_ready(sk)) { - if (!sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) return 0; sctp_chunk_free(chunk); } else { @@ -364,7 +364,7 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) struct sctp_ep_common *rcvr = chunk->rcvr; int ret; - ret = sk_add_backlog(sk, skb, sk->sk_rcvbuf); + ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); if (!ret) { /* Hold the assoc/ep while hanging on the backlog queue. * This way, we know structures we need will not disappear diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3b9f8cc328f5..7c736cfec57f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2119,13 +2119,13 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) struct tipc_msg *hdr = buf_msg(skb); if (unlikely(msg_in_group(hdr))) - return sk->sk_rcvbuf; + return READ_ONCE(sk->sk_rcvbuf); if (unlikely(!msg_connected(hdr))) - return sk->sk_rcvbuf << msg_importance(hdr); + return READ_ONCE(sk->sk_rcvbuf) << msg_importance(hdr); if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) - return sk->sk_rcvbuf; + return READ_ONCE(sk->sk_rcvbuf); return FLOWCTL_MSG_LIM; } diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 5c111bc3c8ea..00e782335cb0 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -55,7 +55,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) if (!sock_owned_by_user(sk)) { queued = x25_process_rx_frame(sk, skb); } else { - queued = !sk_add_backlog(sk, skb, sk->sk_rcvbuf); + queued = !sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); } bh_unlock_sock(sk); sock_put(sk); From eac66402d1c342f07ff38f8d631ff95eb7ad3220 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:32:35 -0700 Subject: [PATCH 2067/2880] net: annotate sk->sk_rcvlowat lockless reads sock_rcvlowat() or int_sk_rcvlowat() might be called without the socket lock for example from tcp_poll(). Use READ_ONCE() to document the fact that other cpus might change sk->sk_rcvlowat under us and avoid KCSAN splats. Use WRITE_ONCE() on write sides too. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 +++- net/core/filter.c | 2 +- net/core/sock.c | 2 +- net/ipv4/tcp.c | 2 +- net/sched/em_meta.c | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 2c53f1a1d905..79f54e1f8827 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2271,7 +2271,9 @@ static inline long sock_sndtimeo(const struct sock *sk, bool noblock) static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { - return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1; + int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); + + return v ?: 1; } /* Alas, with timeout socket operations are not restartable. diff --git a/net/core/filter.c b/net/core/filter.c index ed6563622ce3..a50c0b6846f2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4274,7 +4274,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_MARK: if (sk->sk_mark != val) { diff --git a/net/core/sock.c b/net/core/sock.c index 1cf06934da50..b7c5c6ea51ba 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -974,7 +974,7 @@ set_rcvbuf: if (sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_RCVTIMEO_OLD: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 888c92b63f5a..8781a92ea4b6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1699,7 +1699,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) else cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; val = min(val, cap); - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 82bd14e7ac93..4c9122fc35c9 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -554,7 +554,7 @@ META_COLLECTOR(int_sk_rcvlowat) *err = -1; return; } - dst->value = sk->sk_rcvlowat; + dst->value = READ_ONCE(sk->sk_rcvlowat); } META_COLLECTOR(int_sk_rcvtimeo) From 70c2655849a25431f31b505a07fe0c861e5e41fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:41:03 -0700 Subject: [PATCH 2068/2880] net: silence KCSAN warnings about sk->sk_backlog.len reads sk->sk_backlog.len can be written by BH handlers, and read from process contexts in a lockless way. Note the write side should also use WRITE_ONCE() or a variant. We need some agreement about the best way to do this. syzbot reported : BUG: KCSAN: data-race in tcp_add_backlog / tcp_grow_window.isra.0 write to 0xffff88812665f32c of 4 bytes by interrupt on cpu 1: sk_add_backlog include/net/sock.h:934 [inline] tcp_add_backlog+0x4a0/0xcc0 net/ipv4/tcp_ipv4.c:1737 tcp_v4_rcv+0x1aba/0x1bf0 net/ipv4/tcp_ipv4.c:1925 ip_protocol_deliver_rcu+0x51/0x470 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6352 [inline] net_rx_action+0x3ae/0xa50 net/core/dev.c:6418 read to 0xffff88812665f32c of 4 bytes by task 7292 on cpu 0: tcp_space include/net/tcp.h:1373 [inline] tcp_grow_window.isra.0+0x6b/0x480 net/ipv4/tcp_input.c:413 tcp_event_data_recv+0x68f/0x990 net/ipv4/tcp_input.c:717 tcp_rcv_established+0xbfe/0xf50 net/ipv4/tcp_input.c:5618 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1542 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2427 release_sock+0x61/0x160 net/core/sock.c:2943 tcp_recvmsg+0x63b/0x1a30 net/ipv4/tcp.c:2181 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1864 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 __vfs_read+0xb1/0xc0 fs/read_write.c:427 vfs_read fs/read_write.c:461 [inline] vfs_read+0x143/0x2c0 fs/read_write.c:446 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 7292 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 3 ++- net/core/sock.c | 2 +- net/sctp/diag.c | 2 +- net/tipc/socket.c | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 88e63d64c698..35f6f7e0fdc2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1380,7 +1380,8 @@ static inline int tcp_win_from_space(const struct sock *sk, int space) /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { - return tcp_win_from_space(sk, sk->sk_rcvbuf - sk->sk_backlog.len - + return tcp_win_from_space(sk, sk->sk_rcvbuf - + READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } diff --git a/net/core/sock.c b/net/core/sock.c index b7c5c6ea51ba..2a053999df11 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3210,7 +3210,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index fc9a4c6629ce..0851166b9175 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -175,7 +175,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7c736cfec57f..f8bbc4aab213 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3790,7 +3790,7 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) i += scnprintf(buf + i, sz - i, " %d", sk->sk_sndbuf); i += scnprintf(buf + i, sz - i, " | %d", sk_rmem_alloc_get(sk)); i += scnprintf(buf + i, sz - i, " %d", sk->sk_rcvbuf); - i += scnprintf(buf + i, sz - i, " | %d\n", sk->sk_backlog.len); + i += scnprintf(buf + i, sz - i, " | %d\n", READ_ONCE(sk->sk_backlog.len)); if (dqueues & TIPC_DUMP_SK_SNDQ) { i += scnprintf(buf + i, sz - i, "sk_write_queue: "); From 05668e1d74b84c53fbe0f28565e4c9502a6b8a67 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Mon, 30 Sep 2019 17:38:02 +0200 Subject: [PATCH 2069/2880] s390/cio: fix virtio-ccw DMA without PV Commit 37db8985b211 ("s390/cio: add basic protected virtualization support") breaks virtio-ccw devices with VIRTIO_F_IOMMU_PLATFORM for non Protected Virtualization (PV) guests. The problem is that the dma_mask of the ccw device, which is used by virtio core, gets changed from 64 to 31 bit, because some of the DMA allocations do require 31 bit addressable memory. For PV the only drawback is that some of the virtio structures must end up in ZONE_DMA because we have the bounce the buffers mapped via DMA API anyway. But for non PV guests we have a problem: because of the 31 bit mask guests bigger than 2G are likely to try bouncing buffers. The swiotlb however is only initialized for PV guests, because we don't want to bounce anything for non PV guests. The first such map kills the guest. Since the DMA API won't allow us to specify for each allocation whether we need memory from ZONE_DMA (31 bit addressable) or any DMA capable memory will do, let us use coherent_dma_mask (which is used for allocations) to force allocating form ZONE_DMA while changing dma_mask to DMA_BIT_MASK(64) so that at least the streaming API will regard the whole memory DMA capable. Signed-off-by: Halil Pasic Reported-by: Marc Hartmayer Suggested-by: Robin Murphy Fixes: 37db8985b211 ("s390/cio: add basic protected virtualization support") Link: https://lore.kernel.org/lkml/20190930153803.7958-1-pasic@linux.ibm.com Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- drivers/s390/cio/cio.h | 1 + drivers/s390/cio/css.c | 7 ++++++- drivers/s390/cio/device.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index ba7d2480613b..dcdaba689b20 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -113,6 +113,7 @@ struct subchannel { enum sch_todo todo; struct work_struct todo_work; struct schib_config config; + u64 dma_mask; char *driver_override; /* Driver name to force a match */ } __attribute__ ((aligned(8))); diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 1fbfb0a93f5f..831850435c23 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -232,7 +232,12 @@ struct subchannel *css_alloc_subchannel(struct subchannel_id schid, * belong to a subchannel need to fit 31 bit width (e.g. ccw). */ sch->dev.coherent_dma_mask = DMA_BIT_MASK(31); - sch->dev.dma_mask = &sch->dev.coherent_dma_mask; + /* + * But we don't have such restrictions imposed on the stuff that + * is handled by the streaming API. + */ + sch->dma_mask = DMA_BIT_MASK(64); + sch->dev.dma_mask = &sch->dma_mask; return sch; err: diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 131430bd48d9..0c6245fc7706 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -710,7 +710,7 @@ static struct ccw_device * io_subchannel_allocate_dev(struct subchannel *sch) if (!cdev->private) goto err_priv; cdev->dev.coherent_dma_mask = sch->dev.coherent_dma_mask; - cdev->dev.dma_mask = &cdev->dev.coherent_dma_mask; + cdev->dev.dma_mask = sch->dev.dma_mask; dma_pool = cio_gp_dma_create(&cdev->dev, 1); if (!dma_pool) goto err_dma_pool; From 2189624b3c5a6fb03416129e35c09aa5151b7392 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 26 Sep 2019 11:08:57 -0500 Subject: [PATCH 2070/2880] ACPI: PM: Drop Dell XPS13 9360 from LPS0 Idle _DSM blacklist This reverts part of commit 71630b7a832f ("ACPI / PM: Blacklist Low Power S0 Idle _DSM for Dell XPS13 9360") to remove the S0ix blacklist for the XPS 9360. The problems with this system occurred in one possible NVME SSD when putting system into s0ix. As the NVME sleep behavior has been adjusted in commit d916b1be94b6 ("nvme-pci: use host managed power state for suspend") this is expected to be now resolved. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=196907 Signed-off-by: Mario Limonciello Tested-by: Paul Menzel Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 9fa77d72ef27..2af937a8b1c5 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -361,19 +361,6 @@ static const struct dmi_system_id acpisleep_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "80E3"), }, }, - /* - * https://bugzilla.kernel.org/show_bug.cgi?id=196907 - * Some Dell XPS13 9360 cannot do suspend-to-idle using the Low Power - * S0 Idle firmware interface. - */ - { - .callback = init_default_s3, - .ident = "Dell XPS13 9360", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "XPS 13 9360"), - }, - }, /* * ThinkPad X1 Tablet(2016) cannot do suspend-to-idle using * the Low Power S0 Idle firmware interface (see From 65650b35133ff20f0c9ef0abd5c3c66dbce3ae57 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 9 Oct 2019 01:29:10 +0200 Subject: [PATCH 2071/2880] cpufreq: Avoid cpufreq_suspend() deadlock on system shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is incorrect to set the cpufreq syscore shutdown callback pointer to cpufreq_suspend(), because that function cannot be run in the syscore stage of system shutdown for two reasons: (a) it may attempt to carry out actions depending on devices that have already been shut down at that point and (b) the RCU synchronization carried out by it may not be able to make progress then. The latter issue has been present since commit 45975c7d21a1 ("rcu: Define RCU-sched API in terms of RCU for Tree RCU PREEMPT builds"), but the former one has been there since commit 90de2a4aa9f3 ("cpufreq: suspend cpufreq governors on shutdown") regardless. Fix that by dropping cpufreq_syscore_ops altogether and making device_shutdown() call cpufreq_suspend() directly before shutting down devices, which is along the lines of what system-wide power management does. Fixes: 45975c7d21a1 ("rcu: Define RCU-sched API in terms of RCU for Tree RCU PREEMPT builds") Fixes: 90de2a4aa9f3 ("cpufreq: suspend cpufreq governors on shutdown") Reported-by: Ville Syrjälä Tested-by: Ville Syrjälä Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Cc: 4.0+ # 4.0+ --- drivers/base/core.c | 3 +++ drivers/cpufreq/cpufreq.c | 10 ---------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 2db62d98e395..7bd9cd366d41 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -3179,6 +3180,8 @@ void device_shutdown(void) wait_for_device_probe(); device_block_probing(); + cpufreq_suspend(); + spin_lock(&devices_kset->list_lock); /* * Walk the devices list backward, shutting down each in turn. diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c52d6fa32aac..bffc11b87247 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2737,14 +2737,6 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); -/* - * Stop cpufreq at shutdown to make sure it isn't holding any locks - * or mutexes when secondary CPUs are halted. - */ -static struct syscore_ops cpufreq_syscore_ops = { - .shutdown = cpufreq_suspend, -}; - struct kobject *cpufreq_global_kobject; EXPORT_SYMBOL(cpufreq_global_kobject); @@ -2756,8 +2748,6 @@ static int __init cpufreq_core_init(void) cpufreq_global_kobject = kobject_create_and_add("cpufreq", &cpu_subsys.dev_root->kobj); BUG_ON(!cpufreq_global_kobject); - register_syscore_ops(&cpufreq_syscore_ops); - return 0; } module_param(off, int, 0444); From f49249d58abdc12067d69f15d509487171148b30 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 8 Oct 2019 11:46:46 +0100 Subject: [PATCH 2072/2880] PM: sleep: include for pm_wq Include the for the definition of pm_wq to avoid the following warning: kernel/power/main.c:890:25: warning: symbol 'pm_wq' was not declared. Should it be static? Signed-off-by: Ben Dooks Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/main.c b/kernel/power/main.c index e8710d179b35..e26de7af520b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "power.h" From fd70c7755bf0172ddd33b558aef69c322de3b5cf Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Tue, 24 Sep 2019 16:17:02 +0300 Subject: [PATCH 2073/2880] drm/bridge: tc358767: fix max_tu_symbol value max_tu_symbol was programmed to TU_SIZE_RECOMMENDED - 1, which is not what the spec says. The spec says: roundup ((input active video bandwidth in bytes/output active video bandwidth in bytes) * tu_size) It is not quite clear what the above means, but calculating max_tu_symbol = (input Bps / output Bps) * tu_size seems to work and fixes the issues seen. This fixes artifacts in some videomodes (e.g. 1024x768@60 on 2-lanes & 1.62Gbps was pretty bad for me). Signed-off-by: Tomi Valkeinen Tested-by: Jyri Sarha Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/20190924131702.9988-1-tomi.valkeinen@ti.com --- drivers/gpu/drm/bridge/tc358767.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index cebc8e620820..8a8d605021f0 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -728,6 +728,8 @@ static int tc_set_video_mode(struct tc_data *tc, int lower_margin = mode->vsync_start - mode->vdisplay; int vsync_len = mode->vsync_end - mode->vsync_start; u32 dp0_syncval; + u32 bits_per_pixel = 24; + u32 in_bw, out_bw; /* * Recommended maximum number of symbols transferred in a transfer unit: @@ -735,7 +737,10 @@ static int tc_set_video_mode(struct tc_data *tc, * (output active video bandwidth in bytes)) * Must be less than tu_size. */ - max_tu_symbol = TU_SIZE_RECOMMENDED - 1; + + in_bw = mode->clock * bits_per_pixel / 8; + out_bw = tc->link.base.num_lanes * tc->link.base.rate; + max_tu_symbol = DIV_ROUND_UP(in_bw * TU_SIZE_RECOMMENDED, out_bw); dev_dbg(tc->dev, "set mode %dx%d\n", mode->hdisplay, mode->vdisplay); From bed5ef230943863b9abf5eae226a20fad9a8ff71 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 19:09:42 +0200 Subject: [PATCH 2074/2880] USB: usb-skeleton: fix NULL-deref on disconnect The driver was using its struct usb_interface pointer as an inverted disconnected flag and was setting it to NULL before making sure all completion handlers had run. This could lead to NULL-pointer dereferences in the dev_err() statements in the completion handlers which relies on said pointer. Fix this by using a dedicated disconnected flag. Note that this is also addresses a NULL-pointer dereference at release() and a struct usb_interface reference leak introduced by a recent runtime PM fix, which depends on and should have been submitted together with this patch. Fixes: 4212cd74ca6f ("USB: usb-skeleton.c: remove err() usage") Fixes: 5c290a5e42c3 ("USB: usb-skeleton: fix runtime PM after driver unbind") Cc: stable Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009170944.30057-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usb-skeleton.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/usb-skeleton.c b/drivers/usb/usb-skeleton.c index 8001d6384c73..c2843fcfa52d 100644 --- a/drivers/usb/usb-skeleton.c +++ b/drivers/usb/usb-skeleton.c @@ -61,6 +61,7 @@ struct usb_skel { spinlock_t err_lock; /* lock for errors */ struct kref kref; struct mutex io_mutex; /* synchronize I/O with disconnect */ + unsigned long disconnected:1; wait_queue_head_t bulk_in_wait; /* to wait for an ongoing read */ }; #define to_skel_dev(d) container_of(d, struct usb_skel, kref) @@ -238,7 +239,7 @@ static ssize_t skel_read(struct file *file, char *buffer, size_t count, if (rv < 0) return rv; - if (!dev->interface) { /* disconnect() was called */ + if (dev->disconnected) { /* disconnect() was called */ rv = -ENODEV; goto exit; } @@ -420,7 +421,7 @@ static ssize_t skel_write(struct file *file, const char *user_buffer, /* this lock makes sure we don't submit URBs to gone devices */ mutex_lock(&dev->io_mutex); - if (!dev->interface) { /* disconnect() was called */ + if (dev->disconnected) { /* disconnect() was called */ mutex_unlock(&dev->io_mutex); retval = -ENODEV; goto error; @@ -571,7 +572,7 @@ static void skel_disconnect(struct usb_interface *interface) /* prevent more I/O from starting */ mutex_lock(&dev->io_mutex); - dev->interface = NULL; + dev->disconnected = 1; mutex_unlock(&dev->io_mutex); usb_kill_anchored_urbs(&dev->submitted); From 6353001852776e7eeaab4da78922d4c6f2b076af Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 19:09:43 +0200 Subject: [PATCH 2075/2880] USB: usb-skeleton: fix use-after-free after driver unbind The driver failed to stop its read URB on disconnect, something which could lead to a use-after-free in the completion handler after driver unbind in case the character device has been closed. Fixes: e7389cc9a7ff ("USB: skel_read really sucks royally") Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009170944.30057-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usb-skeleton.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/usb-skeleton.c b/drivers/usb/usb-skeleton.c index c2843fcfa52d..be311787403e 100644 --- a/drivers/usb/usb-skeleton.c +++ b/drivers/usb/usb-skeleton.c @@ -575,6 +575,7 @@ static void skel_disconnect(struct usb_interface *interface) dev->disconnected = 1; mutex_unlock(&dev->io_mutex); + usb_kill_urb(dev->bulk_in_urb); usb_kill_anchored_urbs(&dev->submitted); /* decrement our usage count */ From 369dca424a3f9ef80bc4e5a5ccdd41a17ce306c1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 19:09:44 +0200 Subject: [PATCH 2076/2880] USB: usb-skeleton: drop redundant in-urb check The driver bails out at probe if we can't find a bulk-in endpoint or if we fail to allocate the URB, so drop the check in read(). Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009170944.30057-4-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usb-skeleton.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/usb-skeleton.c b/drivers/usb/usb-skeleton.c index be311787403e..2dc58766273a 100644 --- a/drivers/usb/usb-skeleton.c +++ b/drivers/usb/usb-skeleton.c @@ -230,8 +230,7 @@ static ssize_t skel_read(struct file *file, char *buffer, size_t count, dev = file->private_data; - /* if we cannot read at all, return EOF */ - if (!dev->bulk_in_urb || !count) + if (!count) return 0; /* no concurrent readers */ From ac9099e10a603f02f52f583fba78dd54c52b5542 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 7 Oct 2019 15:16:01 +0300 Subject: [PATCH 2077/2880] usb: cdns3: gadget: Fix full-speed mode We need to disable USB3 PHY for full-speed mode else gadget mode is broken. Signed-off-by: Roger Quadros Signed-off-by: Sekhar Nori Reviewed-by: Peter Chen Link: https://lore.kernel.org/r/20191007121601.25996-3-rogerq@ti.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/gadget.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/cdns3/gadget.c b/drivers/usb/cdns3/gadget.c index 228cdc4ab886..157536753b8c 100644 --- a/drivers/usb/cdns3/gadget.c +++ b/drivers/usb/cdns3/gadget.c @@ -2571,6 +2571,7 @@ static int cdns3_gadget_start(struct cdns3 *cdns) switch (max_speed) { case USB_SPEED_FULL: writel(USB_CONF_SFORCE_FS, &priv_dev->regs->usb_conf); + writel(USB_CONF_USB3DIS, &priv_dev->regs->usb_conf); break; case USB_SPEED_HIGH: writel(USB_CONF_USB3DIS, &priv_dev->regs->usb_conf); From 02ffc26df96b487f476b8945eaef13c7f170e79a Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 7 Oct 2019 15:16:00 +0300 Subject: [PATCH 2078/2880] usb: cdns3: fix cdns3_core_init_role() At startup we should trigger the HW state machine only if it is OTG mode. Otherwise we should just start the respective role. Initialize idle role by default. If we don't do this then cdns3_idle_role_stop() is not called when switching to host/device role and so lane switch mechanism doesn't work. This results to super-speed device not working in one orientation if it was plugged before driver probe. Signed-off-by: Roger Quadros Signed-off-by: Sekhar Nori Link: https://lore.kernel.org/r/20191007121601.25996-2-rogerq@ti.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/core.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/usb/cdns3/core.c b/drivers/usb/cdns3/core.c index 06f1e105be4e..1109dc5a4c39 100644 --- a/drivers/usb/cdns3/core.c +++ b/drivers/usb/cdns3/core.c @@ -160,10 +160,28 @@ static int cdns3_core_init_role(struct cdns3 *cdns) if (ret) goto err; - if (cdns->dr_mode != USB_DR_MODE_OTG) { + /* Initialize idle role to start with */ + ret = cdns3_role_start(cdns, USB_ROLE_NONE); + if (ret) + goto err; + + switch (cdns->dr_mode) { + case USB_DR_MODE_UNKNOWN: + case USB_DR_MODE_OTG: ret = cdns3_hw_role_switch(cdns); if (ret) goto err; + break; + case USB_DR_MODE_PERIPHERAL: + ret = cdns3_role_start(cdns, USB_ROLE_DEVICE); + if (ret) + goto err; + break; + case USB_DR_MODE_HOST: + ret = cdns3_role_start(cdns, USB_ROLE_HOST); + if (ret) + goto err; + break; } return ret; From eb21a74adaa11846737f503dd618bd35337e2c37 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Mon, 7 Oct 2019 13:03:23 +0100 Subject: [PATCH 2079/2880] usb: cdns3: Fix for incorrect DMA mask. This patch restores the correct DMA mask after switching back to device mode. The issue occurred because Device part of controller use 32 bits DMA and Host side use 64 bits DMA. During loading XHCI driver the DMA mask used by driver is overwritten by XHCI driver so it must be restored to 32 bits. Reported-by: Pawel Laszczak Signed-off-by: Roger Quadros Signed-off-by: Pawel Laszczak Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver") Reviewed-by: Peter Chen Tested-by: Roger Quadros Link: https://lore.kernel.org/r/1570449803-15299-1-git-send-email-pawell@cadence.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/gadget.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/cdns3/gadget.c b/drivers/usb/cdns3/gadget.c index 157536753b8c..2ca280f4c054 100644 --- a/drivers/usb/cdns3/gadget.c +++ b/drivers/usb/cdns3/gadget.c @@ -2663,6 +2663,13 @@ static int __cdns3_gadget_init(struct cdns3 *cdns) { int ret = 0; + /* Ensure 32-bit DMA Mask in case we switched back from Host mode */ + ret = dma_set_mask_and_coherent(cdns->dev, DMA_BIT_MASK(32)); + if (ret) { + dev_err(cdns->dev, "Failed to set dma mask: %d\n", ret); + return ret; + } + cdns3_drd_switch_gadget(cdns, 1); pm_runtime_get_sync(cdns->dev); From 726b55d0e22ca72c69c947af87785c830289ddbc Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 17:38:47 +0200 Subject: [PATCH 2080/2880] USB: legousbtower: fix use-after-free on release The driver was accessing its struct usb_device in its release() callback without holding a reference. This would lead to a use-after-free whenever the device was disconnected while the character device was still open. Fixes: fef526cae700 ("USB: legousbtower: remove custom debug macro") Cc: stable # 3.12 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009153848.8664-5-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/legousbtower.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c index 44d6a3381804..9d4c52a7ebe0 100644 --- a/drivers/usb/misc/legousbtower.c +++ b/drivers/usb/misc/legousbtower.c @@ -296,6 +296,7 @@ static inline void tower_delete (struct lego_usb_tower *dev) kfree (dev->read_buffer); kfree (dev->interrupt_in_buffer); kfree (dev->interrupt_out_buffer); + usb_put_dev(dev->udev); kfree (dev); } @@ -810,7 +811,7 @@ static int tower_probe (struct usb_interface *interface, const struct usb_device mutex_init(&dev->lock); - dev->udev = udev; + dev->udev = usb_get_dev(udev); dev->open_count = 0; dev->disconnected = 0; From 58ecf131e74620305175a7aa103f81350bb37570 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 17:38:46 +0200 Subject: [PATCH 2081/2880] USB: ldusb: fix NULL-derefs on driver unbind The driver was using its struct usb_interface pointer as an inverted disconnected flag, but was setting it to NULL before making sure all completion handlers had run. This could lead to a NULL-pointer dereference in a number of dev_dbg, dev_warn and dev_err statements in the completion handlers which relies on said pointer. Fix this by unconditionally stopping all I/O and preventing resubmissions by poisoning the interrupt URBs at disconnect and using a dedicated disconnected flag. This also makes sure that all I/O has completed by the time the disconnect callback returns. Fixes: 2824bd250f0b ("[PATCH] USB: add ldusb driver") Cc: stable # 2.6.13 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009153848.8664-4-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/ldusb.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c index 6581774bdfa4..f3108d85e768 100644 --- a/drivers/usb/misc/ldusb.c +++ b/drivers/usb/misc/ldusb.c @@ -153,6 +153,7 @@ MODULE_PARM_DESC(min_interrupt_out_interval, "Minimum interrupt out interval in struct ld_usb { struct mutex mutex; /* locks this structure */ struct usb_interface *intf; /* save off the usb interface pointer */ + unsigned long disconnected:1; int open_count; /* number of times this port has been opened */ @@ -192,12 +193,10 @@ static void ld_usb_abort_transfers(struct ld_usb *dev) /* shutdown transfer */ if (dev->interrupt_in_running) { dev->interrupt_in_running = 0; - if (dev->intf) - usb_kill_urb(dev->interrupt_in_urb); + usb_kill_urb(dev->interrupt_in_urb); } if (dev->interrupt_out_busy) - if (dev->intf) - usb_kill_urb(dev->interrupt_out_urb); + usb_kill_urb(dev->interrupt_out_urb); } /** @@ -205,8 +204,6 @@ static void ld_usb_abort_transfers(struct ld_usb *dev) */ static void ld_usb_delete(struct ld_usb *dev) { - ld_usb_abort_transfers(dev); - /* free data structures */ usb_free_urb(dev->interrupt_in_urb); usb_free_urb(dev->interrupt_out_urb); @@ -263,7 +260,7 @@ static void ld_usb_interrupt_in_callback(struct urb *urb) resubmit: /* resubmit if we're still running */ - if (dev->interrupt_in_running && !dev->buffer_overflow && dev->intf) { + if (dev->interrupt_in_running && !dev->buffer_overflow) { retval = usb_submit_urb(dev->interrupt_in_urb, GFP_ATOMIC); if (retval) { dev_err(&dev->intf->dev, @@ -392,7 +389,7 @@ static int ld_usb_release(struct inode *inode, struct file *file) retval = -ENODEV; goto unlock_exit; } - if (dev->intf == NULL) { + if (dev->disconnected) { /* the device was unplugged before the file was released */ mutex_unlock(&dev->mutex); /* unlock here as ld_usb_delete frees dev */ @@ -423,7 +420,7 @@ static __poll_t ld_usb_poll(struct file *file, poll_table *wait) dev = file->private_data; - if (!dev->intf) + if (dev->disconnected) return EPOLLERR | EPOLLHUP; poll_wait(file, &dev->read_wait, wait); @@ -462,7 +459,7 @@ static ssize_t ld_usb_read(struct file *file, char __user *buffer, size_t count, } /* verify that the device wasn't unplugged */ - if (dev->intf == NULL) { + if (dev->disconnected) { retval = -ENODEV; printk(KERN_ERR "ldusb: No device or device unplugged %d\n", retval); goto unlock_exit; @@ -542,7 +539,7 @@ static ssize_t ld_usb_write(struct file *file, const char __user *buffer, } /* verify that the device wasn't unplugged */ - if (dev->intf == NULL) { + if (dev->disconnected) { retval = -ENODEV; printk(KERN_ERR "ldusb: No device or device unplugged %d\n", retval); goto unlock_exit; @@ -764,6 +761,9 @@ static void ld_usb_disconnect(struct usb_interface *intf) /* give back our minor */ usb_deregister_dev(intf, &ld_usb_class); + usb_poison_urb(dev->interrupt_in_urb); + usb_poison_urb(dev->interrupt_out_urb); + mutex_lock(&dev->mutex); /* if the device is not opened, then we clean up right now */ @@ -771,7 +771,7 @@ static void ld_usb_disconnect(struct usb_interface *intf) mutex_unlock(&dev->mutex); ld_usb_delete(dev); } else { - dev->intf = NULL; + dev->disconnected = 1; /* wake up pollers */ wake_up_interruptible_all(&dev->read_wait); wake_up_interruptible_all(&dev->write_wait); From 123a0f125fa3d2104043697baa62899d9e549272 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 17:38:44 +0200 Subject: [PATCH 2082/2880] USB: adutux: fix use-after-free on release The driver was accessing its struct usb_device in its release() callback without holding a reference. This would lead to a use-after-free whenever the device was disconnected while the character device was still open. Fixes: 66d4bc30d128 ("USB: adutux: remove custom debug macro") Cc: stable # 3.12 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009153848.8664-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/adutux.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/adutux.c b/drivers/usb/misc/adutux.c index f9efec719359..6f5edb9fc61e 100644 --- a/drivers/usb/misc/adutux.c +++ b/drivers/usb/misc/adutux.c @@ -149,6 +149,7 @@ static void adu_delete(struct adu_device *dev) kfree(dev->read_buffer_secondary); kfree(dev->interrupt_in_buffer); kfree(dev->interrupt_out_buffer); + usb_put_dev(dev->udev); kfree(dev); } @@ -664,7 +665,7 @@ static int adu_probe(struct usb_interface *interface, mutex_init(&dev->mtx); spin_lock_init(&dev->buflock); - dev->udev = udev; + dev->udev = usb_get_dev(udev); init_waitqueue_head(&dev->read_wait); init_waitqueue_head(&dev->write_wait); From 93ddb1f56ae102f14f9e46a9a9c8017faa970003 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 17:38:45 +0200 Subject: [PATCH 2083/2880] USB: chaoskey: fix use-after-free on release The driver was accessing its struct usb_interface in its release() callback without holding a reference. This would lead to a use-after-free whenever the device was disconnected while the character device was still open. Fixes: 66e3e591891d ("usb: Add driver for Altus Metrum ChaosKey device (v2)") Cc: stable # 4.1 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009153848.8664-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/chaoskey.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/misc/chaoskey.c b/drivers/usb/misc/chaoskey.c index cf5828ce927a..34e6cd6f40d3 100644 --- a/drivers/usb/misc/chaoskey.c +++ b/drivers/usb/misc/chaoskey.c @@ -98,6 +98,7 @@ static void chaoskey_free(struct chaoskey *dev) usb_free_urb(dev->urb); kfree(dev->name); kfree(dev->buf); + usb_put_intf(dev->interface); kfree(dev); } } @@ -145,6 +146,8 @@ static int chaoskey_probe(struct usb_interface *interface, if (dev == NULL) goto out; + dev->interface = usb_get_intf(interface); + dev->buf = kmalloc(size, GFP_KERNEL); if (dev->buf == NULL) @@ -174,8 +177,6 @@ static int chaoskey_probe(struct usb_interface *interface, goto out; } - dev->interface = interface; - dev->in_ep = in_ep; if (le16_to_cpu(udev->descriptor.idVendor) != ALEA_VENDOR_ID) From edc4746f253d907d048de680a621e121517f484b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 12:48:41 +0200 Subject: [PATCH 2084/2880] USB: iowarrior: fix use-after-free on disconnect A recent fix addressing a deadlock on disconnect introduced a new bug by moving the present flag out of the critical section protected by the driver-data mutex. This could lead to a racing release() freeing the driver data before disconnect() is done with it. Due to insufficient locking a related use-after-free could be triggered also before the above mentioned commit. Specifically, the driver needs to hold the driver-data mutex also while checking the opened flag at disconnect(). Fixes: c468a8aa790e ("usb: iowarrior: fix deadlock on disconnect") Fixes: 946b960d13c1 ("USB: add driver for iowarrior devices.") Cc: stable # 2.6.21 Reported-by: syzbot+0761012cebf7bdb38137@syzkaller.appspotmail.com Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009104846.5925-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index f5bed9f29e56..4fe1d3267b3c 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -866,8 +866,6 @@ static void iowarrior_disconnect(struct usb_interface *interface) dev = usb_get_intfdata(interface); mutex_lock(&iowarrior_open_disc_lock); usb_set_intfdata(interface, NULL); - /* prevent device read, write and ioctl */ - dev->present = 0; minor = dev->minor; mutex_unlock(&iowarrior_open_disc_lock); @@ -878,8 +876,7 @@ static void iowarrior_disconnect(struct usb_interface *interface) mutex_lock(&dev->mutex); /* prevent device read, write and ioctl */ - - mutex_unlock(&dev->mutex); + dev->present = 0; if (dev->opened) { /* There is a process that holds a filedescriptor to the device , @@ -889,8 +886,10 @@ static void iowarrior_disconnect(struct usb_interface *interface) usb_kill_urb(dev->int_in_urb); wake_up_interruptible(&dev->read_wait); wake_up_interruptible(&dev->write_wait); + mutex_unlock(&dev->mutex); } else { /* no process is using the device, cleanup now */ + mutex_unlock(&dev->mutex); iowarrior_delete(dev); } From 80cd5479b525093a56ef768553045741af61b250 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 12:48:42 +0200 Subject: [PATCH 2085/2880] USB: iowarrior: fix use-after-free on release The driver was accessing its struct usb_interface from its release() callback without holding a reference. This would lead to a use-after-free whenever debugging was enabled and the device was disconnected while its character device was open. Fixes: 549e83500b80 ("USB: iowarrior: Convert local dbg macro to dev_dbg") Cc: stable # 3.16 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009104846.5925-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 4fe1d3267b3c..6841267820c6 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -243,6 +243,7 @@ static inline void iowarrior_delete(struct iowarrior *dev) kfree(dev->int_in_buffer); usb_free_urb(dev->int_in_urb); kfree(dev->read_queue); + usb_put_intf(dev->interface); kfree(dev); } @@ -764,7 +765,7 @@ static int iowarrior_probe(struct usb_interface *interface, init_waitqueue_head(&dev->write_wait); dev->udev = udev; - dev->interface = interface; + dev->interface = usb_get_intf(interface); iface_desc = interface->cur_altsetting; dev->product_id = le16_to_cpu(udev->descriptor.idProduct); From b5f8d46867ca233d773408ffbe691a8062ed718f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 12:48:43 +0200 Subject: [PATCH 2086/2880] USB: iowarrior: fix use-after-free after driver unbind Make sure to stop also the asynchronous write URBs on disconnect() to avoid use-after-free in the completion handler after driver unbind. Fixes: 946b960d13c1 ("USB: add driver for iowarrior devices.") Cc: stable # 2.6.21: 51a2f077c44e ("USB: introduce usb_anchor") Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009104846.5925-4-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 6841267820c6..f405fa734bcc 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -87,6 +87,7 @@ struct iowarrior { char chip_serial[9]; /* the serial number string of the chip connected */ int report_size; /* number of bytes in a report */ u16 product_id; + struct usb_anchor submitted; }; /*--------------*/ @@ -425,11 +426,13 @@ static ssize_t iowarrior_write(struct file *file, retval = -EFAULT; goto error; } + usb_anchor_urb(int_out_urb, &dev->submitted); retval = usb_submit_urb(int_out_urb, GFP_KERNEL); if (retval) { dev_dbg(&dev->interface->dev, "submit error %d for urb nr.%d\n", retval, atomic_read(&dev->write_busy)); + usb_unanchor_urb(int_out_urb); goto error; } /* submit was ok */ @@ -770,6 +773,8 @@ static int iowarrior_probe(struct usb_interface *interface, iface_desc = interface->cur_altsetting; dev->product_id = le16_to_cpu(udev->descriptor.idProduct); + init_usb_anchor(&dev->submitted); + res = usb_find_last_int_in_endpoint(iface_desc, &dev->int_in_endpoint); if (res) { dev_err(&interface->dev, "no interrupt-in endpoint found\n"); @@ -885,6 +890,7 @@ static void iowarrior_disconnect(struct usb_interface *interface) Deleting the device is postponed until close() was called. */ usb_kill_urb(dev->int_in_urb); + usb_kill_anchored_urbs(&dev->submitted); wake_up_interruptible(&dev->read_wait); wake_up_interruptible(&dev->write_wait); mutex_unlock(&dev->mutex); From 7c5b971d623fdb40c03205e99f9ef68002b34726 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 12:48:44 +0200 Subject: [PATCH 2087/2880] USB: iowarrior: drop redundant disconnect mutex Drop the redundant disconnect mutex which was introduced after the open-disconnect race had been addressed generally in USB core by commit d4ead16f50f9 ("USB: prevent char device open/deregister race"). Specifically, the rw-semaphore in core guarantees that all calls to open() will have completed and that no new calls to open() will occur after usb_deregister_dev() returns. Hence there is no need use the driver data as an inverted disconnected flag. Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009104846.5925-5-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index f405fa734bcc..d844c2098e42 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -58,7 +58,6 @@ MODULE_LICENSE("GPL"); static DEFINE_MUTEX(iowarrior_mutex); static struct usb_driver iowarrior_driver; -static DEFINE_MUTEX(iowarrior_open_disc_lock); /*--------------*/ /* data */ @@ -601,16 +600,13 @@ static int iowarrior_open(struct inode *inode, struct file *file) return -ENODEV; } - mutex_lock(&iowarrior_open_disc_lock); dev = usb_get_intfdata(interface); if (!dev) { - mutex_unlock(&iowarrior_open_disc_lock); mutex_unlock(&iowarrior_mutex); return -ENODEV; } mutex_lock(&dev->mutex); - mutex_unlock(&iowarrior_open_disc_lock); /* Only one process can open each device, no sharing. */ if (dev->opened) { @@ -842,7 +838,6 @@ static int iowarrior_probe(struct usb_interface *interface, if (retval) { /* something prevented us from registering this driver */ dev_err(&interface->dev, "Not able to get a minor for this device.\n"); - usb_set_intfdata(interface, NULL); goto error; } @@ -866,16 +861,8 @@ error: */ static void iowarrior_disconnect(struct usb_interface *interface) { - struct iowarrior *dev; - int minor; - - dev = usb_get_intfdata(interface); - mutex_lock(&iowarrior_open_disc_lock); - usb_set_intfdata(interface, NULL); - - minor = dev->minor; - mutex_unlock(&iowarrior_open_disc_lock); - /* give back our minor - this will call close() locks need to be dropped at this point*/ + struct iowarrior *dev = usb_get_intfdata(interface); + int minor = dev->minor; usb_deregister_dev(interface, &iowarrior_class); From 8d33e828f72c216ae264ad39088a595d9a6cc95c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 12:48:45 +0200 Subject: [PATCH 2088/2880] USB: iowarrior: drop redundant iowarrior mutex Drop the redundant iowarrior mutex introduced by commit 925ce689bb31 ("USB: autoconvert trivial BKL users to private mutex") which replaced an earlier BKL use. The lock serialised calls to open() against other open() and ioctl(), but neither is needed. Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009104846.5925-6-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index d844c2098e42..ad29ef51e53f 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -54,9 +54,6 @@ MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); -/* Module parameters */ -static DEFINE_MUTEX(iowarrior_mutex); - static struct usb_driver iowarrior_driver; /*--------------*/ @@ -480,8 +477,6 @@ static long iowarrior_ioctl(struct file *file, unsigned int cmd, if (!buffer) return -ENOMEM; - /* lock this object */ - mutex_lock(&iowarrior_mutex); mutex_lock(&dev->mutex); /* verify that the device wasn't unplugged */ @@ -574,7 +569,6 @@ static long iowarrior_ioctl(struct file *file, unsigned int cmd, error_out: /* unlock the device */ mutex_unlock(&dev->mutex); - mutex_unlock(&iowarrior_mutex); kfree(buffer); return retval; } @@ -589,22 +583,18 @@ static int iowarrior_open(struct inode *inode, struct file *file) int subminor; int retval = 0; - mutex_lock(&iowarrior_mutex); subminor = iminor(inode); interface = usb_find_interface(&iowarrior_driver, subminor); if (!interface) { - mutex_unlock(&iowarrior_mutex); printk(KERN_ERR "%s - error, can't find device for minor %d\n", __func__, subminor); return -ENODEV; } dev = usb_get_intfdata(interface); - if (!dev) { - mutex_unlock(&iowarrior_mutex); + if (!dev) return -ENODEV; - } mutex_lock(&dev->mutex); @@ -628,7 +618,6 @@ static int iowarrior_open(struct inode *inode, struct file *file) out: mutex_unlock(&dev->mutex); - mutex_unlock(&iowarrior_mutex); return retval; } From ebb2fe57a51c630e0f852becbbdd295ad5d60514 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 12:48:46 +0200 Subject: [PATCH 2089/2880] USB: iowarrior: use pr_err() Replace the one remaining printk with pr_err(). Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009104846.5925-7-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index ad29ef51e53f..dce44fbf031f 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -587,7 +587,7 @@ static int iowarrior_open(struct inode *inode, struct file *file) interface = usb_find_interface(&iowarrior_driver, subminor); if (!interface) { - printk(KERN_ERR "%s - error, can't find device for minor %d\n", + pr_err("%s - error, can't find device for minor %d\n", __func__, subminor); return -ENODEV; } From f2edbb6699b0bc6e4f789846b99007200546c6c2 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 10 Oct 2019 15:55:33 +0530 Subject: [PATCH 2090/2880] opp: of: drop incorrect lockdep_assert_held() _find_opp_of_np() doesn't traverse the list of OPP tables but instead just the entries within an OPP table and so only requires to lock the OPP table itself. The lockdep_assert_held() was added there by mistake and isn't really required. Fixes: 5d6d106fa455 ("OPP: Populate required opp tables from "required-opps" property") Cc: v5.0+ # v5.0+ Reported-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/of.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 1813f5ad5fa2..6dc41faf74b5 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -77,8 +77,6 @@ static struct dev_pm_opp *_find_opp_of_np(struct opp_table *opp_table, { struct dev_pm_opp *opp; - lockdep_assert_held(&opp_table_lock); - mutex_lock(&opp_table->lock); list_for_each_entry(opp, &opp_table->opp_list, node) { From ff30283a8de4f978dad120936c1af507d01a6a98 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 8 Oct 2019 13:46:53 -0700 Subject: [PATCH 2091/2880] serial: fix kernel-doc warning in comments Fix Sphinx warning in serial_core.c: ../drivers/tty/serial/serial_core.c:1969: WARNING: Definition list ends without a blank line; unexpected unindent. Fixes: 73abaf87f01b ("serial: earlycon: Refactor parse_options into serial core") Signed-off-by: Randy Dunlap Cc: Peter Hurley Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/r/e989641c-224a-1090-e596-e7cc800bed44@infradead.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 6e713be1d4e9..c4a414a46c7f 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1964,8 +1964,10 @@ uart_get_console(struct uart_port *ports, int nr, struct console *co) * console=,io|mmio|mmio16|mmio32|mmio32be|mmio32native,, * * The optional form + * * earlycon=,0x, * console=,0x, + * * is also accepted; the returned @iotype will be UPIO_MEM. * * Returns 0 on success or -EINVAL on failure From 31a8d8fa84c51d3ab00bf059158d5de6178cf890 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Wed, 9 Oct 2019 17:49:19 +0800 Subject: [PATCH 2092/2880] tty: serial: imx: Use platform_get_irq_optional() for optional IRQs All i.MX SoCs except i.MX1 have ONLY one necessary IRQ, use platform_get_irq_optional() to get second/third IRQ which are optional to avoid below error message during probe: [ 0.726219] imx-uart 30860000.serial: IRQ index 1 not found [ 0.731329] imx-uart 30860000.serial: IRQ index 2 not found Fixes: 7723f4c5ecdb8d83 ("driver core: platform: Add an error message to platform_get_irq*()") Signed-off-by: Anson Huang Link: https://lore.kernel.org/r/1570614559-11900-1-git-send-email-Anson.Huang@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 87c58f9f6390..5e08f2657b90 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -2222,8 +2222,8 @@ static int imx_uart_probe(struct platform_device *pdev) return PTR_ERR(base); rxirq = platform_get_irq(pdev, 0); - txirq = platform_get_irq(pdev, 1); - rtsirq = platform_get_irq(pdev, 2); + txirq = platform_get_irq_optional(pdev, 1); + rtsirq = platform_get_irq_optional(pdev, 2); sport->port.dev = &pdev->dev; sport->port.mapbase = res->start; From 18380f52dbac230032a16545e67a6d90b548cf18 Mon Sep 17 00:00:00 2001 From: yu kuai Date: Sun, 29 Sep 2019 21:42:49 +0800 Subject: [PATCH 2093/2880] platform/x86: classmate-laptop: remove unused variable Fixes gcc '-Wunused-but-set-variable' warning: drivers/platform/x86/classmate-laptop.c: In function cmpc_accel_remove_v4: drivers/platform/x86/classmate-laptop.c:424:21: warning: variable accel set but not used [-Wunused-but-set-variable] drivers/platform/x86/classmate-laptop.c: In function cmpc_accel_remove: drivers/platform/x86/classmate-laptop.c:660:21: warning: variable accel set but not used [-Wunused-but-set-variable] In function cmpc_accel_remove_v4 and cmpc_accel_remove, variable accel is set but not used, so it can be removed. In that case, variable inputdev is set but not used and can be removed. Fixes: 7125587df4e8 ("classmate-laptop: Add support for Classmate V4 accelerometer.") Reported-by: Hulk Robot Signed-off-by: yu kuai Signed-off-by: Andy Shevchenko --- drivers/platform/x86/classmate-laptop.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/platform/x86/classmate-laptop.c b/drivers/platform/x86/classmate-laptop.c index 86cc2cc68fb5..af063f690846 100644 --- a/drivers/platform/x86/classmate-laptop.c +++ b/drivers/platform/x86/classmate-laptop.c @@ -420,12 +420,6 @@ failed_sensitivity: static int cmpc_accel_remove_v4(struct acpi_device *acpi) { - struct input_dev *inputdev; - struct cmpc_accel *accel; - - inputdev = dev_get_drvdata(&acpi->dev); - accel = dev_get_drvdata(&inputdev->dev); - device_remove_file(&acpi->dev, &cmpc_accel_sensitivity_attr_v4); device_remove_file(&acpi->dev, &cmpc_accel_g_select_attr_v4); return cmpc_remove_acpi_notify_device(acpi); @@ -656,12 +650,6 @@ failed_file: static int cmpc_accel_remove(struct acpi_device *acpi) { - struct input_dev *inputdev; - struct cmpc_accel *accel; - - inputdev = dev_get_drvdata(&acpi->dev); - accel = dev_get_drvdata(&inputdev->dev); - device_remove_file(&acpi->dev, &cmpc_accel_sensitivity_attr); return cmpc_remove_acpi_notify_device(acpi); } From 71eea7071583b04e9b796ee1d6f7a07334426495 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 10 Oct 2019 11:15:20 +0300 Subject: [PATCH 2094/2880] platform/x86: intel_punit_ipc: Avoid error message when retrieving IRQ Since the commit 7723f4c5ecdb ("driver core: platform: Add an error message to platform_get_irq*()") the platform_get_irq() started issuing an error message which is not what we want here. Switch to platform_get_irq_optional() to have only warning message provided by the driver. Fixes: 7723f4c5ecdb ("driver core: platform: Add an error message to platform_get_irq*()") Signed-off-by: Andy Shevchenko --- drivers/platform/x86/intel_punit_ipc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel_punit_ipc.c b/drivers/platform/x86/intel_punit_ipc.c index ab7ae1950867..fa97834fdb78 100644 --- a/drivers/platform/x86/intel_punit_ipc.c +++ b/drivers/platform/x86/intel_punit_ipc.c @@ -293,9 +293,8 @@ static int intel_punit_ipc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, punit_ipcdev); - irq = platform_get_irq(pdev, 0); + irq = platform_get_irq_optional(pdev, 0); if (irq < 0) { - punit_ipcdev->irq = 0; dev_warn(&pdev->dev, "Invalid IRQ, using polling mode\n"); } else { ret = devm_request_irq(&pdev->dev, irq, intel_punit_ioc, From aafb00a977cf7d81821f7c9d12e04c558c22dc3c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2019 17:38:48 +0200 Subject: [PATCH 2095/2880] USB: yurex: fix NULL-derefs on disconnect The driver was using its struct usb_interface pointer as an inverted disconnected flag, but was setting it to NULL without making sure all code paths that used it were done with it. Before commit ef61eb43ada6 ("USB: yurex: Fix protection fault after device removal") this included the interrupt-in completion handler, but there are further accesses in dev_err and dev_dbg statements in yurex_write() and the driver-data destructor (sic!). Fix this by unconditionally stopping also the control URB at disconnect and by using a dedicated disconnected flag. Note that we need to take a reference to the struct usb_interface to avoid a use-after-free in the destructor whenever the device was disconnected while the character device was still open. Fixes: aadd6472d904 ("USB: yurex.c: remove dbg() usage") Fixes: 45714104b9e8 ("USB: yurex.c: remove err() usage") Cc: stable # 3.5: ef61eb43ada6 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20191009153848.8664-6-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/yurex.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/usb/misc/yurex.c b/drivers/usb/misc/yurex.c index 8d52d4336c29..be0505b8b5d4 100644 --- a/drivers/usb/misc/yurex.c +++ b/drivers/usb/misc/yurex.c @@ -60,6 +60,7 @@ struct usb_yurex { struct kref kref; struct mutex io_mutex; + unsigned long disconnected:1; struct fasync_struct *async_queue; wait_queue_head_t waitq; @@ -107,6 +108,7 @@ static void yurex_delete(struct kref *kref) dev->int_buffer, dev->urb->transfer_dma); usb_free_urb(dev->urb); } + usb_put_intf(dev->interface); usb_put_dev(dev->udev); kfree(dev); } @@ -205,7 +207,7 @@ static int yurex_probe(struct usb_interface *interface, const struct usb_device_ init_waitqueue_head(&dev->waitq); dev->udev = usb_get_dev(interface_to_usbdev(interface)); - dev->interface = interface; + dev->interface = usb_get_intf(interface); /* set up the endpoint information */ iface_desc = interface->cur_altsetting; @@ -316,8 +318,9 @@ static void yurex_disconnect(struct usb_interface *interface) /* prevent more I/O from starting */ usb_poison_urb(dev->urb); + usb_poison_urb(dev->cntl_urb); mutex_lock(&dev->io_mutex); - dev->interface = NULL; + dev->disconnected = 1; mutex_unlock(&dev->io_mutex); /* wakeup waiters */ @@ -405,7 +408,7 @@ static ssize_t yurex_read(struct file *file, char __user *buffer, size_t count, dev = file->private_data; mutex_lock(&dev->io_mutex); - if (!dev->interface) { /* already disconnected */ + if (dev->disconnected) { /* already disconnected */ mutex_unlock(&dev->io_mutex); return -ENODEV; } @@ -440,7 +443,7 @@ static ssize_t yurex_write(struct file *file, const char __user *user_buffer, goto error; mutex_lock(&dev->io_mutex); - if (!dev->interface) { /* already disconnected */ + if (dev->disconnected) { /* already disconnected */ mutex_unlock(&dev->io_mutex); retval = -ENODEV; goto error; From 51d8a7eca67784b155a07aeab4bfb9f63ebaaf9e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 8 Oct 2019 15:01:59 +0200 Subject: [PATCH 2096/2880] binder: prevent UAF read in print_binder_transaction_log_entry() When a binder transaction is initiated on a binder device coming from a binderfs instance, a pointer to the name of the binder device is stashed in the binder_transaction_log_entry's context_name member. Later on it is used to print the name in print_binder_transaction_log_entry(). By the time print_binder_transaction_log_entry() accesses context_name binderfs_evict_inode() might have already freed the associated memory thereby causing a UAF. Do the simple thing and prevent this by copying the name of the binder device instead of stashing a pointer to it. Reported-by: Jann Horn Fixes: 03e2e07e3814 ("binder: Make transaction_log available in binderfs") Link: https://lore.kernel.org/r/CAG48ez14Q0-F8LqsvcNbyR2o6gPW8SHXsm4u5jmD9MpsteM2Tw@mail.gmail.com Signed-off-by: Christian Brauner Reviewed-by: Joel Fernandes (Google) Acked-by: Todd Kjos Reviewed-by: Hridya Valsaraju Link: https://lore.kernel.org/r/20191008130159.10161-1-christian.brauner@ubuntu.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 4 +++- drivers/android/binder_internal.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index c0a491277aca..5b9ac2122e89 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -66,6 +67,7 @@ #include #include +#include #include @@ -2876,7 +2878,7 @@ static void binder_transaction(struct binder_proc *proc, e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; - e->context_name = proc->context->name; + strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME); if (reply) { binder_inner_proc_lock(proc); diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h index bd47f7f72075..ae991097d14d 100644 --- a/drivers/android/binder_internal.h +++ b/drivers/android/binder_internal.h @@ -130,7 +130,7 @@ struct binder_transaction_log_entry { int return_error_line; uint32_t return_error; uint32_t return_error_param; - const char *context_name; + char context_name[BINDERFS_MAX_NAME + 1]; }; struct binder_transaction_log { From 5dc54a06f6e575f146492661129d24f3b50d17bb Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 30 Sep 2019 16:12:50 -0400 Subject: [PATCH 2097/2880] binder: Fix comment headers on binder_alloc_prepare_to_free() binder_alloc_buffer_lookup() doesn't exist and is named "binder_alloc_prepare_to_free()". Correct the code comments to reflect this. Signed-off-by: Joel Fernandes (Google) Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20190930201250.139554-1-joel@joelfernandes.org Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 6d79a1b0d446..d42a8b2f636a 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -156,7 +156,7 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked( } /** - * binder_alloc_buffer_lookup() - get buffer given user ptr + * binder_alloc_prepare_to_free() - get buffer given user ptr * @alloc: binder_alloc for this proc * @user_ptr: User pointer to buffer data * From e0b0cb9388642c104838fac100a4af32745621e2 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Mon, 30 Sep 2019 15:42:22 -0500 Subject: [PATCH 2098/2880] virt: vbox: fix memory leak in hgcm_call_preprocess_linaddr In hgcm_call_preprocess_linaddr memory is allocated for bounce_buf but is not released if copy_form_user fails. In order to prevent memory leak in case of failure, the assignment to bounce_buf_ret is moved before the error check. This way the allocated bounce_buf will be released by the caller. Fixes: 579db9d45cb4 ("virt: Add vboxguest VMMDEV communication code") Signed-off-by: Navid Emamdoost Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20190930204223.3660-1-navid.emamdoost@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/virt/vboxguest/vboxguest_utils.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/virt/vboxguest/vboxguest_utils.c b/drivers/virt/vboxguest/vboxguest_utils.c index 75fd140b02ff..43c391626a00 100644 --- a/drivers/virt/vboxguest/vboxguest_utils.c +++ b/drivers/virt/vboxguest/vboxguest_utils.c @@ -220,6 +220,8 @@ static int hgcm_call_preprocess_linaddr( if (!bounce_buf) return -ENOMEM; + *bounce_buf_ret = bounce_buf; + if (copy_in) { ret = copy_from_user(bounce_buf, (void __user *)buf, len); if (ret) @@ -228,7 +230,6 @@ static int hgcm_call_preprocess_linaddr( memset(bounce_buf, 0, len); } - *bounce_buf_ret = bounce_buf; hgcm_call_add_pagelist_size(bounce_buf, len, extra); return 0; } From b33a654a5bd6bba0052ca2d86098826b309f27b8 Mon Sep 17 00:00:00 2001 From: Ulf Magnusson Date: Fri, 27 Sep 2019 19:42:32 +0200 Subject: [PATCH 2099/2880] drm/tiny: Kconfig: Remove always-y THERMAL dep. from TINYDRM_REPAPER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [cherry-picked to drm-misc-fixes: drm-misc-next commit dfef959803c7] Commit 554b3529fe01 ("thermal/drivers/core: Remove the module Kconfig's option") changed the type of THERMAL from tristate to bool, so THERMAL || !THERMAL is now always y. Remove the redundant dependency. Discovered through Kconfiglib detecting a dependency loop. The C tools simplify the expression to y before running dependency loop detection, and so don't see it. Changing the type of THERMAL back to tristate makes the C tools detect the same loop. Not sure if running dep. loop detection after simplification can be called a bug. Fixing this nit unbreaks Kconfiglib on the kernel at least. Signed-off-by: Ulf Magnusson Signed-off-by: Noralf Trønnes Link: https://patchwork.freedesktop.org/patch/msgid/20190927174218.GA32085@huvuddator --- drivers/gpu/drm/tiny/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/tiny/Kconfig b/drivers/gpu/drm/tiny/Kconfig index 504763423d46..a46ac284dd5e 100644 --- a/drivers/gpu/drm/tiny/Kconfig +++ b/drivers/gpu/drm/tiny/Kconfig @@ -63,7 +63,6 @@ config TINYDRM_REPAPER depends on DRM && SPI select DRM_KMS_HELPER select DRM_KMS_CMA_HELPER - depends on THERMAL || !THERMAL help DRM driver for the following Pervasive Displays panels: 1.44" TFT EPD Panel (E1144CS021) From b058b2552edb4a6ce62922bf904149bbafd5fd9d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 20 Sep 2019 14:03:18 +0800 Subject: [PATCH 2100/2880] w1: ds250x: Fix build error without CRC16 If CRC16 is not set, building will fails: drivers/w1/slaves/w1_ds250x.o: In function `w1_ds2505_read_page': w1_ds250x.c:(.text+0x82f): undefined reference to `crc16' w1_ds250x.c:(.text+0x90a): undefined reference to `crc16' w1_ds250x.c:(.text+0x91a): undefined reference to `crc16' Reported-by: Hulk Robot Fixes: 25ec8710d9c2 ("w1: add DS2501, DS2502, DS2505 EPROM device driver") Signed-off-by: YueHaibing Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20190920060318.35020-1-yuehaibing@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/w1/slaves/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/w1/slaves/Kconfig b/drivers/w1/slaves/Kconfig index ebed495b9e69..b7847636501d 100644 --- a/drivers/w1/slaves/Kconfig +++ b/drivers/w1/slaves/Kconfig @@ -103,6 +103,7 @@ config W1_SLAVE_DS2438 config W1_SLAVE_DS250X tristate "512b/1kb/16kb EPROM family support" + select CRC16 help Say Y here if you want to use a 1-wire 512b/1kb/16kb EPROM family device (DS250x). From a2f83e8b0c82c9500421a26c49eb198b25fcdea3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 2 Oct 2019 06:14:17 -0400 Subject: [PATCH 2101/2880] dm snapshot: introduce account_start_copy() and account_end_copy() This simple refactoring moves code for modifying the semaphore cow_count into separate functions to prepare for changes that will extend these methods to provide for a more sophisticated mechanism for COW throttling. Signed-off-by: Mikulas Patocka Reviewed-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f150f5c5492b..da3bd1794ee0 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1512,6 +1512,16 @@ static void snapshot_dtr(struct dm_target *ti) kfree(s); } +static void account_start_copy(struct dm_snapshot *s) +{ + down(&s->cow_count); +} + +static void account_end_copy(struct dm_snapshot *s) +{ + up(&s->cow_count); +} + /* * Flush a list of buffers. */ @@ -1732,7 +1742,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) rb_link_node(&pe->out_of_order_node, parent, p); rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); } - up(&s->cow_count); + account_end_copy(s); } /* @@ -1756,7 +1766,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count; /* Hand over to kcopyd */ - down(&s->cow_count); + account_start_copy(s); dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); } @@ -1776,7 +1786,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; - down(&s->cow_count); + account_start_copy(s); callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); @@ -1866,7 +1876,7 @@ static void zero_callback(int read_err, unsigned long write_err, void *context) struct bio *bio = context; struct dm_snapshot *s = bio->bi_private; - up(&s->cow_count); + account_end_copy(s); bio->bi_status = write_err ? BLK_STS_IOERR : 0; bio_endio(bio); } @@ -1880,7 +1890,7 @@ static void zero_exception(struct dm_snapshot *s, struct dm_exception *e, dest.sector = bio->bi_iter.bi_sector; dest.count = s->store->chunk_size; - down(&s->cow_count); + account_start_copy(s); WARN_ON_ONCE(bio->bi_private); bio->bi_private = s; dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio); From b21555786f18cd77f2311ad89074533109ae3ffa Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 2 Oct 2019 06:15:53 -0400 Subject: [PATCH 2102/2880] dm snapshot: rework COW throttling to fix deadlock Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and workqueue stalls") introduced a semaphore to limit the maximum number of in-flight kcopyd (COW) jobs. The implementation of this throttling mechanism is prone to a deadlock: 1. One or more threads write to the origin device causing COW, which is performed by kcopyd. 2. At some point some of these threads might reach the s->cow_count semaphore limit and block in down(&s->cow_count), holding a read lock on _origins_lock. 3. Someone tries to acquire a write lock on _origins_lock, e.g., snapshot_ctr(), which blocks because the threads at step (2) already hold a read lock on it. 4. A COW operation completes and kcopyd runs dm-snapshot's completion callback, which ends up calling pending_complete(). pending_complete() tries to resubmit any deferred origin bios. This requires acquiring a read lock on _origins_lock, which blocks. This happens because the read-write semaphore implementation gives priority to writers, meaning that as soon as a writer tries to enter the critical section, no readers will be allowed in, until all writers have completed their work. So, pending_complete() waits for the writer at step (3) to acquire and release the lock. This writer waits for the readers at step (2) to release the read lock and those readers wait for pending_complete() (the kcopyd thread) to signal the s->cow_count semaphore: DEADLOCK. The above was thoroughly analyzed and documented by Nikos Tsironis as part of his initial proposal for fixing this deadlock, see: https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html Fix this deadlock by reworking COW throttling so that it waits without holding any locks. Add a variable 'in_progress' that counts how many kcopyd jobs are running. A function wait_for_in_progress() will sleep if 'in_progress' is over the limit. It drops _origins_lock in order to avoid the deadlock. Reported-by: Guruswamy Basavaiah Reported-by: Nikos Tsironis Reviewed-by: Nikos Tsironis Tested-by: Nikos Tsironis Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls") Cc: stable@vger.kernel.org # v5.0+ Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 78 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index da3bd1794ee0..4fb1a40e68a0 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "dm.h" @@ -107,8 +106,8 @@ struct dm_snapshot { /* The on disk metadata handler */ struct dm_exception_store *store; - /* Maximum number of in-flight COW jobs. */ - struct semaphore cow_count; + unsigned in_progress; + struct wait_queue_head in_progress_wait; struct dm_kcopyd_client *kcopyd_client; @@ -162,8 +161,8 @@ struct dm_snapshot { */ #define DEFAULT_COW_THRESHOLD 2048 -static int cow_threshold = DEFAULT_COW_THRESHOLD; -module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644); +static unsigned cow_threshold = DEFAULT_COW_THRESHOLD; +module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644); MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, @@ -1327,7 +1326,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_hash_tables; } - sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX); + init_waitqueue_head(&s->in_progress_wait); s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(s->kcopyd_client)) { @@ -1509,17 +1508,54 @@ static void snapshot_dtr(struct dm_target *ti) dm_put_device(ti, s->origin); + WARN_ON(s->in_progress); + kfree(s); } static void account_start_copy(struct dm_snapshot *s) { - down(&s->cow_count); + spin_lock(&s->in_progress_wait.lock); + s->in_progress++; + spin_unlock(&s->in_progress_wait.lock); } static void account_end_copy(struct dm_snapshot *s) { - up(&s->cow_count); + spin_lock(&s->in_progress_wait.lock); + BUG_ON(!s->in_progress); + s->in_progress--; + if (likely(s->in_progress <= cow_threshold) && + unlikely(waitqueue_active(&s->in_progress_wait))) + wake_up_locked(&s->in_progress_wait); + spin_unlock(&s->in_progress_wait.lock); +} + +static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins) +{ + if (unlikely(s->in_progress > cow_threshold)) { + spin_lock(&s->in_progress_wait.lock); + if (likely(s->in_progress > cow_threshold)) { + /* + * NOTE: this throttle doesn't account for whether + * the caller is servicing an IO that will trigger a COW + * so excess throttling may result for chunks not required + * to be COW'd. But if cow_threshold was reached, extra + * throttling is unlikely to negatively impact performance. + */ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&s->in_progress_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&s->in_progress_wait.lock); + if (unlock_origins) + up_read(&_origins_lock); + io_schedule(); + remove_wait_queue(&s->in_progress_wait, &wait); + return false; + } + spin_unlock(&s->in_progress_wait.lock); + } + return true; } /* @@ -1537,7 +1573,7 @@ static void flush_bios(struct bio *bio) } } -static int do_origin(struct dm_dev *origin, struct bio *bio); +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit); /* * Flush a list of buffers. @@ -1550,7 +1586,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) while (bio) { n = bio->bi_next; bio->bi_next = NULL; - r = do_origin(s->origin, bio); + r = do_origin(s->origin, bio, false); if (r == DM_MAPIO_REMAPPED) generic_make_request(bio); bio = n; @@ -1926,6 +1962,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid) return DM_MAPIO_KILL; + if (bio_data_dir(bio) == WRITE) { + while (unlikely(!wait_for_in_progress(s, false))) + ; /* wait_for_in_progress() has slept */ + } + down_read(&s->lock); dm_exception_table_lock(&lock); @@ -2122,7 +2163,7 @@ redirect_to_origin: if (bio_data_dir(bio) == WRITE) { up_write(&s->lock); - return do_origin(s->origin, bio); + return do_origin(s->origin, bio, false); } out_unlock: @@ -2497,15 +2538,24 @@ next_snapshot: /* * Called on a write from the origin driver. */ -static int do_origin(struct dm_dev *origin, struct bio *bio) +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit) { struct origin *o; int r = DM_MAPIO_REMAPPED; +again: down_read(&_origins_lock); o = __lookup_origin(origin->bdev); - if (o) + if (o) { + if (limit) { + struct dm_snapshot *s; + list_for_each_entry(s, &o->snapshots, list) + if (unlikely(!wait_for_in_progress(s, true))) + goto again; + } + r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); + } up_read(&_origins_lock); return r; @@ -2618,7 +2668,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) dm_accept_partial_bio(bio, available_sectors); /* Only tell snapshots if this is a write */ - return do_origin(o->dev, bio); + return do_origin(o->dev, bio, true); } /* From 11bcf5f78905b90baae8fb01e16650664ed0cb00 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 2 Apr 2019 11:30:37 +0800 Subject: [PATCH 2103/2880] drm/edid: Add 6 bpc quirk for SDC panel in Lenovo G50 Another panel that needs 6BPC quirk. BugLink: https://bugs.launchpad.net/bugs/1819968 Cc: # v4.8+ Reviewed-by: Alex Deucher Signed-off-by: Kai-Heng Feng Signed-off-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20190402033037.21877-1-kai.heng.feng@canonical.com --- drivers/gpu/drm/drm_edid.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 82a4ceed3fcf..6b0177112e18 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -159,6 +159,9 @@ static const struct edid_quirk { /* Medion MD 30217 PG */ { "MED", 0x7b8, EDID_QUIRK_PREFER_LARGE_75 }, + /* Lenovo G50 */ + { "SDC", 18514, EDID_QUIRK_FORCE_6BPC }, + /* Panel in Samsung NP700G7A-S01PL notebook reports 6bpc */ { "SEC", 0xd033, EDID_QUIRK_FORCE_8BPC }, From 574878f98c05aaae7e046cd18be9195f8da38cbd Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Thu, 10 Oct 2019 16:32:09 +0800 Subject: [PATCH 2104/2880] xen/grant-table: remove unnecessary printing xen_auto_xlat_grant_frames.vaddr is definitely NULL in this case. So the address printing is unnecessary. Signed-off-by: Fuqian Huang Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/grant-table.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 7ea6fb6a2e5d..49b381e104ef 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1363,8 +1363,7 @@ static int gnttab_setup(void) if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", - (unsigned long)xen_auto_xlat_grant_frames.vaddr); + pr_warn("gnttab share frames is not mapped!\n"); return -ENOMEM; } } From c3d79a83ca10d2026fd1cce096cde2d8e316592e Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Sun, 22 Sep 2019 15:49:00 +0800 Subject: [PATCH 2105/2880] dma-buf/resv: fix exclusive fence get MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This causes kernel crash when testing lima driver. Cc: Christian König Fixes: b8c036dfc66f ("dma-buf: simplify reservation_object_get_fences_rcu a bit") Signed-off-by: Qiang Yu Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20190922074900.853-1-yuq825@gmail.com Signed-off-by: Christian König --- drivers/dma-buf/dma-resv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c index 42a8f3f11681..709002515550 100644 --- a/drivers/dma-buf/dma-resv.c +++ b/drivers/dma-buf/dma-resv.c @@ -471,7 +471,7 @@ unlock: if (pfence_excl) *pfence_excl = fence_excl; else if (fence_excl) - shared[++shared_count] = fence_excl; + shared[shared_count++] = fence_excl; if (!shared_count) { kfree(shared); From 5e48e55fb57a9026e6b3b6961def6d2aeb210659 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 10 Oct 2019 14:30:46 +0200 Subject: [PATCH 2106/2880] MAINTAINERS: Remove Simon as Renesas SoC Co-Maintainer At the end of the v5.3 upstream kernel development cycle, Simon stepped down from his role as Renesas SoC maintainer. Remove his maintainership, git repository, and branch from the MAINTAINERS file, and add an entry to the CREDITS file to honor his work. Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds --- CREDITS | 4 ++++ MAINTAINERS | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CREDITS b/CREDITS index 8b67a85844b5..031605d46b4d 100644 --- a/CREDITS +++ b/CREDITS @@ -1637,6 +1637,10 @@ S: Panoramastrasse 18 S: D-69126 Heidelberg S: Germany +N: Simon Horman +M: horms@verge.net.au +D: Renesas ARM/ARM64 SoC maintainer + N: Christopher Horn E: chorn@warwick.net D: Miscellaneous sysctl hacks diff --git a/MAINTAINERS b/MAINTAINERS index 94ce075907a0..d44d6732510d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2165,12 +2165,10 @@ F: arch/arm64/boot/dts/realtek/ F: Documentation/devicetree/bindings/arm/realtek.yaml ARM/RENESAS ARM64 ARCHITECTURE -M: Simon Horman M: Geert Uytterhoeven M: Magnus Damm L: linux-renesas-soc@vger.kernel.org Q: http://patchwork.kernel.org/project/linux-renesas-soc/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next T: git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next S: Supported F: arch/arm64/boot/dts/renesas/ @@ -2282,12 +2280,10 @@ S: Maintained F: drivers/media/platform/s5p-mfc/ ARM/SHMOBILE ARM ARCHITECTURE -M: Simon Horman M: Geert Uytterhoeven M: Magnus Damm L: linux-renesas-soc@vger.kernel.org Q: http://patchwork.kernel.org/project/linux-renesas-soc/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next T: git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next S: Supported F: arch/arm/boot/dts/emev2* From ee7f5225dc3cc7c19df1603597532ff34571f895 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 8 Oct 2019 14:41:55 -0500 Subject: [PATCH 2107/2880] xen: Stop abusing DT of_dma_configure API As the removed comments say, these aren't DT based devices. of_dma_configure() is going to stop allowing a NULL DT node and calling it will no longer work. The comment is also now out of date as of commit 9ab91e7c5c51 ("arm64: default to the direct mapping in get_arch_dma_ops"). Direct mapping is now the default rather than dma_dummy_ops. According to Stefano and Oleksandr, the only other part needed is setting the DMA masks and there's no reason to restrict the masks to 32-bits. So set the masks to 64 bits. Cc: Robin Murphy Cc: Julien Grall Cc: Nicolas Saenz Julienne Cc: Oleksandr Andrushchenko Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Christoph Hellwig Cc: xen-devel@lists.xenproject.org Signed-off-by: Rob Herring Acked-by: Oleksandr Andrushchenko Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/gpu/drm/xen/xen_drm_front.c | 12 ++---------- drivers/xen/gntdev.c | 13 ++----------- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/xen/xen_drm_front.c b/drivers/gpu/drm/xen/xen_drm_front.c index 84aa4d61dc42..3406dfc9a605 100644 --- a/drivers/gpu/drm/xen/xen_drm_front.c +++ b/drivers/gpu/drm/xen/xen_drm_front.c @@ -716,17 +716,9 @@ static int xen_drv_probe(struct xenbus_device *xb_dev, struct device *dev = &xb_dev->dev; int ret; - /* - * The device is not spawn from a device tree, so arch_setup_dma_ops - * is not called, thus leaving the device with dummy DMA ops. - * This makes the device return error on PRIME buffer import, which - * is not correct: to fix this call of_dma_configure() with a NULL - * node to set default DMA ops. - */ - dev->coherent_dma_mask = DMA_BIT_MASK(32); - ret = of_dma_configure(dev, NULL, true); + ret = dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(64)); if (ret < 0) { - DRM_ERROR("Cannot setup DMA ops, ret %d", ret); + DRM_ERROR("Cannot setup DMA mask, ret %d", ret); return ret; } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index a446a7221e13..81401f386c9c 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -22,6 +22,7 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -34,9 +35,6 @@ #include #include #include -#ifdef CONFIG_XEN_GRANT_DMA_ALLOC -#include -#endif #include #include @@ -625,14 +623,7 @@ static int gntdev_open(struct inode *inode, struct file *flip) flip->private_data = priv; #ifdef CONFIG_XEN_GRANT_DMA_ALLOC priv->dma_dev = gntdev_miscdev.this_device; - - /* - * The device is not spawn from a device tree, so arch_setup_dma_ops - * is not called, thus leaving the device with dummy DMA ops. - * Fix this by calling of_dma_configure() with a NULL node to set - * default DMA ops. - */ - of_dma_configure(priv->dma_dev, NULL, true); + dma_coerce_mask_and_coherent(priv->dma_dev, DMA_BIT_MASK(64)); #endif pr_debug("priv %p\n", priv); From 862488105b84ca744b3d8ff131e0fcfe10644be1 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Sep 2019 11:44:27 +0530 Subject: [PATCH 2108/2880] nbd: fix possible sysfs duplicate warning 1. nbd_put takes the mutex and drops nbd->ref to 0. It then does idr_remove and drops the mutex. 2. nbd_genl_connect takes the mutex. idr_find/idr_for_each fails to find an existing device, so it does nbd_dev_add. 3. just before the nbd_put could call nbd_dev_remove or not finished totally, but if nbd_dev_add try to add_disk, we can hit: debugfs: Directory 'nbd1' with parent 'block' already present! This patch will make sure all the disk add/remove stuff are done by holding the nbd_index_mutex lock. Reported-by: Mike Christie Reviewed-by: Josef Bacik Signed-off-by: Xiubo Li Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index ac07e8c94c79..478aa86fc1f2 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -248,8 +248,8 @@ static void nbd_put(struct nbd_device *nbd) if (refcount_dec_and_mutex_lock(&nbd->refs, &nbd_index_mutex)) { idr_remove(&nbd_index_idr, nbd->index); - mutex_unlock(&nbd_index_mutex); nbd_dev_remove(nbd); + mutex_unlock(&nbd_index_mutex); } } From f1e5aa6c13feb8ee47ff5d116bed0a10007686f1 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sun, 21 Jul 2019 18:10:31 +0200 Subject: [PATCH 2109/2880] MAINTAINERS: Add BCM2711 to BCM2835 ARCH Clarify that BCM2711 belongs to the BCM2835 ARCH. Signed-off-by: Stefan Wahren Acked-by: Eric Anholt --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 296de2b51c83..14a939a9e1e7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3187,7 +3187,7 @@ N: bcm216* N: kona F: arch/arm/mach-bcm/ -BROADCOM BCM2835 ARM ARCHITECTURE +BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE M: Eric Anholt M: Stefan Wahren L: bcm-kernel-feedback-list@broadcom.com @@ -3195,6 +3195,7 @@ L: linux-rpi-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://github.com/anholt/linux S: Maintained +N: bcm2711 N: bcm2835 F: drivers/staging/vc04_services From 38dffe1e4dde1d3174fdce09d67370412843ebb5 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Thu, 10 Oct 2019 23:01:57 +0800 Subject: [PATCH 2110/2880] MIPS: elf_hwcap: Export userspace ASEs A Golang developer reported MIPS hwcap isn't reflecting instructions that the processor actually supported so programs can't apply optimized code at runtime. Thus we export the ASEs that can be used in userspace programs. Reported-by: Meng Zhuo Signed-off-by: Jiaxun Yang Cc: linux-mips@vger.kernel.org Cc: Paul Burton Cc: # 4.14+ Signed-off-by: Paul Burton --- arch/mips/include/uapi/asm/hwcap.h | 11 ++++++++++ arch/mips/kernel/cpu-probe.c | 33 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/arch/mips/include/uapi/asm/hwcap.h b/arch/mips/include/uapi/asm/hwcap.h index a2aba4b059e6..1ade1daa4921 100644 --- a/arch/mips/include/uapi/asm/hwcap.h +++ b/arch/mips/include/uapi/asm/hwcap.h @@ -6,5 +6,16 @@ #define HWCAP_MIPS_R6 (1 << 0) #define HWCAP_MIPS_MSA (1 << 1) #define HWCAP_MIPS_CRC32 (1 << 2) +#define HWCAP_MIPS_MIPS16 (1 << 3) +#define HWCAP_MIPS_MDMX (1 << 4) +#define HWCAP_MIPS_MIPS3D (1 << 5) +#define HWCAP_MIPS_SMARTMIPS (1 << 6) +#define HWCAP_MIPS_DSP (1 << 7) +#define HWCAP_MIPS_DSP2 (1 << 8) +#define HWCAP_MIPS_DSP3 (1 << 9) +#define HWCAP_MIPS_MIPS16E2 (1 << 10) +#define HWCAP_LOONGSON_MMI (1 << 11) +#define HWCAP_LOONGSON_EXT (1 << 12) +#define HWCAP_LOONGSON_EXT2 (1 << 13) #endif /* _UAPI_ASM_HWCAP_H */ diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index c2eb392597bf..f521cbf934e7 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -2180,6 +2180,39 @@ void cpu_probe(void) elf_hwcap |= HWCAP_MIPS_MSA; } + if (cpu_has_mips16) + elf_hwcap |= HWCAP_MIPS_MIPS16; + + if (cpu_has_mdmx) + elf_hwcap |= HWCAP_MIPS_MDMX; + + if (cpu_has_mips3d) + elf_hwcap |= HWCAP_MIPS_MIPS3D; + + if (cpu_has_smartmips) + elf_hwcap |= HWCAP_MIPS_SMARTMIPS; + + if (cpu_has_dsp) + elf_hwcap |= HWCAP_MIPS_DSP; + + if (cpu_has_dsp2) + elf_hwcap |= HWCAP_MIPS_DSP2; + + if (cpu_has_dsp3) + elf_hwcap |= HWCAP_MIPS_DSP3; + + if (cpu_has_mips16e2) + elf_hwcap |= HWCAP_MIPS_MIPS16E2; + + if (cpu_has_loongson_mmi) + elf_hwcap |= HWCAP_LOONGSON_MMI; + + if (cpu_has_loongson_ext) + elf_hwcap |= HWCAP_LOONGSON_EXT; + + if (cpu_has_loongson_ext2) + elf_hwcap |= HWCAP_LOONGSON_EXT2; + if (cpu_has_vz) cpu_probe_vz(c); From 2f2b4fd674cadd8c6b40eb629e140a14db4068fd Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 10 Oct 2019 18:54:03 +0000 Subject: [PATCH 2111/2880] MIPS: Disable Loongson MMI instructions for kernel build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC 9.x automatically enables support for Loongson MMI instructions when using some -march= flags, and then errors out when -msoft-float is specified with: cc1: error: ‘-mloongson-mmi’ must be used with ‘-mhard-float’ The kernel shouldn't be using these MMI instructions anyway, just as it doesn't use floating point instructions. Explicitly disable them in order to fix the build with GCC 9.x. Signed-off-by: Paul Burton Fixes: 3702bba5eb4f ("MIPS: Loongson: Add GCC 4.4 support for Loongson2E") Fixes: 6f7a251a259e ("MIPS: Loongson: Add basic Loongson 2F support") Fixes: 5188129b8c9f ("MIPS: Loongson-3: Improve -march option and move it to Platform") Cc: Huacai Chen Cc: Jiaxun Yang Cc: stable@vger.kernel.org # v2.6.32+ Cc: linux-mips@vger.kernel.org --- arch/mips/loongson64/Platform | 4 ++++ arch/mips/vdso/Makefile | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/mips/loongson64/Platform b/arch/mips/loongson64/Platform index c1a4d4dc4665..9f79908f5063 100644 --- a/arch/mips/loongson64/Platform +++ b/arch/mips/loongson64/Platform @@ -66,6 +66,10 @@ else $(call cc-option,-march=mips64r2,-mips64r2 -U_MIPS_ISA -D_MIPS_ISA=_MIPS_ISA_MIPS64) endif +# Some -march= flags enable MMI instructions, and GCC complains about that +# support being enabled alongside -msoft-float. Thus explicitly disable MMI. +cflags-y += $(call cc-option,-mno-loongson-mmi) + # # Loongson Machines' Support # diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index 807f0f782f75..996a934ece7d 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -15,6 +15,7 @@ ccflags-vdso := \ $(filter -mmicromips,$(KBUILD_CFLAGS)) \ $(filter -march=%,$(KBUILD_CFLAGS)) \ $(filter -m%-float,$(KBUILD_CFLAGS)) \ + $(filter -mno-loongson-%,$(KBUILD_CFLAGS)) \ -D__VDSO__ ifdef CONFIG_CC_IS_CLANG From 1047ec868332034d1fbcb2fae19fe6d4cb869ff2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 4 Oct 2019 09:58:54 -0400 Subject: [PATCH 2112/2880] NFSv4: Fix leak of clp->cl_acceptor string Our client can issue multiple SETCLIENTID operations to the same server in some circumstances. Ensure that calls to nfs4_proc_setclientid() after the first one do not overwrite the previously allocated cl_acceptor string. unreferenced object 0xffff888461031800 (size 32): comm "mount.nfs", pid 2227, jiffies 4294822467 (age 1407.749s) hex dump (first 32 bytes): 6e 66 73 40 6b 6c 69 6d 74 2e 69 62 2e 31 30 31 nfs@klimt.ib.101 35 67 72 61 6e 67 65 72 2e 6e 65 74 00 00 00 00 5granger.net.... backtrace: [<00000000ab820188>] __kmalloc+0x128/0x176 [<00000000eeaf4ec8>] gss_stringify_acceptor+0xbd/0x1a7 [auth_rpcgss] [<00000000e85e3382>] nfs4_proc_setclientid+0x34e/0x46c [nfsv4] [<000000003d9cf1fa>] nfs40_discover_server_trunking+0x7a/0xed [nfsv4] [<00000000b81c3787>] nfs4_discover_server_trunking+0x81/0x244 [nfsv4] [<000000000801b55f>] nfs4_init_client+0x1b0/0x238 [nfsv4] [<00000000977daf7f>] nfs4_set_client+0xfe/0x14d [nfsv4] [<0000000053a68a2a>] nfs4_create_server+0x107/0x1db [nfsv4] [<0000000088262019>] nfs4_remote_mount+0x2c/0x59 [nfsv4] [<00000000e84a2fd0>] legacy_get_tree+0x2d/0x4c [<00000000797e947c>] vfs_get_tree+0x20/0xc7 [<00000000ecabaaa8>] fc_mount+0xe/0x36 [<00000000f15fafc2>] vfs_kern_mount+0x74/0x8d [<00000000a3ff4e26>] nfs_do_root_mount+0x8a/0xa3 [nfsv4] [<00000000d1c2b337>] nfs4_try_mount+0x58/0xad [nfsv4] [<000000004c9bddee>] nfs_fs_mount+0x820/0x869 [nfs] Fixes: f11b2a1cfbf5 ("nfs4: copy acceptor name from context ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 11eafcfc490b..ab8ca20fd579 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6106,6 +6106,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, status = nfs4_call_sync_custom(&task_setup_data); if (setclientid.sc_cred) { + kfree(clp->cl_acceptor); clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); put_rpccred(setclientid.sc_cred); } From af84537dbd1b39505d1f3d8023029b4a59666513 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 2 Oct 2019 10:40:55 -0400 Subject: [PATCH 2113/2880] SUNRPC: fix race to sk_err after xs_error_report Since commit 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") there has been a race to the value of the sk_err if both XPRT_SOCK_WAKE_ERROR and XPRT_SOCK_WAKE_DISCONNECT are set. In that case, we may end up losing the sk_err value that existed when xs_error_report was called. Fix this by reverting to the previous behavior: instead of using SO_ERROR to retrieve the value at a later time (which might also return sk_err_soft), copy the sk_err value onto struct sock_xprt, and use that value to wake pending tasks. Signed-off-by: Benjamin Coddington Fixes: 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtsock.h | 1 + net/sunrpc/xprtsock.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 7638dbe7bc50..a940de03808d 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -61,6 +61,7 @@ struct sock_xprt { struct mutex recv_mutex; struct sockaddr_storage srcaddr; unsigned short srcport; + int xprt_err; /* * UDP socket buffer size parameters diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9ac88722fa83..70e52f567b2a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1249,19 +1249,21 @@ static void xs_error_report(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; - int err; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; transport = container_of(xprt, struct sock_xprt, xprt); - err = -sk->sk_err; - if (err == 0) + transport->xprt_err = -sk->sk_err; + if (transport->xprt_err == 0) goto out; dprintk("RPC: xs_error_report client %p, error=%d...\n", - xprt, -err); - trace_rpc_socket_error(xprt, sk->sk_socket, err); + xprt, -transport->xprt_err); + trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); + + /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ + smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); out: read_unlock_bh(&sk->sk_callback_lock); @@ -2476,7 +2478,6 @@ static void xs_wake_write(struct sock_xprt *transport) static void xs_wake_error(struct sock_xprt *transport) { int sockerr; - int sockerr_len = sizeof(sockerr); if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) return; @@ -2485,9 +2486,7 @@ static void xs_wake_error(struct sock_xprt *transport) goto out; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) goto out; - if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR, - (char *)&sockerr, &sockerr_len) != 0) - goto out; + sockerr = xchg(&transport->xprt_err, 0); if (sockerr < 0) xprt_wake_pending_tasks(&transport->xprt, sockerr); out: From 851140ab0d083c78e5723a8b1cbd258f567a7aff Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 2 Oct 2019 11:28:02 +0100 Subject: [PATCH 2114/2880] ARM: 8908/1: add __always_inline to functions called from __get_user_check() KernelCI reports that bcm2835_defconfig is no longer booting since commit ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") (https://lkml.org/lkml/2019/9/26/825). I also received a regression report from Nicolas Saenz Julienne (https://lkml.org/lkml/2019/9/27/263). This problem has cropped up on bcm2835_defconfig because it enables CONFIG_CC_OPTIMIZE_FOR_SIZE. The compiler tends to prefer not inlining functions with -Os. I was able to reproduce it with other boards and defconfig files by manually enabling CONFIG_CC_OPTIMIZE_FOR_SIZE. The __get_user_check() specifically uses r0, r1, r2 registers. So, uaccess_save_and_enable() and uaccess_restore() must be inlined. Otherwise, those register assignments would be entirely dropped, according to my analysis of the disassembly. Prior to commit 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING"), the 'inline' marker was always enough for inlining functions, except on x86. Since that commit, all architectures can enable CONFIG_OPTIMIZE_INLINING. So, __always_inline is now the only guaranteed way of forcible inlining. I added __always_inline to 4 functions in the call-graph from the __get_user_check() macro. Fixes: 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING") Reported-by: "kernelci.org bot" Reported-by: Nicolas Saenz Julienne Signed-off-by: Masahiro Yamada Tested-by: Nicolas Saenz Julienne Signed-off-by: Russell King --- arch/arm/include/asm/domain.h | 8 ++++---- arch/arm/include/asm/uaccess.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/domain.h b/arch/arm/include/asm/domain.h index 567dbede4785..f1d0a7807cd0 100644 --- a/arch/arm/include/asm/domain.h +++ b/arch/arm/include/asm/domain.h @@ -82,7 +82,7 @@ #ifndef __ASSEMBLY__ #ifdef CONFIG_CPU_CP15_MMU -static inline unsigned int get_domain(void) +static __always_inline unsigned int get_domain(void) { unsigned int domain; @@ -94,7 +94,7 @@ static inline unsigned int get_domain(void) return domain; } -static inline void set_domain(unsigned val) +static __always_inline void set_domain(unsigned int val) { asm volatile( "mcr p15, 0, %0, c3, c0 @ set domain" @@ -102,12 +102,12 @@ static inline void set_domain(unsigned val) isb(); } #else -static inline unsigned int get_domain(void) +static __always_inline unsigned int get_domain(void) { return 0; } -static inline void set_domain(unsigned val) +static __always_inline void set_domain(unsigned int val) { } #endif diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 303248e5b990..98c6b91be4a8 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -22,7 +22,7 @@ * perform such accesses (eg, via list poison values) which could then * be exploited for priviledge escalation. */ -static inline unsigned int uaccess_save_and_enable(void) +static __always_inline unsigned int uaccess_save_and_enable(void) { #ifdef CONFIG_CPU_SW_DOMAIN_PAN unsigned int old_domain = get_domain(); @@ -37,7 +37,7 @@ static inline unsigned int uaccess_save_and_enable(void) #endif } -static inline void uaccess_restore(unsigned int flags) +static __always_inline void uaccess_restore(unsigned int flags) { #ifdef CONFIG_CPU_SW_DOMAIN_PAN /* Restore the user access mask */ From 4c0742f65b4ee466546fd24b71b56516cacd4613 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Thu, 10 Oct 2019 10:12:20 +0100 Subject: [PATCH 2115/2880] ARM: 8914/1: NOMMU: Fix exc_ret for XIP It was reported that 72cd4064fcca "NOMMU: Toggle only bits in EXC_RETURN we are really care of" breaks NOMMU+XIP combination. It happens because saved EXC_RETURN gets overwritten when data section is relocated. The fix is to propagate EXC_RETURN via register and let relocation code to commit that value into memory. Fixes: 72cd4064fcca ("ARM: 8830/1: NOMMU: Toggle only bits in EXC_RETURN we are really care of") Reported-by: afzal mohammed Tested-by: afzal mohammed Signed-off-by: Vladimir Murzin Signed-off-by: Russell King --- arch/arm/kernel/head-common.S | 5 +++-- arch/arm/kernel/head-nommu.S | 2 ++ arch/arm/mm/proc-v7m.S | 5 ++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/head-common.S b/arch/arm/kernel/head-common.S index a7810be07da1..4a3982812a40 100644 --- a/arch/arm/kernel/head-common.S +++ b/arch/arm/kernel/head-common.S @@ -68,7 +68,7 @@ ENDPROC(__vet_atags) * The following fragment of code is executed with the MMU on in MMU mode, * and uses absolute addresses; this is not position independent. * - * r0 = cp#15 control register + * r0 = cp#15 control register (exc_ret for M-class) * r1 = machine ID * r2 = atags/dtb pointer * r9 = processor ID @@ -137,7 +137,8 @@ __mmap_switched_data: #ifdef CONFIG_CPU_CP15 .long cr_alignment @ r3 #else - .long 0 @ r3 +M_CLASS(.long exc_ret) @ r3 +AR_CLASS(.long 0) @ r3 #endif .size __mmap_switched_data, . - __mmap_switched_data diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index afa350f44dea..0fc814bbc34b 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -201,6 +201,8 @@ M_CLASS(streq r3, [r12, #PMSAv8_MAIR1]) bic r0, r0, #V7M_SCB_CCR_IC #endif str r0, [r12, V7M_SCB_CCR] + /* Pass exc_ret to __mmap_switched */ + mov r0, r10 #endif /* CONFIG_CPU_CP15 elif CONFIG_CPU_V7M */ ret lr ENDPROC(__after_proc_init) diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S index 1448f144e7fb..efebf4120a0c 100644 --- a/arch/arm/mm/proc-v7m.S +++ b/arch/arm/mm/proc-v7m.S @@ -136,9 +136,8 @@ __v7m_setup_cont: cpsie i svc #0 1: cpsid i - ldr r0, =exc_ret - orr lr, lr, #EXC_RET_THREADMODE_PROCESSSTACK - str lr, [r0] + /* Calculate exc_ret */ + orr r10, lr, #EXC_RET_THREADMODE_PROCESSSTACK ldmia sp, {r0-r3, r12} str r5, [r12, #11 * 4] @ restore the original SVC vector entry mov lr, r6 @ restore LR From 5234c14531152702a9f3e575cb552b7e9cea9f94 Mon Sep 17 00:00:00 2001 From: Soeren Moch Date: Fri, 4 Oct 2019 22:32:13 +0200 Subject: [PATCH 2116/2880] arm64: dts: rockchip: fix RockPro64 sdmmc settings According to the RockPro64 schematic [1] the rk3399 sdmmc controller is connected to a microSD (TF card) slot. Remove the cap-mmc-highspeed property of the sdmmc controller, since no mmc card can be connected here. [1] http://files.pine64.org/doc/rockpro64/rockpro64_v21-SCH.pdf Fixes: e4f3fb490967 ("arm64: dts: rockchip: add initial dts support for Rockpro64") Signed-off-by: Soeren Moch Link: https://lore.kernel.org/r/20191004203213.4995-1-smoch@web.de Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dts b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dts index aa06a6587a11..e544deb61d28 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dts @@ -624,7 +624,6 @@ &sdmmc { bus-width = <4>; - cap-mmc-highspeed; cap-sd-highspeed; cd-gpios = <&gpio0 7 GPIO_ACTIVE_LOW>; disable-wp; From 389206e806d892d36de0df6e7b07721432957801 Mon Sep 17 00:00:00 2001 From: Vivek Unune Date: Sat, 28 Sep 2019 23:22:30 -0400 Subject: [PATCH 2117/2880] arm64: dts: rockchip: Fix usb-c on Hugsun X99 TV Box Fix usb-c on X99 TV Box. Tested with armbian w/ kernel 5.3 Signed-off-by: Vivek Unune Link: https://lore.kernel.org/r/20190929032230.24628-1-npcomplete13@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-hugsun-x99.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-hugsun-x99.dts b/arch/arm64/boot/dts/rockchip/rk3399-hugsun-x99.dts index 0d1f5f9a0de9..c133e8d64b2a 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-hugsun-x99.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-hugsun-x99.dts @@ -644,7 +644,7 @@ status = "okay"; u2phy0_host: host-port { - phy-supply = <&vcc5v0_host>; + phy-supply = <&vcc5v0_typec>; status = "okay"; }; @@ -712,7 +712,7 @@ &usbdrd_dwc3_0 { status = "okay"; - dr_mode = "otg"; + dr_mode = "host"; }; &usbdrd3_1 { From a9082575f8d09a3a55f300d04f46ef5243e99688 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 8 Oct 2019 12:49:54 -0700 Subject: [PATCH 2118/2880] arm64: dts: rockchip: Fix override mode for rk3399-kevin panel When I re-posted Sean's original commit to add the override mode for the kevin panel, for some reason I didn't notice that the pixel clock wasn't quite right. Looking at /sys/kernel/debug/clk/clk_summary on downstream kernels it can be seen that the VOP clock is supposed to be 266,666,667 Hz achieved by dividing the 800 MHz PLL by 3. Looking at history, it seems that even Sean's first patch [1] had this funny clock rate. I'm not sure where it came from since the commit message specifically mentioned 26666 kHz and the Chrome OS tree [2] can be seen to request 266667 kHz. In any case, let's fix it up. This together with my patch [3] to do the proper rounding when setting the clock rate makes the VOP clock more proper as seen in /sys/kernel/debug/clk/clk_summary. [1] https://lore.kernel.org/r/20180206165626.37692-4-seanpaul@chromium.org [2] https://chromium.googlesource.com/chromiumos/third_party/kernel/+/chromeos-4.4/drivers/gpu/drm/panel/panel-simple.c#1172 [3] https://lkml.kernel.org/r/20191003114726.v2.1.Ib233b3e706cf6317858384264d5b0ed35657456e@changeid Fixes: 84ebd2da6d04 ("arm64: dts: rockchip: Specify override mode for kevin panel") Cc: Sean Paul Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20191008124949.1.I674acd441997dd0690c86c9003743aacda1cf5dd@changeid Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts b/arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts index e152b0ca0290..b8066868a3fe 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts @@ -44,7 +44,7 @@ power-supply = <&pp3300_disp>; panel-timing { - clock-frequency = <266604720>; + clock-frequency = <266666667>; hactive = <2400>; hfront-porch = <48>; hback-porch = <84>; From 4ebcb113edcc498888394821bca2e60ef89c6ff3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 9 Oct 2019 20:55:48 +0200 Subject: [PATCH 2119/2880] r8169: fix jumbo packet handling on resume from suspend Mariusz reported that invalid packets are sent after resume from suspend if jumbo packets are active. It turned out that his BIOS resets chip settings to non-jumbo on resume. Most chip settings are re-initialized on resume from suspend by calling rtl_hw_start(), so let's add configuring jumbo to this function. There's nothing wrong with the commit marked as fixed, it's just the first one where the patch applies cleanly. Fixes: 7366016d2d4c ("r8169: read common register for PCI commit") Reported-by: Mariusz Bialonczyk Tested-by: Mariusz Bialonczyk Signed-off-by: Heiner Kallweit Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 35 +++++++---------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 74f81fe03810..350b0d949611 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4146,6 +4146,14 @@ static void rtl_hw_jumbo_disable(struct rtl8169_private *tp) rtl_lock_config_regs(tp); } +static void rtl_jumbo_config(struct rtl8169_private *tp, int mtu) +{ + if (mtu > ETH_DATA_LEN) + rtl_hw_jumbo_enable(tp); + else + rtl_hw_jumbo_disable(tp); +} + DECLARE_RTL_COND(rtl_chipcmd_cond) { return RTL_R8(tp, ChipCmd) & CmdReset; @@ -4442,11 +4450,6 @@ static void rtl8168g_set_pause_thresholds(struct rtl8169_private *tp, static void rtl_hw_start_8168bb(struct rtl8169_private *tp) { RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en); - - if (tp->dev->mtu <= ETH_DATA_LEN) { - rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B | - PCI_EXP_DEVCTL_NOSNOOP_EN); - } } static void rtl_hw_start_8168bef(struct rtl8169_private *tp) @@ -4462,9 +4465,6 @@ static void __rtl_hw_start_8168cp(struct rtl8169_private *tp) RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en); - if (tp->dev->mtu <= ETH_DATA_LEN) - rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B); - rtl_disable_clock_request(tp); } @@ -4490,9 +4490,6 @@ static void rtl_hw_start_8168cp_2(struct rtl8169_private *tp) rtl_set_def_aspm_entry_latency(tp); RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en); - - if (tp->dev->mtu <= ETH_DATA_LEN) - rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B); } static void rtl_hw_start_8168cp_3(struct rtl8169_private *tp) @@ -4503,9 +4500,6 @@ static void rtl_hw_start_8168cp_3(struct rtl8169_private *tp) /* Magic. */ RTL_W8(tp, DBG_REG, 0x20); - - if (tp->dev->mtu <= ETH_DATA_LEN) - rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B); } static void rtl_hw_start_8168c_1(struct rtl8169_private *tp) @@ -4611,9 +4605,6 @@ static void rtl_hw_start_8168e_1(struct rtl8169_private *tp) rtl_ephy_init(tp, e_info_8168e_1); - if (tp->dev->mtu <= ETH_DATA_LEN) - rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B); - rtl_disable_clock_request(tp); /* Reset tx FIFO pointer */ @@ -4636,9 +4627,6 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private *tp) rtl_ephy_init(tp, e_info_8168e_2); - if (tp->dev->mtu <= ETH_DATA_LEN) - rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B); - rtl_eri_write(tp, 0xc0, ERIAR_MASK_0011, 0x0000); rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000); rtl_set_fifo_size(tp, 0x10, 0x10, 0x02, 0x06); @@ -5485,6 +5473,8 @@ static void rtl_hw_start(struct rtl8169_private *tp) rtl_set_rx_tx_desc_registers(tp); rtl_lock_config_regs(tp); + rtl_jumbo_config(tp, tp->dev->mtu); + /* Initially a 10 us delay. Turned it into a PCI commit. - FR */ RTL_R16(tp, CPlusCmd); RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb); @@ -5498,10 +5488,7 @@ static int rtl8169_change_mtu(struct net_device *dev, int new_mtu) { struct rtl8169_private *tp = netdev_priv(dev); - if (new_mtu > ETH_DATA_LEN) - rtl_hw_jumbo_enable(tp); - else - rtl_hw_jumbo_disable(tp); + rtl_jumbo_config(tp, new_mtu); dev->mtu = new_mtu; netdev_update_features(dev); From 2168da4594040529f6a6cb413c22668a4402f83c Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 9 Oct 2019 12:18:31 -0700 Subject: [PATCH 2120/2880] net: update net_dim documentation after rename Commit 8960b38932be ("linux/dim: Rename externally used net_dim members") renamed the net_dim API, removing the "net_" prefix from the structures and functions. The patch didn't update the net_dim.txt documentation file. Fix the documentation so that its examples match the current code. Fixes: 8960b38932be ("linux/dim: Rename externally used net_dim members", 2019-06-25) Fixes: c002bd529d71 ("linux/dim: Rename externally exposed macros", 2019-06-25) Fixes: 4f75da3666c0 ("linux/dim: Move implementation to .c files") Cc: Tal Gilboa Signed-off-by: Jacob Keller Signed-off-by: Jakub Kicinski --- Documentation/networking/net_dim.txt | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Documentation/networking/net_dim.txt b/Documentation/networking/net_dim.txt index 9cb31c5e2dcd..9bdb7d5a3ba3 100644 --- a/Documentation/networking/net_dim.txt +++ b/Documentation/networking/net_dim.txt @@ -92,16 +92,16 @@ under some conditions. Part III: Registering a Network Device to DIM ============================================== -Net DIM API exposes the main function net_dim(struct net_dim *dim, -struct net_dim_sample end_sample). This function is the entry point to the Net +Net DIM API exposes the main function net_dim(struct dim *dim, +struct dim_sample end_sample). This function is the entry point to the Net DIM algorithm and has to be called every time the driver would like to check if it should change interrupt moderation parameters. The driver should provide two -data structures: struct net_dim and struct net_dim_sample. Struct net_dim +data structures: struct dim and struct dim_sample. Struct dim describes the state of DIM for a specific object (RX queue, TX queue, other queues, etc.). This includes the current selected profile, previous data samples, the callback function provided by the driver and more. -Struct net_dim_sample describes a data sample, which will be compared to the -data sample stored in struct net_dim in order to decide on the algorithm's next +Struct dim_sample describes a data sample, which will be compared to the +data sample stored in struct dim in order to decide on the algorithm's next step. The sample should include bytes, packets and interrupts, measured by the driver. @@ -110,9 +110,9 @@ main net_dim() function. The recommended method is to call net_dim() on each interrupt. Since Net DIM has a built-in moderation and it might decide to skip iterations under certain conditions, there is no need to moderate the net_dim() calls as well. As mentioned above, the driver needs to provide an object of type -struct net_dim to the net_dim() function call. It is advised for each entity -using Net DIM to hold a struct net_dim as part of its data structure and use it -as the main Net DIM API object. The struct net_dim_sample should hold the latest +struct dim to the net_dim() function call. It is advised for each entity +using Net DIM to hold a struct dim as part of its data structure and use it +as the main Net DIM API object. The struct dim_sample should hold the latest bytes, packets and interrupts count. No need to perform any calculations, just include the raw data. @@ -132,19 +132,19 @@ usage is not complete but it should make the outline of the usage clear. my_driver.c: -#include +#include /* Callback for net DIM to schedule on a decision to change moderation */ void my_driver_do_dim_work(struct work_struct *work) { - /* Get struct net_dim from struct work_struct */ - struct net_dim *dim = container_of(work, struct net_dim, - work); + /* Get struct dim from struct work_struct */ + struct dim *dim = container_of(work, struct dim, + work); /* Do interrupt moderation related stuff */ ... /* Signal net DIM work is done and it should move to next iteration */ - dim->state = NET_DIM_START_MEASURE; + dim->state = DIM_START_MEASURE; } /* My driver's interrupt handler */ @@ -152,13 +152,13 @@ int my_driver_handle_interrupt(struct my_driver_entity *my_entity, ...) { ... /* A struct to hold current measured data */ - struct net_dim_sample dim_sample; + struct dim_sample dim_sample; ... /* Initiate data sample struct with current data */ - net_dim_sample(my_entity->events, - my_entity->packets, - my_entity->bytes, - &dim_sample); + dim_update_sample(my_entity->events, + my_entity->packets, + my_entity->bytes, + &dim_sample); /* Call net DIM */ net_dim(&my_entity->dim, dim_sample); ... From 29ee2701529e1905c0e948688f9688c68c8d4ea4 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 10 Oct 2019 10:16:09 +0200 Subject: [PATCH 2121/2880] net/smc: fix SMCD link group creation with VLAN id If creation of an SMCD link group with VLAN id fails, the initial smc_ism_get_vlan() step has to be reverted as well. Fixes: c6ba7c9ba43d ("net/smc: add base infrastructure for SMC-D and ISM") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4ca50ddf8d16..88556f0251ab 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -213,7 +213,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = SMC_CLC_DECL_MEM; - goto out; + goto ism_put_vlan; } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; @@ -289,6 +289,9 @@ clear_llc_lnk: smc_llc_link_clear(lnk); free_lgr: kfree(lgr); +ism_put_vlan: + if (ini->is_smcd && ini->vlan_id) + smc_ism_put_vlan(ini->ism_dev, ini->vlan_id); out: if (rc < 0) { if (rc == -ENOMEM) From 882dcfe5a1785c20f45820cbe6fec4b8b647c946 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 10 Oct 2019 10:16:10 +0200 Subject: [PATCH 2122/2880] net/smc: receive returns without data smc_cdc_rxed_any_close_or_senddone() is used as an end condition for the receive loop. This conflicts with smc_cdc_msg_recv_action() which could run in parallel and set the bits checked by smc_cdc_rxed_any_close_or_senddone() before the receive is processed. In that case we could return from receive with no data, although data is available. The same applies to smc_rx_wait(). Fix this by checking for RCV_SHUTDOWN only, which is set in smc_cdc_msg_recv_action() after the receive was actually processed. Fixes: 952310ccf2d8 ("smc: receive data from RMBE") Reviewed-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_rx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 413a6abf227e..000002642288 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -211,8 +211,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, rc = sk_wait_event(sk, timeo, sk->sk_err || sk->sk_shutdown & RCV_SHUTDOWN || - fcrit(conn) || - smc_cdc_rxed_any_close_or_senddone(conn), + fcrit(conn), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -310,7 +309,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, smc_rx_update_cons(smc, 0); if (sk->sk_shutdown & RCV_SHUTDOWN || - smc_cdc_rxed_any_close_or_senddone(conn) || conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) break; From 107529e31a87acd475ff6a0f82745821b8f70fec Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 10 Oct 2019 10:16:11 +0200 Subject: [PATCH 2123/2880] net/smc: receive pending data after RCV_SHUTDOWN smc_rx_recvmsg() first checks if data is available, and then if RCV_SHUTDOWN is set. There is a race when smc_cdc_msg_recv_action() runs in between these 2 checks, receives data and sets RCV_SHUTDOWN. In that case smc_rx_recvmsg() would return from receive without to process the available data. Fix that with a final check for data available if RCV_SHUTDOWN is set. Move the check for data into a function and call it twice. And use the existing helper smc_rx_data_available(). Fixes: 952310ccf2d8 ("smc: receive data from RMBE") Reviewed-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_rx.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 000002642288..97e8369002d7 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -261,6 +261,18 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, return -EAGAIN; } +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_rx_data_available(conn)) + return true; + else if (conn->urg_state == SMC_URG_VALID) + /* we received a single urgent Byte - skip */ + smc_rx_update_cons(smc, 0); + return false; +} + /* smc_rx_recvmsg - receive data from RMBE * @msg: copy data to receive buffer * @pipe: copy data to pipe if set - indicates splice() call @@ -302,15 +314,18 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (read_done >= target || (pipe && read_done)) break; - if (atomic_read(&conn->bytes_to_rcv)) + if (smc_rx_recvmsg_data_available(smc)) goto copy; - else if (conn->urg_state == SMC_URG_VALID) - /* we received a single urgent Byte - skip */ - smc_rx_update_cons(smc, 0); if (sk->sk_shutdown & RCV_SHUTDOWN || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) { + /* smc_cdc_msg_recv_action() could have run after + * above smc_rx_recvmsg_data_available() + */ + if (smc_rx_recvmsg_data_available(smc)) + goto copy; break; + } if (read_done) { if (sk->sk_err || From 7adf4eaf60f3d8c3584bed51fe7066d4dfc2cbe1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Oct 2019 21:42:58 -0600 Subject: [PATCH 2124/2880] io_uring: fix sequence logic for timeout requests We have two ways a request can be deferred: 1) It's a regular request that depends on another one 2) It's a timeout that tracks completions We have a shared helper to determine whether to defer, and that attempts to make the right decision based on the request. But we only have some of this information in the caller. Un-share the two timeout/defer helpers so the caller can use the right one. Fixes: 5262f567987d ("io_uring: IORING_OP_TIMEOUT support") Reported-by: yangerkun Reviewed-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2c44648217bd..38d274fc0f25 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -415,27 +415,27 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; } -static inline bool io_sequence_defer(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static inline bool __io_sequence_defer(struct io_ring_ctx *ctx, + struct io_kiocb *req) { - /* timeout requests always honor sequence */ - if (!(req->flags & REQ_F_TIMEOUT) && - (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) - return false; - return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; } -static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx, - struct list_head *list) +static inline bool io_sequence_defer(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) + return false; + + return __io_sequence_defer(ctx, req); +} + +static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) { struct io_kiocb *req; - if (list_empty(list)) - return NULL; - - req = list_first_entry(list, struct io_kiocb, list); - if (!io_sequence_defer(ctx, req)) { + req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); + if (req && !io_sequence_defer(ctx, req)) { list_del_init(&req->list); return req; } @@ -443,14 +443,17 @@ static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx, return NULL; } -static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) -{ - return __io_get_deferred_req(ctx, &ctx->defer_list); -} - static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) { - return __io_get_deferred_req(ctx, &ctx->timeout_list); + struct io_kiocb *req; + + req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); + if (req && !__io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } + + return NULL; } static void __io_commit_cqring(struct io_ring_ctx *ctx) From 2272905a4580f26630f7d652cc33935b59f96d4c Mon Sep 17 00:00:00 2001 From: Emmanuel Nicolet Date: Tue, 8 Oct 2019 16:13:42 +0200 Subject: [PATCH 2125/2880] spufs: fix a crash in spufs_create_root() The spu_fs_context was not set in fc->fs_private, this caused a crash when accessing ctx->mode in spufs_create_root(). Fixes: d2e0981c3b9a ("vfs: Convert spufs to use the new mount API") Signed-off-by: Emmanuel Nicolet Signed-off-by: Michael Ellerman Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20191008141342.GA266797@gmail.com --- arch/powerpc/platforms/cell/spufs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 1d93e55a2de1..2dd452a047cd 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -761,6 +761,7 @@ static int spufs_init_fs_context(struct fs_context *fc) ctx->gid = current_gid(); ctx->mode = 0755; + fc->fs_private = ctx; fc->s_fs_info = sbi; fc->ops = &spufs_context_ops; return 0; From 10deeac921647c929c67752d377f22e632d55d1c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 20 Sep 2019 10:44:47 -0700 Subject: [PATCH 2126/2880] MAINTAINERS: kgdb: Add myself as a reviewer for kgdb/kdb I'm interested in kdb / kgdb and have sent various fixes over the years. I'd like to get CCed on patches so I can be aware of them and also help review. Signed-off-by: Douglas Anderson Acked-by: Daniel Thompson Link: https://lore.kernel.org/r/20190920104404.1.I237e68e8825e2d6ac26f8e847f521fe2fcc3705a@changeid Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 296de2b51c83..acdd996774d6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9075,6 +9075,7 @@ F: security/keys/ KGDB / KDB /debug_core M: Jason Wessel M: Daniel Thompson +R: Douglas Anderson W: http://kgdb.wiki.kernel.org/ L: kgdb-bugreport@lists.sourceforge.net T: git git://git.kernel.org/pub/scm/linux/kernel/git/jwessel/kgdb.git From 442f1e746e8187b9deb1590176f6b0ff19686b11 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 30 Sep 2019 14:45:22 -0700 Subject: [PATCH 2127/2880] firmware: google: increment VPD key_len properly Commit 4b708b7b1a2c ("firmware: google: check if size is valid when decoding VPD data") adds length checks, but the new vpd_decode_entry() function botched the logic -- it adds the key length twice, instead of adding the key and value lengths separately. On my local system, this means vpd.c's vpd_section_create_attribs() hits an error case after the first attribute it parses, since it's no longer looking at the correct offset. With this patch, I'm back to seeing all the correct attributes in /sys/firmware/vpd/... Fixes: 4b708b7b1a2c ("firmware: google: check if size is valid when decoding VPD data") Cc: Cc: Hung-Te Lin Signed-off-by: Brian Norris Reviewed-by: Stephen Boyd Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20190930214522.240680-1-briannorris@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/google/vpd_decode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/google/vpd_decode.c b/drivers/firmware/google/vpd_decode.c index dda525c0f968..5c6f2a74f104 100644 --- a/drivers/firmware/google/vpd_decode.c +++ b/drivers/firmware/google/vpd_decode.c @@ -52,7 +52,7 @@ static int vpd_decode_entry(const u32 max_len, const u8 *input_buf, if (max_len - consumed < *entry_len) return VPD_FAIL; - consumed += decoded_len; + consumed += *entry_len; *_consumed = consumed; return VPD_OK; } From 062795fcdcb2d22822fb42644b1d76a8ad8439b3 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 8 Oct 2019 17:02:32 +0200 Subject: [PATCH 2128/2880] s390/uaccess: avoid (false positive) compiler warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Depending on inlining decisions by the compiler, __get/put_user_fn might become out of line. Then the compiler is no longer able to tell that size can only be 1,2,4 or 8 due to the check in __get/put_user resulting in false positives like ./arch/s390/include/asm/uaccess.h: In function ‘__put_user_fn’: ./arch/s390/include/asm/uaccess.h:113:9: warning: ‘rc’ may be used uninitialized in this function [-Wmaybe-uninitialized] 113 | return rc; | ^~ ./arch/s390/include/asm/uaccess.h: In function ‘__get_user_fn’: ./arch/s390/include/asm/uaccess.h:143:9: warning: ‘rc’ may be used uninitialized in this function [-Wmaybe-uninitialized] 143 | return rc; | ^~ These functions are supposed to be always inlined. Mark it as such. Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index bd2fd9a7821d..a470f1fa9f2a 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -83,7 +83,7 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n); __rc; \ }) -static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) +static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) { unsigned long spec = 0x010000UL; int rc; @@ -113,7 +113,7 @@ static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) return rc; } -static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) +static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) { unsigned long spec = 0x01UL; int rc; From c461e8df0c9bc46b3696dfb5d1df2f09d755af53 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 26 Sep 2019 13:43:10 -0400 Subject: [PATCH 2129/2880] tools/virtio: more stubs fix test module build. Signed-off-by: Michael S. Tsirkin --- tools/virtio/crypto/hash.h | 0 tools/virtio/linux/dma-mapping.h | 2 ++ 2 files changed, 2 insertions(+) create mode 100644 tools/virtio/crypto/hash.h diff --git a/tools/virtio/crypto/hash.h b/tools/virtio/crypto/hash.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/virtio/linux/dma-mapping.h b/tools/virtio/linux/dma-mapping.h index f91aeb5fe571..8f41cd6bd5c0 100644 --- a/tools/virtio/linux/dma-mapping.h +++ b/tools/virtio/linux/dma-mapping.h @@ -29,4 +29,6 @@ enum dma_data_direction { #define dma_unmap_single(...) do { } while (0) #define dma_unmap_page(...) do { } while (0) +#define dma_max_mapping_size(...) SIZE_MAX + #endif From 48f9bcf91461b43249949f4e172b5c0245dde751 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Oct 2019 09:46:41 -0700 Subject: [PATCH 2130/2880] net: sctp: Rename fallthrough label to unhandled fallthrough will become a pseudo reserved keyword so this only use of fallthrough is better renamed to allow it. Signed-off-by: Joe Perches Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Acked-by: Neil Horman Signed-off-by: Linus Torvalds --- net/sctp/sm_make_chunk.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index e41ed2e0ae7d..48d63956a68c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2155,7 +2155,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_SET_PRIMARY: if (ep->asconf_enable) break; - goto fallthrough; + goto unhandled; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ @@ -2166,11 +2166,11 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; - goto fallthrough; + goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association @@ -2187,7 +2187,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or @@ -2203,7 +2203,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - @@ -2226,7 +2226,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, retval = SCTP_IERROR_ABORT; } break; -fallthrough: +unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); From 294f69e662d1570703e9b56e95be37a9fd3afba5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Oct 2019 09:46:42 -0700 Subject: [PATCH 2131/2880] compiler_attributes.h: Add 'fallthrough' pseudo keyword for switch/case use Reserve the pseudo keyword 'fallthrough' for the ability to convert the various case block /* fallthrough */ style comments to appear to be an actual reserved word with the same gcc case block missing fallthrough warning capability. All switch/case blocks now should end in one of: break; fallthrough; goto