From 93e75faba3982767d425323aec5726282d3ad7a2 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 5 Aug 2010 22:23:16 +0200 Subject: [PATCH 01/27] PCI: Adjust confusing if indentation in pcie_get_readrq Indent the branch of an if. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r disable braces4@ position p1,p2; statement S1,S2; @@ ( if (...) { ... } | if (...) S1@p1 S2@p2 ) @script:python@ p1 << r.p1; p2 << r.p2; @@ if (p1[0].column == p2[0].column): cocci.print_main("branch",p1) cocci.print_secs("after",p2) // Signed-off-by: Julia Lawall Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7fa3cbd742c5..cc232c016ef9 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2689,7 +2689,7 @@ int pcie_get_readrq(struct pci_dev *dev) ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); if (!ret) - ret = 128 << ((ctl & PCI_EXP_DEVCTL_READRQ) >> 12); + ret = 128 << ((ctl & PCI_EXP_DEVCTL_READRQ) >> 12); return ret; } From 991f739544a0923b70fb69b115edb880ff9fcc4a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 4 Jul 2010 00:02:28 +0200 Subject: [PATCH 02/27] PCI: kill BKL in /proc/pci All operations in the pci procfs ioctl functions are atomic, so no lock is needed here. Also add a compat_ioctl method, since all the commands are compatible in 32 bit mode. 
Signed-off-by: Arnd Bergmann Cc: Jesse Barnes Cc: Tejun Heo Cc: linux-pci@vger.kernel.org Signed-off-by: Jesse Barnes --- drivers/pci/proc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 01f0306525a5..297b72c880a1 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -212,8 +212,6 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, #endif /* HAVE_PCI_MMAP */ int ret = 0; - lock_kernel(); - switch (cmd) { case PCIIOC_CONTROLLER: ret = pci_domain_nr(dev->bus); @@ -242,7 +240,6 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, break; }; - unlock_kernel(); return ret; } @@ -306,6 +303,7 @@ static const struct file_operations proc_bus_pci_operations = { .read = proc_bus_pci_read, .write = proc_bus_pci_write, .unlocked_ioctl = proc_bus_pci_ioctl, + .compat_ioctl = proc_bus_pci_ioctl, #ifdef HAVE_PCI_MMAP .open = proc_bus_pci_open, .release = proc_bus_pci_release, From 50c1126ee1990920705a067a6f3f9bb892369b08 Mon Sep 17 00:00:00 2001 From: Bill Pemberton Date: Tue, 3 Aug 2010 15:18:43 -0400 Subject: [PATCH 03/27] PCI: aerdrv: fix uninitialized variable warning quiet the warning about use of uninitialized e_src in aer_isr() e_src is initialized by get_e_source() Signed-off-by: Bill Pemberton Signed-off-by: Jesse Barnes --- drivers/pci/pcie/aer/aerdrv_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 29e268fadf14..43421fbe080a 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -754,7 +754,7 @@ void aer_isr(struct work_struct *work) { struct aer_rpc *rpc = container_of(work, struct aer_rpc, dpc_handler); struct pcie_device *p_device = rpc->rpd; - struct aer_err_source e_src; + struct aer_err_source uninitialized_var(e_src); mutex_lock(&rpc->rpc_mutex); while (get_e_source(rpc, &e_src)) From 
5a37f1381f1d8625fa458360c9b5d17f0c5f1dea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Sep 2010 14:32:38 +0000 Subject: [PATCH 04/27] PCI hotplug: ibmphp-hpc: semaphore cleanup Get rid of init_MUTEX[_LOCKED]() and use sema_init() instead. Signed-off-by: Thomas Gleixner Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/ibmphp_hpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/hotplug/ibmphp_hpc.c b/drivers/pci/hotplug/ibmphp_hpc.c index 1aaf3f32d3cd..f59ed30512b5 100644 --- a/drivers/pci/hotplug/ibmphp_hpc.c +++ b/drivers/pci/hotplug/ibmphp_hpc.c @@ -133,8 +133,8 @@ void __init ibmphp_hpc_initvars (void) debug ("%s - Entry\n", __func__); mutex_init(&sem_hpcaccess); - init_MUTEX (&semOperations); - init_MUTEX_LOCKED (&sem_exit); + sema_init(&semOperations, 1); + sema_init(&sem_exit, 0); to_debug = 0; debug ("%s - Exit\n", __func__); From 42b219322a97ccef347388b233aceaafe3fa517d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 2 Sep 2010 14:28:51 -0700 Subject: [PATCH 05/27] PCI: pci_driver make name const The name field in pci_driver should be const, it is not modified by PCI subsystem. Signed-off-by: Stephen Hemminger Signed-off-by: Jesse Barnes --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index c8d95e369ff4..30faf4f3db0b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -541,7 +541,7 @@ struct pci_error_handlers { struct module; struct pci_driver { struct list_head node; - char *name; + const char *name; const struct pci_device_id *id_table; /* must be non-NULL for probe to be called */ int (*probe) (struct pci_dev *dev, const struct pci_device_id *id); /* New device inserted */ void (*remove) (struct pci_dev *dev); /* Device removed (NULL if not a hot-plug capable driver) */ From b22c3d82757109fa107ce17ba9484d45273eed05 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 20 Sep 2010 18:50:00 +0200 Subject: [PATCH 06/27] PCI/PCIe/AER: Disable native AER service if BIOS has precedence There is a design issue related to PCIe AER and _OSC that the BIOS may be asked to grant control of the AER service even if some Hardware Error Source Table (HEST) entries contain information meaning that the BIOS really should control it. Namely, pcie_port_acpi_setup() calls pcie_aer_get_firmware_first() that determines whether or not the AER service should be controlled by the BIOS on the basis of the HEST information for the given PCIe port. The BIOS is asked to grant control of the AER service for a PCIe Root Complex if pcie_aer_get_firmware_first() returns 'false' for at least one root port in that complex, even if all of the other root ports' HEST entries have the FIRMWARE_FIRST flag set (and none of them has the GLOBAL flag set). However, if the AER service is controlled by the kernel, that may interfere with the BIOS' handling of the error sources having the FIRMWARE_FIRST flag. Moreover, there may be PCIe endpoints that have the FIRMWARE_FIRST flag set in HEST and are attached to the root ports in question, in which case it also may be unsafe to ask the BIOS for control of the AER service. For this reason, introduce a function checking if there's at least one PCIe-related HEST entry with the FIRMWARE_FIRST flag set and disable the native AER service altogether if this function returns 'true'. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pcie/aer/aerdrv.c | 2 +- drivers/pci/pcie/aer/aerdrv.h | 3 +++ drivers/pci/pcie/aer/aerdrv_acpi.c | 34 ++++++++++++++++++++++++++++++ drivers/pci/pcie/portdrv_acpi.c | 2 +- 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index f409948e1a9b..2b2b6508efde 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -416,7 +416,7 @@ static void aer_error_resume(struct pci_dev *dev) */ static int __init aer_service_init(void) { - if (!pci_aer_available()) + if (!pci_aer_available() || aer_acpi_firmware_first()) return -ENXIO; return pcie_port_service_register(&aerdriver); } diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h index 80c11d131499..9656e3060412 100644 --- a/drivers/pci/pcie/aer/aerdrv.h +++ b/drivers/pci/pcie/aer/aerdrv.h @@ -132,6 +132,7 @@ static inline int aer_osc_setup(struct pcie_device *pciedev) #ifdef CONFIG_ACPI_APEI extern int pcie_aer_get_firmware_first(struct pci_dev *pci_dev); +extern bool aer_acpi_firmware_first(void); #else static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev) { @@ -139,6 +140,8 @@ static inline int pcie_aer_get_firmware_first(struct pci_dev *pci_dev) return pci_dev->__aer_firmware_first; return 0; } + +static inline bool aer_acpi_firmware_first(void) { return false; } #endif static inline void pcie_aer_force_firmware_first(struct pci_dev *pci_dev, diff --git a/drivers/pci/pcie/aer/aerdrv_acpi.c b/drivers/pci/pcie/aer/aerdrv_acpi.c index 2bb9b8972211..275bf158ffa7 100644 --- a/drivers/pci/pcie/aer/aerdrv_acpi.c +++ b/drivers/pci/pcie/aer/aerdrv_acpi.c @@ -93,4 +93,38 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev) aer_set_firmware_first(dev); return dev->__aer_firmware_first; } + +static bool aer_firmware_first; + +static int aer_hest_parse_aff(struct acpi_hest_header *hest_hdr, void *data) +{ + struct acpi_hest_aer_common *p; + + 
if (aer_firmware_first) + return 0; + + switch (hest_hdr->type) { + case ACPI_HEST_TYPE_AER_ROOT_PORT: + case ACPI_HEST_TYPE_AER_ENDPOINT: + case ACPI_HEST_TYPE_AER_BRIDGE: + p = (struct acpi_hest_aer_common *)(hest_hdr + 1); + aer_firmware_first = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST); + default: + return 0; + } +} + +/** + * aer_acpi_firmware_first - Check if APEI should control AER. + */ +bool aer_acpi_firmware_first(void) +{ + static bool parsed = false; + + if (!parsed) { + apei_hest_parse(aer_hest_parse_aff, NULL); + parsed = true; + } + return aer_firmware_first; +} #endif diff --git a/drivers/pci/pcie/portdrv_acpi.c b/drivers/pci/pcie/portdrv_acpi.c index b7c4cb1ccb23..5982b6a63b89 100644 --- a/drivers/pci/pcie/portdrv_acpi.c +++ b/drivers/pci/pcie/portdrv_acpi.c @@ -49,7 +49,7 @@ int pcie_port_acpi_setup(struct pci_dev *port, int *srv_mask) | OSC_PCI_EXPRESS_PME_CONTROL; if (pci_aer_available()) { - if (pcie_aer_get_firmware_first(port)) + if (aer_acpi_firmware_first()) dev_dbg(&port->dev, "PCIe errors handled by BIOS.\n"); else flags |= OSC_PCI_EXPRESS_AER_CONTROL; From 66db60eaf158aa953651d03e43e931e757e87262 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 21 Sep 2010 13:54:39 -0400 Subject: [PATCH 07/27] PCI: add quirk for non-symmetric-mode irq routing to versions 0 and 4 of the MCP55 northbridge A long time ago I worked on a RHEL5 bug in which kdump hung during boot on a set of systems. The systems hung because they never received timer interrupts during calibrate_delay. These systems also all had Opteron processors on a hypertransport bus, bridged to a pci bus via an Nvidia MCP55 northbridge chip. After much wrangling I managed to learn from Nvidia that they have an undocumented register in some versions of that chip which controls how legacy interrupts are sent to the cpu complex when the ioapic isn't active. 
Nvidia defaults this register to only send legacy interrupts to the BSP, so if kdump happens to boot on an AP, we never get timer interrupts and boom. I had initially used this quirk as a workaround, with my intent being to move apic initialization to an earlier point in the boot process, so the setting of the register would be irrelevant. Given the work involved in doing that however, the fragile nature of the apic initialization code, and the fact that, over the 2 years since we found this bug, the MCP55 is the only chip which seems to have this issue, I've figured at this point it's likely safer to just carry the quirk around. By setting the referenced bits in this hidden register, interrupts will be broadcast to all cpus when the ioapic isn't active on the above described systems. Acked-by: Simon Horman Acked-by: Vivek Goyal Signed-off-by: Neil Horman Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 31 +++++++++++++++++++++++++++++++ include/linux/pci_ids.h | 2 ++ 2 files changed, 33 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 857ae01734a6..034430690a5b 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2296,6 +2296,37 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_15, nvenet_msi_disable); +/* + * Some versions of the MCP55 bridge from nvidia have a legacy irq routing + * config register. This register controls the routing of legacy interrupts + * from devices that route through the MCP55. If this register is misprogrammed + * interrupts are only sent to the bsp, unlike conventional systems where the + * irq is broadcast to all online cpus. Not having this register set + * properly prevents kdump from booting up properly, so let's make sure that + * we have it set correctly. + * Note this is an undocumented register. 
+ */ +static void __devinit nvbridge_check_legacy_irq_routing(struct pci_dev *dev) +{ + u32 cfg; + + pci_read_config_dword(dev, 0x74, &cfg); + + if (cfg & ((1 << 2) | (1 << 15))) { + printk(KERN_INFO "Rewriting irq routing register on MCP55\n"); + cfg &= ~((1 << 2) | (1 << 15)); + pci_write_config_dword(dev, 0x74, cfg); + } +} + +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, + PCI_DEVICE_ID_NVIDIA_MCP55_BRIDGE_V0, + nvbridge_check_legacy_irq_routing); + +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, + PCI_DEVICE_ID_NVIDIA_MCP55_BRIDGE_V4, + nvbridge_check_legacy_irq_routing); + static int __devinit ht_check_msi_mapping(struct pci_dev *dev) { int pos, ttl = 48; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 570fddeb0388..dc2827723c1e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1249,6 +1249,8 @@ #define PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5700_2 0x0348 #define PCI_DEVICE_ID_NVIDIA_QUADRO_FX_GO1000 0x034C #define PCI_DEVICE_ID_NVIDIA_QUADRO_FX_1100 0x034E +#define PCI_DEVICE_ID_NVIDIA_MCP55_BRIDGE_V0 0x0360 +#define PCI_DEVICE_ID_NVIDIA_MCP55_BRIDGE_V4 0x0364 #define PCI_DEVICE_ID_NVIDIA_NVENET_15 0x0373 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA 0x03E7 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SMBUS 0x03EB From 80e7b19ae167197e84f378809b8ccddd0f99c1fd Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 23 Sep 2010 17:28:04 +0100 Subject: [PATCH 08/27] PCI: OLPC: Only enable PCI configuration type override on XO-1 This configuration type override is for XO-1 only and must not happen on XO-1.5. 
Acked-by: Andres Salomon Signed-off-by: Daniel Drake Signed-off-by: Jesse Barnes --- arch/x86/Kconfig | 2 +- arch/x86/kernel/olpc.c | 6 ++++-- arch/x86/pci/olpc.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..0ed4c9bfcd13 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1900,7 +1900,7 @@ config PCI_GODIRECT bool "Direct" config PCI_GOOLPC - bool "OLPC" + bool "OLPC XO-1" depends on OLPC config PCI_GOANY diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 0e0cdde519be..635888cf050d 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -242,8 +242,10 @@ static int __init olpc_init(void) (unsigned char *) &olpc_platform_info.ecver, 1); #ifdef CONFIG_PCI_OLPC - /* If the VSA exists let it emulate PCI, if not emulate in kernel */ - if (!cs5535_has_vsa2()) + /* If the VSA exists let it emulate PCI, if not emulate in kernel. + * XO-1 only. */ + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && + !cs5535_has_vsa2()) x86_init.pci.arch_init = pci_olpc_init; #endif diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b34815408f58..13700ec8e2e4 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = { int __init pci_olpc_init(void) { - printk(KERN_INFO "PCI: Using configuration type OLPC\n"); + printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); return 0; From 25143fd1270d28782ae0620aa86ef5f8c14030fd Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Fri, 10 Sep 2010 16:36:39 -0700 Subject: [PATCH 09/27] x86/PCI: irq and pci_ids patch for Intel Patsburg DeviceIDs This patch adds the LPC Controller DeviceIDs for the Intel Patsburg PCH. 
Signed-off-by: Seth Heasley Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 1 + include/linux/pci_ids.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index f547ee05f715..ee7fc8fc8a83 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -589,6 +589,7 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: + case PCI_DEVICE_ID_INTEL_PBG_LPC: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index dc2827723c1e..b9ff2801cf76 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2436,6 +2436,7 @@ #define PCI_DEVICE_ID_INTEL_CPT_SMBUS 0x1c22 #define PCI_DEVICE_ID_INTEL_CPT_LPC_MIN 0x1c41 #define PCI_DEVICE_ID_INTEL_CPT_LPC_MAX 0x1c5f +#define PCI_DEVICE_ID_INTEL_PBG_LPC 0x1d40 #define PCI_DEVICE_ID_INTEL_82801AA_0 0x2410 #define PCI_DEVICE_ID_INTEL_82801AA_1 0x2411 #define PCI_DEVICE_ID_INTEL_82801AA_3 0x2413 From 7473fbf4a016301bfa3faa4f81c9a9c978330359 Mon Sep 17 00:00:00 2001 From: Anders Wallin Date: Thu, 23 Sep 2010 19:39:04 +0200 Subject: [PATCH 10/27] PCI: add PCI vendor id for STmicroelectronics Signed-off-by: Anders Wallin Signed-off-by: Jesse Barnes --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index b9ff2801cf76..ea5a3d19aaba 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -765,6 +765,8 @@ #define PCI_DEVICE_ID_ELSA_MICROLINK 0x1000 #define PCI_DEVICE_ID_ELSA_QS3000 0x3000 +#define PCI_VENDOR_ID_STMICRO 0x104A + #define PCI_VENDOR_ID_BUSLOGIC 0x104B #define PCI_DEVICE_ID_BUSLOGIC_MULTIMASTER_NC 0x0140 #define PCI_DEVICE_ID_BUSLOGIC_MULTIMASTER 0x1040 From db5004195481fcb500c929bd3a0e1c0c48eec527 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 13 Oct 2010 15:00:23 
+0900 Subject: [PATCH 11/27] PCI: add PCI_MSIX_TABLE/PBA defines These are already defined in pcilib's pci/header.h but not in kernel's linux/pci_regs.h. Copy them to avoid using magic numbers. Signed-off-by: Hidetoshi Seto Signed-off-by: Jesse Barnes --- drivers/pci/msi.h | 4 ++-- include/linux/pci_regs.h | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h index de27c1cb5a2b..feff3bee6fe5 100644 --- a/drivers/pci/msi.h +++ b/drivers/pci/msi.h @@ -22,8 +22,8 @@ #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) -#define msix_table_offset_reg(base) (base + 0x04) -#define msix_pba_offset_reg(base) (base + 0x08) +#define msix_table_offset_reg(base) (base + PCI_MSIX_TABLE) +#define msix_pba_offset_reg(base) (base + PCI_MSIX_PBA) #define msix_table_size(control) ((control & PCI_MSIX_FLAGS_QSIZE)+1) #define multi_msix_capable(control) msix_table_size((control)) diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index 455b9ccdfca7..af83076c31a6 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -300,12 +300,14 @@ #define PCI_MSI_DATA_64 12 /* 16 bits of data for 64-bit devices */ #define PCI_MSI_MASK_64 16 /* Mask bits register for 64-bit devices */ -/* MSI-X registers (these are at offset PCI_MSIX_FLAGS) */ +/* MSI-X registers */ #define PCI_MSIX_FLAGS 2 #define PCI_MSIX_FLAGS_QSIZE 0x7FF #define PCI_MSIX_FLAGS_ENABLE (1 << 15) #define PCI_MSIX_FLAGS_MASKALL (1 << 14) -#define PCI_MSIX_FLAGS_BIRMASK (7 << 0) +#define PCI_MSIX_TABLE 4 +#define PCI_MSIX_PBA 8 +#define PCI_MSIX_FLAGS_BIRMASK (7 << 0) /* CompactPCI Hotswap Register */ From 350a55e9ff6005032407d3234af800f413b03af5 Mon Sep 17 00:00:00 2001 From: matt mooney Date: Fri, 24 Sep 2010 12:17:26 -0700 Subject: [PATCH 12/27] PCI: use new ccflags variable in Makefile Replace EXTRA_CFLAGS with ccflags-y. 
Signed-off-by: matt mooney Signed-off-by: Jesse Barnes --- drivers/pci/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index dc1aa0922868..dcd7ace9221e 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -65,6 +65,4 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o obj-$(CONFIG_PCI_STUB) += pci-stub.o -ifeq ($(CONFIG_PCI_DEBUG),y) -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_PCI_DEBUG) := -DDEBUG From cb04e95bdd0bfd618ab731c84a3ab56b56974df8 Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Mon, 4 Oct 2010 13:27:14 -0700 Subject: [PATCH 13/27] PCI: update Intel chipset names and defines This patch updates the defines for Intel devices in include/linux/pci_ids.h, referenced in arch/x86/pci/irq.c and drivers/i2c/busses/i2c-i801.c, reflecting approved legal branding, and using fuller code-names for products under development. Acked-by: Jean Delvare Signed-off-by: Seth Heasley Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 12 ++++++------ drivers/i2c/busses/Kconfig | 4 ++-- drivers/i2c/busses/i2c-i801.c | 10 +++++----- include/linux/pci_ids.h | 18 +++++++++--------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index ee7fc8fc8a83..9f9bfb705cf9 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -584,28 +584,28 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH9_3: case PCI_DEVICE_ID_INTEL_ICH9_4: case PCI_DEVICE_ID_INTEL_ICH9_5: - case PCI_DEVICE_ID_INTEL_TOLAPAI_0: + case PCI_DEVICE_ID_INTEL_EP80579_0: case PCI_DEVICE_ID_INTEL_ICH10_0: case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: - case PCI_DEVICE_ID_INTEL_PBG_LPC: + case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && - (device 
<= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 6539ac2907e9..fd455a2fdd12 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -95,9 +95,9 @@ config I2C_I801 ESB2 ICH8 ICH9 - Tolapai + EP80579 (Tolapai) ICH10 - 3400/5 Series (PCH) + 5/3400 Series (PCH) Cougar Point (PCH) This driver can also be built as a module. If so, the module diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index c60081169cc3..59d65981eed7 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -38,10 +38,10 @@ 82801G (ICH7) 0x27da 32 hard yes yes yes 82801H (ICH8) 0x283e 32 hard yes yes yes 82801I (ICH9) 0x2930 32 hard yes yes yes - Tolapai 0x5032 32 hard yes yes yes + EP80579 (Tolapai) 0x5032 32 hard yes yes yes ICH10 0x3a30 32 hard yes yes yes ICH10 0x3a60 32 hard yes yes yes - 3400/5 Series (PCH) 0x3b30 32 hard yes yes yes + 5/3400 Series (PCH) 0x3b30 32 hard yes yes yes Cougar Point (PCH) 0x1c22 32 hard yes yes yes Features supported by this driver: @@ -587,11 +587,11 @@ static const struct pci_device_id i801_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_17) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_5) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_6) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TOLAPAI_1) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_EP80579_1) }, { 
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH10_4) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH10_5) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PCH_SMBUS) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CPT_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5_3400_SERIES_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_COUGARPOINT_SMBUS) }, { 0, } }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ea5a3d19aaba..bb6daa5f8240 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2435,10 +2435,10 @@ #define PCI_DEVICE_ID_INTEL_82840_HB 0x1a21 #define PCI_DEVICE_ID_INTEL_82845_HB 0x1a30 #define PCI_DEVICE_ID_INTEL_IOAT 0x1a38 -#define PCI_DEVICE_ID_INTEL_CPT_SMBUS 0x1c22 -#define PCI_DEVICE_ID_INTEL_CPT_LPC_MIN 0x1c41 -#define PCI_DEVICE_ID_INTEL_CPT_LPC_MAX 0x1c5f -#define PCI_DEVICE_ID_INTEL_PBG_LPC 0x1d40 +#define PCI_DEVICE_ID_INTEL_COUGARPOINT_SMBUS 0x1c22 +#define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN 0x1c41 +#define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX 0x1c5f +#define PCI_DEVICE_ID_INTEL_PATSBURG_LPC 0x1d40 #define PCI_DEVICE_ID_INTEL_82801AA_0 0x2410 #define PCI_DEVICE_ID_INTEL_82801AA_1 0x2411 #define PCI_DEVICE_ID_INTEL_82801AA_3 0x2413 @@ -2644,9 +2644,9 @@ #define PCI_DEVICE_ID_INTEL_ICH10_3 0x3a1a #define PCI_DEVICE_ID_INTEL_ICH10_4 0x3a30 #define PCI_DEVICE_ID_INTEL_ICH10_5 0x3a60 -#define PCI_DEVICE_ID_INTEL_PCH_LPC_MIN 0x3b00 -#define PCI_DEVICE_ID_INTEL_PCH_LPC_MAX 0x3b1f -#define PCI_DEVICE_ID_INTEL_PCH_SMBUS 0x3b30 +#define PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN 0x3b00 +#define PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX 0x3b1f +#define PCI_DEVICE_ID_INTEL_5_3400_SERIES_SMBUS 0x3b30 #define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f #define PCI_DEVICE_ID_INTEL_5100_16 0x65f0 #define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 @@ -2655,8 +2655,8 @@ #define PCI_DEVICE_ID_INTEL_5400_FBD0 0x4035 #define PCI_DEVICE_ID_INTEL_5400_FBD1 0x4036 
#define PCI_DEVICE_ID_INTEL_IOAT_SCNB 0x65ff -#define PCI_DEVICE_ID_INTEL_TOLAPAI_0 0x5031 -#define PCI_DEVICE_ID_INTEL_TOLAPAI_1 0x5032 +#define PCI_DEVICE_ID_INTEL_EP80579_0 0x5031 +#define PCI_DEVICE_ID_INTEL_EP80579_1 0x5032 #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 #define PCI_DEVICE_ID_INTEL_82371SB_1 0x7010 #define PCI_DEVICE_ID_INTEL_82371SB_2 0x7020 From 2c6413aee215a43b1f95e218067abcde50ccbc5e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 29 Sep 2010 12:23:21 -0600 Subject: [PATCH 14/27] PCI: log vendor/device ID always Previously we had to have CONFIG_PCI_DEBUG=y or CONFIG_DYNAMIC_DEBUG=y to turn on this printk, but I think the IDs are valuable enough that it's worth putting them in the log always. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 12625d90f8b5..c84900da3c59 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -961,8 +961,8 @@ int pci_setup_device(struct pci_dev *dev) dev->class = class; class >>= 8; - dev_dbg(&dev->dev, "found [%04x:%04x] class %06x header type %02x\n", - dev->vendor, dev->device, class, dev->hdr_type); + dev_printk(KERN_DEBUG, &dev->dev, "[%04x:%04x] type %d class %#08x\n", + dev->vendor, dev->device, dev->hdr_type, class); /* need to have dev->class ready */ dev->cfg_size = pci_cfg_space_size(dev); From 1bcd495be9ed3194f618e8af0446459dc52a1423 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 29 Sep 2010 12:23:54 -0600 Subject: [PATCH 15/27] PCI: fix message typo I missed the closing parenthesis on "(PCI address ...)". 
Acked-by: Arnd Bergmann Reported-by: Peter Maydell Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- drivers/pci/setup-res.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 2aaa13150de3..bc0e6eea0fff 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -85,7 +85,7 @@ void pci_update_resource(struct pci_dev *dev, int resno) } } res->flags &= ~IORESOURCE_UNSET; - dev_info(&dev->dev, "BAR %d: set to %pR (PCI address [%#llx-%#llx]\n", + dev_info(&dev->dev, "BAR %d: set to %pR (PCI address [%#llx-%#llx])\n", resno, res, (unsigned long long)region.start, (unsigned long long)region.end); } From bf4d29086972ceaeaf72544d8f64933c2cfdc992 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 4 Oct 2010 14:22:26 -0400 Subject: [PATCH 16/27] PCI: Export some PCI PM functionality It's helpful to have some extra PCI power management functions available to platform code, so move the declarations to an exported header. Acked-by: Rafael J. 
Wysocki Signed-off-by: Matthew Garrett Signed-off-by: Jesse Barnes --- drivers/pci/pci.h | 3 --- include/linux/pci.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 6beb11b617a9..f5c7c382765f 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -63,11 +63,8 @@ struct pci_platform_pm_ops { extern int pci_set_platform_pm(struct pci_platform_pm_ops *ops); extern void pci_update_current_state(struct pci_dev *dev, pci_power_t state); extern void pci_disable_enabled_device(struct pci_dev *dev); -extern bool pci_check_pme_status(struct pci_dev *dev); extern int pci_finish_runtime_suspend(struct pci_dev *dev); -extern void pci_wakeup_event(struct pci_dev *dev); extern int __pci_pme_wakeup(struct pci_dev *dev, void *ign); -extern void pci_pme_wakeup_bus(struct pci_bus *bus); extern void pci_pm_init(struct pci_dev *dev); extern void platform_pci_wakeup_init(struct pci_dev *dev); extern void pci_allocate_cap_save_buffers(struct pci_dev *dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 30faf4f3db0b..7454408c41b6 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -819,6 +819,9 @@ pci_power_t pci_target_state(struct pci_dev *dev); int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); +bool pci_check_pme_status(struct pci_dev *dev); +void pci_wakeup_event(struct pci_dev *dev); +void pci_pme_wakeup_bus(struct pci_bus *bus); static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable) From df17e62e5bff60aeefd0e81165c62f9e46f33217 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 4 Oct 2010 14:22:29 -0400 Subject: [PATCH 17/27] PCI: Add support for polling PME state on suspended legacy PCI devices Not all hardware vendors hook up the PME line for legacy PCI devices, meaning that wakeup events get lost. 
The only way around this is to poll the devices to see if their state has changed, so add support for doing that on legacy PCI devices that aren't part of the core chipset. Acked-by: Rafael J. Wysocki Signed-off-by: Matthew Garrett Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index cc232c016ef9..e98c8104297b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -38,6 +38,19 @@ EXPORT_SYMBOL(pci_pci_problems); unsigned int pci_pm_d3_delay; +static void pci_pme_list_scan(struct work_struct *work); + +static LIST_HEAD(pci_pme_list); +static DEFINE_MUTEX(pci_pme_list_mutex); +static DECLARE_DELAYED_WORK(pci_pme_work, pci_pme_list_scan); + +struct pci_pme_device { + struct list_head list; + struct pci_dev *dev; +}; + +#define PME_TIMEOUT 1000 /* How long between PME checks */ + static void pci_dev_d3_sleep(struct pci_dev *dev) { unsigned int delay = dev->d3_delay; @@ -1331,6 +1344,32 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state) return !!(dev->pme_support & (1 << state)); } +static void pci_pme_list_scan(struct work_struct *work) +{ + struct pci_pme_device *pme_dev; + + mutex_lock(&pci_pme_list_mutex); + if (!list_empty(&pci_pme_list)) { + list_for_each_entry(pme_dev, &pci_pme_list, list) + pci_pme_wakeup(pme_dev->dev, NULL); + schedule_delayed_work(&pci_pme_work, msecs_to_jiffies(PME_TIMEOUT)); + } + mutex_unlock(&pci_pme_list_mutex); +} + +/** + * pci_external_pme - is a device an external PCI PME source? + * @dev: PCI device to check + * + */ + +static bool pci_external_pme(struct pci_dev *dev) +{ + if (pci_is_pcie(dev) || dev->bus->number == 0) + return false; + return true; +} + /** * pci_pme_active - enable or disable PCI device's PME# function * @dev: PCI device to handle. 
@@ -1354,6 +1393,44 @@ void pci_pme_active(struct pci_dev *dev, bool enable) pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr); + /* PCI (as opposed to PCIe) PME requires that the device have + its PME# line hooked up correctly. Not all hardware vendors + do this, so the PME never gets delivered and the device + remains asleep. The easiest way around this is to + periodically walk the list of suspended devices and check + whether any have their PME flag set. The assumption is that + we'll wake up often enough anyway that this won't be a huge + hit, and the power savings from the devices will still be a + win. */ + + if (pci_external_pme(dev)) { + struct pci_pme_device *pme_dev; + if (enable) { + pme_dev = kmalloc(sizeof(struct pci_pme_device), + GFP_KERNEL); + if (!pme_dev) + goto out; + pme_dev->dev = dev; + mutex_lock(&pci_pme_list_mutex); + list_add(&pme_dev->list, &pci_pme_list); + if (list_is_singular(&pci_pme_list)) + schedule_delayed_work(&pci_pme_work, + msecs_to_jiffies(PME_TIMEOUT)); + mutex_unlock(&pci_pme_list_mutex); + } else { + mutex_lock(&pci_pme_list_mutex); + list_for_each_entry(pme_dev, &pci_pme_list, list) { + if (pme_dev->dev == dev) { + list_del(&pme_dev->list); + kfree(pme_dev); + break; + } + } + mutex_unlock(&pci_pme_list_mutex); + } + } + +out: dev_printk(KERN_DEBUG, &dev->dev, "PME# %s\n", enable ? "enabled" : "disabled"); } From 1ca98fa652bb5dc3c8793335db9ccc5d0f2e1f65 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 4 Oct 2010 12:49:24 -0600 Subject: [PATCH 18/27] x86/PCI: MMCONFIG: fix region end calculation The end of an MMCONFIG region depends on the ending bus number, not on the number of buses the region covers. 
We previously computed the wrong ending address whenever the starting bus number was non-zero, e.g.: MMCONFIG for [bus 00-1f] at [mem 0xe0000000-0xe1ffffff] (base 0xe0000000) MMCONFIG for [bus 20-3f] at [mem 0xe2000000-0xe1ffffff] (base 0xe0000000) The correct regions are: MMCONFIG for [bus 00-1f] at [mem 0xe0000000-0xe1ffffff] (base 0xe0000000) MMCONFIG for [bus 20-3f] at [mem 0xe2000000-0xe3ffffff] (base 0xe0000000) Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index a918553ebc75..e282886616a0 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -65,7 +65,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, int end, u64 addr) { struct pci_mmcfg_region *new; - int num_buses; struct resource *res; if (addr == 0) @@ -82,10 +81,9 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, list_add_sorted(new); - num_buses = end - start + 1; res = &new->res; res->start = addr + PCI_MMCFG_BUS_OFFSET(start); - res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); From a9cea017411c95ec789092971f9baaef1f826883 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:13 -0600 Subject: [PATCH 19/27] resources: add a default alignf to simplify find_resource() This removes a test from find_resource(), which is getting cluttered. No functional change.
Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 7b36976e5dea..7dc8ad24f915 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -357,6 +357,14 @@ int __weak page_is_ram(unsigned long pfn) return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } +static resource_size_t simple_align_resource(void *data, + const struct resource *avail, + resource_size_t size, + resource_size_t align) +{ + return avail->start; +} + /* * Find empty slot in the resource tree given range and alignment. */ @@ -391,8 +399,8 @@ static int find_resource(struct resource *root, struct resource *new, if (tmp.end > max) tmp.end = max; tmp.start = ALIGN(tmp.start, align); - if (alignf) - tmp.start = alignf(alignf_data, &tmp, size, align); + + tmp.start = alignf(alignf_data, &tmp, size, align); if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { new->start = tmp.start; new->end = tmp.start + size - 1; @@ -428,6 +436,9 @@ int allocate_resource(struct resource *root, struct resource *new, { int err; + if (!alignf) + alignf = simple_align_resource; + write_lock(&resource_lock); err = find_resource(root, new, size, min, max, align, alignf, alignf_data); if (err >= 0 && __request_resource(root, new)) From 5d6b1fa301b13cc651ee717a9b518124dea2f814 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:18 -0600 Subject: [PATCH 20/27] resources: factor out resource_clip() to simplify find_resource() This factors out the min/max clipping to simplify find_resource(). No functional change. 
Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 7dc8ad24f915..26e9f2546923 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -365,6 +365,15 @@ static resource_size_t simple_align_resource(void *data, return avail->start; } +static void resource_clip(struct resource *res, resource_size_t min, + resource_size_t max) +{ + if (res->start < min) + res->start = min; + if (res->end > max) + res->end = max; +} + /* * Find empty slot in the resource tree given range and alignment. */ @@ -394,10 +403,8 @@ static int find_resource(struct resource *root, struct resource *new, tmp.end = this->start - 1; else tmp.end = root->end; - if (tmp.start < min) - tmp.start = min; - if (tmp.end > max) - tmp.end = max; + + resource_clip(&tmp, min, max); tmp.start = ALIGN(tmp.start, align); tmp.start = alignf(alignf_data, &tmp, size, align); From 6909ba14c25b4db6be2ff89f4fa0fac2d70151a0 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:23 -0600 Subject: [PATCH 21/27] resources: ensure callback doesn't allocate outside available space The alignment callback returns a proposed location, which may have been adjusted to avoid ISA aliases or for other architecture-specific reasons. We already had a check ("tmp.start < tmp.end") to make sure the callback doesn't return an area that extends past the available area. This patch reworks the check to make sure it doesn't return an area that extends either below or above the available area. 
Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 26e9f2546923..89d50412508c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -374,6 +374,11 @@ static void resource_clip(struct resource *res, resource_size_t min, res->end = max; } +static bool resource_contains(struct resource *res1, struct resource *res2) +{ + return res1->start <= res2->start && res1->end >= res2->end; +} + /* * Find empty slot in the resource tree given range and alignment. */ @@ -387,7 +392,7 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; - struct resource tmp = *new; + struct resource tmp = *new, alloc; tmp.start = root->start; /* @@ -407,10 +412,11 @@ static int find_resource(struct resource *root, struct resource *new, resource_clip(&tmp, min, max); tmp.start = ALIGN(tmp.start, align); - tmp.start = alignf(alignf_data, &tmp, size, align); - if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { - new->start = tmp.start; - new->end = tmp.start + size - 1; + alloc.start = alignf(alignf_data, &tmp, size, align); + alloc.end = alloc.start + size - 1; + if (resource_contains(&tmp, &alloc)) { + new->start = alloc.start; + new->end = alloc.end; return 0; } if (!this) From a1862e31079149a52b6223776228c3aee493d4a7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:28 -0600 Subject: [PATCH 22/27] resources: handle overflow when aligning start of available area If tmp.start is near ~0, ALIGN(tmp.start) may overflow, which would make us think there's more available space than there really is. We would likely return something that conflicts with a previous resource, which would cause a failure when allocate_resource() requests the newly- allocated region. 
Reference: https://bugzilla.redhat.com/show_bug.cgi?id=646027 Reported-by: Fabrice Bellet Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 89d50412508c..e15b922d4ba4 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -392,7 +392,7 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; - struct resource tmp = *new, alloc; + struct resource tmp = *new, avail, alloc; tmp.start = root->start; /* @@ -410,14 +410,19 @@ static int find_resource(struct resource *root, struct resource *new, tmp.end = root->end; resource_clip(&tmp, min, max); - tmp.start = ALIGN(tmp.start, align); - alloc.start = alignf(alignf_data, &tmp, size, align); - alloc.end = alloc.start + size - 1; - if (resource_contains(&tmp, &alloc)) { - new->start = alloc.start; - new->end = alloc.end; - return 0; + /* Check for overflow after ALIGN() */ + avail = *new; + avail.start = ALIGN(tmp.start, align); + avail.end = tmp.end; + if (avail.start >= tmp.start) { + alloc.start = alignf(alignf_data, &avail, size, align); + alloc.end = alloc.start + size - 1; + if (resource_contains(&avail, &alloc)) { + new->start = alloc.start; + new->end = alloc.end; + return 0; + } } if (!this) break; From e7f8567db9a7f6b3151b0b275e245c1cef0d9c70 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:33 -0600 Subject: [PATCH 23/27] resources: support allocating space within a region from the top down Allocate space from the top of a region first, then work downward, if an architecture desires this. When we allocate space from a resource, we look for gaps between children of the resource. Previously, we always looked at gaps from the bottom up. 
For example, given this: [mem 0xbff00000-0xf7ffffff] PCI Bus 0000:00 [mem 0xbff00000-0xbfffffff] gap -- available [mem 0xc0000000-0xdfffffff] PCI Bus 0000:02 [mem 0xe0000000-0xf7ffffff] gap -- available we attempted to allocate from the [mem 0xbff00000-0xbfffffff] gap first, then the [mem 0xe0000000-0xf7ffffff] gap. With this patch an architecture can choose to allocate from the top gap [mem 0xe0000000-0xf7ffffff] first. We can't do this across the board because iomem_resource.end is initialized to 0xffffffff_ffffffff on 64-bit architectures, and most machines can't address the entire 64-bit physical address space. Therefore, we only allocate top-down if the arch requests it by clearing "resource_alloc_from_bottom". Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 5 ++ include/linux/ioport.h | 1 + kernel/resource.c | 98 +++++++++++++++++++++++++++-- 3 files changed, 100 insertions(+), 4 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8dd7248508a9..fe50cbd315b0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2156,6 +2156,11 @@ and is between 256 and 4096 characters. It is defined in the file reset_devices [KNL] Force drivers to reset the underlying device during initialization. + resource_alloc_from_bottom + Allocate new resources from the beginning of available + space, not the end. If you need to use this, please + report a bug. 
+ resume= [SWSUSP] Specify the partition device for software suspend diff --git a/include/linux/ioport.h b/include/linux/ioport.h index b22790268b64..d377ea815d45 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -112,6 +112,7 @@ struct resource_list { /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; +extern int resource_alloc_from_bottom; extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); extern int request_resource(struct resource *root, struct resource *new); diff --git a/kernel/resource.c b/kernel/resource.c index e15b922d4ba4..716b6804077e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); +/* + * By default, we allocate free space bottom-up. The architecture can request + * top-down by clearing this flag. The user can override the architecture's + * choice with the "resource_alloc_from_bottom" kernel boot option, but that + * should only be a debugging tool. + */ +int resource_alloc_from_bottom = 1; + +static __init int setup_alloc_from_bottom(char *s) +{ + printk(KERN_INFO + "resource: allocating from bottom-up; please report a bug\n"); + resource_alloc_from_bottom = 1; + return 0; +} +early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -379,8 +396,75 @@ static bool resource_contains(struct resource *res1, struct resource *res2) return res1->start <= res2->start && res1->end >= res2->end; } +/* + * Find the resource before "child" in the sibling list of "root" children. 
+ */ +static struct resource *find_sibling_prev(struct resource *root, struct resource *child) +{ + struct resource *this; + + for (this = root->child; this; this = this->sibling) + if (this->sibling == child) + return this; + + return NULL; +} + /* * Find empty slot in the resource tree given range and alignment. + * This version allocates from the end of the root resource first. + */ +static int find_resource_from_top(struct resource *root, struct resource *new, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), + void *alignf_data) +{ + struct resource *this; + struct resource tmp, avail, alloc; + + tmp.start = root->end; + tmp.end = root->end; + + this = find_sibling_prev(root, NULL); + for (;;) { + if (this) { + if (this->end < root->end) + tmp.start = this->end + 1; + } else + tmp.start = root->start; + + resource_clip(&tmp, min, max); + + /* Check for overflow after ALIGN() */ + avail = *new; + avail.start = ALIGN(tmp.start, align); + avail.end = tmp.end; + if (avail.start >= tmp.start) { + alloc.start = alignf(alignf_data, &avail, size, align); + alloc.end = alloc.start + size - 1; + if (resource_contains(&avail, &alloc)) { + new->start = alloc.start; + new->end = alloc.end; + return 0; + } + } + + if (!this || this->start == root->start) + break; + + tmp.end = this->start - 1; + this = find_sibling_prev(root, this); + } + return -EBUSY; +} + +/* + * Find empty slot in the resource tree given range and alignment. + * This version allocates from the beginning of the root resource first. 
*/ static int find_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, @@ -396,14 +480,15 @@ static int find_resource(struct resource *root, struct resource *new, tmp.start = root->start; /* - * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to tmp->end below would cause an underflow. + * Skip past an allocated resource that starts at 0, since the + * assignment of this->start - 1 to tmp->end below would cause an + * underflow. */ if (this && this->start == 0) { tmp.start = this->end + 1; this = this->sibling; } - for(;;) { + for (;;) { if (this) tmp.end = this->start - 1; else @@ -424,8 +509,10 @@ static int find_resource(struct resource *root, struct resource *new, return 0; } } + if (!this) break; + tmp.start = this->end + 1; this = this->sibling; } @@ -458,7 +545,10 @@ int allocate_resource(struct resource *root, struct resource *new, alignf = simple_align_resource; write_lock(&resource_lock); - err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + if (resource_alloc_from_bottom) + err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + else + err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; write_unlock(&resource_lock); From b126b4703afa4010b161784a43650337676dd03b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:39 -0600 Subject: [PATCH 24/27] PCI: allocate bus resources from the top down Allocate space from the highest-address PCI bus resource first, then work downward. Previously, we looked for space in PCI host bridge windows in the order we discovered the windows. 
For example, given the following windows (discovered via an ACPI _CRS method): pci_root PNP0A03:00: host bridge window [mem 0x000a0000-0x000bffff] pci_root PNP0A03:00: host bridge window [mem 0x000c0000-0x000effff] pci_root PNP0A03:00: host bridge window [mem 0x000f0000-0x000fffff] pci_root PNP0A03:00: host bridge window [mem 0xbff00000-0xf7ffffff] pci_root PNP0A03:00: host bridge window [mem 0xff980000-0xff980fff] pci_root PNP0A03:00: host bridge window [mem 0xff97c000-0xff97ffff] pci_root PNP0A03:00: host bridge window [mem 0xfed20000-0xfed9ffff] we attempted to allocate from [mem 0x000a0000-0x000bffff] first, then [mem 0x000c0000-0x000effff], and so on. With this patch, we allocate from [mem 0xff980000-0xff980fff] first, then [mem 0xff97c000-0xff97ffff], [mem 0xfed20000-0xfed9ffff], etc. Allocating top-down follows Windows practice, so we're less likely to trip over BIOS defects in the _CRS description. On the machine above (a Dell T3500), the [mem 0xbff00000-0xbfffffff] region doesn't actually work and is likely a BIOS defect. The symptom is that we move the AHCI controller to 0xbff00000, which leads to "Boot has failed, sleeping forever," a BUG in ahci_stop_engine(), or some other boot failure. Reference: https://bugzilla.kernel.org/show_bug.cgi?id=16228#c43 Reference: https://bugzilla.redhat.com/show_bug.cgi?id=620313 Reference: https://bugzilla.redhat.com/show_bug.cgi?id=629933 Reported-by: Brian Bloniarz Reported-and-tested-by: Stefan Becker Reported-by: Denys Vlasenko Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- drivers/pci/bus.c | 53 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 7f0af0e9b826..172bf26e0680 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -64,6 +64,49 @@ void pci_bus_remove_resources(struct pci_bus *bus) } } +/* + * Find the highest-address bus resource below the cursor "res". 
If the + * cursor is NULL, return the highest resource. + */ +static struct resource *pci_bus_find_resource_prev(struct pci_bus *bus, + unsigned int type, + struct resource *res) +{ + struct resource *r, *prev = NULL; + int i; + + pci_bus_for_each_resource(bus, r, i) { + if (!r) + continue; + + if ((r->flags & IORESOURCE_TYPE_BITS) != type) + continue; + + /* If this resource is at or past the cursor, skip it */ + if (res) { + if (r == res) + continue; + if (r->end > res->end) + continue; + if (r->end == res->end && r->start > res->start) + continue; + } + + if (!prev) + prev = r; + + /* + * A small resource is higher than a large one that ends at + * the same address. + */ + if (r->end > prev->end || + (r->end == prev->end && r->start > prev->start)) + prev = r; + } + + return prev; +} + /** * pci_bus_alloc_resource - allocate a resource from a parent bus * @bus: PCI bus @@ -89,9 +132,10 @@ pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res, resource_size_t), void *alignf_data) { - int i, ret = -ENOMEM; + int ret = -ENOMEM; struct resource *r; resource_size_t max = -1; + unsigned int type = res->flags & IORESOURCE_TYPE_BITS; type_mask |= IORESOURCE_IO | IORESOURCE_MEM; @@ -99,10 +143,9 @@ pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res, if (!(res->flags & IORESOURCE_MEM_64)) max = PCIBIOS_MAX_MEM_32; - pci_bus_for_each_resource(bus, r, i) { - if (!r) - continue; - + /* Look for space at highest addresses first */ + r = pci_bus_find_resource_prev(bus, type, NULL); + for ( ; r; r = pci_bus_find_resource_prev(bus, type, r)) { /* type_mask must match */ if ((res->flags ^ r->flags) & type_mask) continue; From dc9887dc02e37bcf83f4e792aa14b07782ef54cf Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:44 -0600 Subject: [PATCH 25/27] x86/PCI: allocate space from the end of a region, not the beginning Allocate from the end of a region, not the beginning. 
For example, if we need to allocate 0x800 bytes for a device on bus 0000:00 given these resources: [mem 0xbff00000-0xdfffffff] PCI Bus 0000:00 [mem 0xc0000000-0xdfffffff] PCI Bus 0000:02 the available space at [mem 0xbff00000-0xbfffffff] is passed to the alignment callback (pcibios_align_resource()). Prior to this patch, we would put the new 0x800 byte resource at the beginning of that available space, i.e., at [mem 0xbff00000-0xbff007ff]. With this patch, we put it at the end, at [mem 0xbffff800-0xbfffffff]. Reference: https://bugzilla.kernel.org/show_bug.cgi?id=16228#c41 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 55253095be84..826140af3c3c 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -65,16 +65,21 @@ pcibios_align_resource(void *data, const struct resource *res, resource_size_t size, resource_size_t align) { struct pci_dev *dev = data; - resource_size_t start = res->start; + resource_size_t start = round_down(res->end - size + 1, align); if (res->flags & IORESOURCE_IO) { - if (skip_isa_ioresource_align(dev)) - return start; - if (start & 0x300) - start = (start + 0x3ff) & ~0x3ff; + + /* + * If we're avoiding ISA aliases, the largest contiguous I/O + * port space is 256 bytes. Clearing bits 8 and 9 (mask 0x300) preserves + * all 256-byte and smaller alignments, so the result will + * still be correctly aligned.
+ */ + if (!skip_isa_ioresource_align(dev)) + start &= ~0x300; } else if (res->flags & IORESOURCE_MEM) { if (start < BIOS_END) - start = BIOS_END; + start = res->end; /* fail; no space */ } return start; } From 419afdf53cca794a190014593b4778e2e9d64cf3 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:49 -0600 Subject: [PATCH 26/27] x86: update iomem_resource end based on CPU physical address capabilities The iomem_resource map reflects the available physical address space. We statically initialize the end to -1, i.e., 0xffffffff_ffffffff, but of course we can only use as much as the CPU can address. This patch updates the end based on the CPU capabilities, so we don't mistakenly allocate space that isn't usable, as we're likely to do when allocating from the top-down. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b996..922b5a1f978b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -788,6 +788,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); /* update the e820_saved too */ From 1af3c2e45e7a641e774bbb84fa428f2f0bf2d9c9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:54 -0600 Subject: [PATCH 27/27] x86: allocate space within a region top-down Request that allocate_resource() use available space from high addresses first, rather than the default of using low addresses first. The most common place this makes a difference is when we move or assign new PCI device resources. Low addresses are generally scarce, so it's better to use high addresses when possible. This follows Windows practice for PCI allocation. 
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=16228#c42 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 922b5a1f978b..0fe76df866db 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -788,6 +788,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + resource_alloc_from_bottom = 0; iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data();