diff --git a/.mailmap b/.mailmap index 83d7e750c2fc..fd6219293057 100644 --- a/.mailmap +++ b/.mailmap @@ -108,6 +108,10 @@ Jason Gunthorpe Jason Gunthorpe Javi Merino +Jayachandran C +Jayachandran C +Jayachandran C +Jayachandran C Jean Tourrilhes Jeff Garzik diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 06d0931119cc..fc20cde63d1e 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -486,6 +486,8 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds + /sys/devices/system/cpu/vulnerabilities/tsx_async_abort + /sys/devices/system/cpu/vulnerabilities/itlb_multihit Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 49311f3da6f2..0795e3c2643f 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -12,3 +12,5 @@ are configurable at compile, boot or run time. spectre l1tf mds + tsx_async_abort + multihit.rst diff --git a/Documentation/admin-guide/hw-vuln/multihit.rst b/Documentation/admin-guide/hw-vuln/multihit.rst new file mode 100644 index 000000000000..ba9988d8bce5 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/multihit.rst @@ -0,0 +1,163 @@ +iTLB multihit +============= + +iTLB multihit is an erratum where some processors may incur a machine check +error, possibly resulting in an unrecoverable CPU lockup, when an +instruction fetch hits multiple entries in the instruction TLB. This can +occur when the page size is changed along with either the physical address +or cache type. 
A malicious guest running on a virtualized system can +exploit this erratum to perform a denial of service attack. + + +Affected processors +------------------- + +Variations of this erratum are present on most Intel Core and Xeon processor +models. The erratum is not present on: + + - non-Intel processors + + - Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont) + + - Intel processors that have the PSCHANGE_MC_NO bit set in the + IA32_ARCH_CAPABILITIES MSR. + + +Related CVEs +------------ + +The following CVE entry is related to this issue: + + ============== ================================================= + CVE-2018-12207 Machine Check Error Avoidance on Page Size Change + ============== ================================================= + + +Problem +------- + +Privileged software, including OS and virtual machine managers (VMM), is in +charge of memory management. A key component in memory management is the control +of the page tables. Modern processors use virtual memory, a technique that creates +the illusion of a very large memory for processors. This virtual space is split +into pages of a given size. Page tables translate virtual addresses to physical +addresses. + +To reduce latency when performing a virtual to physical address translation, +processors include a structure, called TLB, that caches recent translations. +There are separate TLBs for instruction (iTLB) and data (dTLB). + +Under this erratum, instructions are fetched from a linear address translated +using a 4 KB translation cached in the iTLB. Privileged software modifies the +paging structure so that the same linear address uses a large page size (2 MB, 4 +MB, 1 GB) with a different physical address or memory type. After the page +structure modification but before the software invalidates any iTLB entries for +the linear address, a code fetch that happens on the same linear address may +cause a machine-check error which can result in a system hang or shutdown.
+ + +Attack scenarios +---------------- + +Attacks against the iTLB multihit erratum can be mounted from malicious +guests in a virtualized system. + + +iTLB multihit system information +-------------------------------- + +The Linux kernel provides a sysfs interface to enumerate the current iTLB +multihit status of the system: whether the system is vulnerable and which +mitigations are active. The relevant sysfs file is: + +/sys/devices/system/cpu/vulnerabilities/itlb_multihit + +The possible values in this file are: + +.. list-table:: + + * - Not affected + - The processor is not vulnerable. + * - KVM: Mitigation: Split huge pages + - Software changes mitigate this issue. + * - KVM: Vulnerable + - The processor is vulnerable, but no mitigation enabled + + +Enumeration of the erratum +-------------------------------- + +A new bit has been allocated in the IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) MSR +and will be set on CPUs which are mitigated against this issue. + + ======================================= =========== =============================== + IA32_ARCH_CAPABILITIES MSR Not present Possibly vulnerable,check model + IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '0' Likely vulnerable,check model + IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '1' Not vulnerable + ======================================= =========== =============================== + + +Mitigation mechanism +------------------------- + +This erratum can be mitigated by restricting the use of large page sizes to +non-executable pages. This forces all iTLB entries to be 4K, and removes +the possibility of multiple hits. + +In order to mitigate the vulnerability, KVM initially marks all huge pages +as non-executable. If the guest attempts to execute in one of those pages, +the page is broken down into 4K pages, which are then marked executable. + +If EPT is disabled or not available on the host, KVM is in control of TLB +flushes and the problematic situation cannot happen.
However, the shadow +EPT paging mechanism used by nested virtualization is vulnerable, because +the nested guest can trigger multiple iTLB hits by modifying its own +(non-nested) page tables. For simplicity, KVM will make large pages +non-executable in all shadow paging modes. + +Mitigation control on the kernel command line and KVM - module parameter +------------------------------------------------------------------------ + +The KVM hypervisor mitigation mechanism for marking huge pages as +non-executable can be controlled with a module parameter "nx_huge_pages=". +The kernel command line allows to control the iTLB multihit mitigations at +boot time with the option "kvm.nx_huge_pages=". + +The valid arguments for these options are: + + ========== ================================================================ + force Mitigation is enabled. In this case, the mitigation implements + non-executable huge pages in Linux kernel KVM module. All huge + pages in the EPT are marked as non-executable. + If a guest attempts to execute in one of those pages, the page is + broken down into 4K pages, which are then marked executable. + + off Mitigation is disabled. + + auto Enable mitigation only if the platform is affected and the kernel + was not booted with the "mitigations=off" command line parameter. + This is the default option. + ========== ================================================================ + + +Mitigation selection guide +-------------------------- + +1. No virtualization in use +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + The system is protected by the kernel unconditionally and no further + action is required. + +2. Virtualization with trusted guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + If the guest comes from a trusted source, you may assume that the guest will + not attempt to maliciously exploit these errata and no further action is + required. + +3. 
Virtualization with untrusted guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + If the guest comes from an untrusted source, the guest host kernel will need + to apply iTLB multihit mitigation via the kernel command line or kvm + module parameter. diff --git a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst new file mode 100644 index 000000000000..fddbd7579c53 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst @@ -0,0 +1,276 @@ +.. SPDX-License-Identifier: GPL-2.0 + +TAA - TSX Asynchronous Abort +====================================== + +TAA is a hardware vulnerability that allows unprivileged speculative access to +data which is available in various CPU internal buffers by using asynchronous +aborts within an Intel TSX transactional region. + +Affected processors +------------------- + +This vulnerability only affects Intel processors that support Intel +Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8) +is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit +(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations +also mitigate against TAA. + +Whether a processor is affected or not can be read out from the TAA +vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`. + +Related CVEs +------------ + +The following CVE entry is related to this TAA issue: + + ============== ===== =================================================== + CVE-2019-11135 TAA TSX Asynchronous Abort (TAA) condition on some + microprocessors utilizing speculative execution may + allow an authenticated user to potentially enable + information disclosure via a side channel with + local access. + ============== ===== =================================================== + +Problem +------- + +When performing store, load or L1 refill operations, processors write +data into temporary microarchitectural structures (buffers). 
The data in +those buffers can be forwarded to load operations as an optimization. + +Intel TSX is an extension to the x86 instruction set architecture that adds +hardware transactional memory support to improve performance of multi-threaded +software. TSX lets the processor expose and exploit concurrency hidden in an +application due to dynamically avoiding unnecessary synchronization. + +TSX supports atomic memory transactions that are either committed (success) or +aborted. During an abort, operations that happened within the transactional region +are rolled back. An asynchronous abort takes place, among other options, when a +different thread accesses a cache line that is also used within the transactional +region when that access might lead to a data race. + +Immediately after an uncompleted asynchronous abort, certain speculatively +executed loads may read data from those internal buffers and pass it to dependent +operations. This can then be used to infer the value via a cache side channel +attack. + +Because the buffers are potentially shared between Hyper-Threads, cross +Hyper-Thread attacks are possible. + +The victim of a malicious actor does not need to make use of TSX. Only the +attacker needs to begin a TSX transaction and raise an asynchronous abort +which in turn potentially leaks data stored in the buffers. + +More detailed technical information is available in the TAA specific x86 +architecture section: :ref:`Documentation/x86/tsx_async_abort.rst `. + + +Attack scenarios +---------------- + +Attacks against the TAA vulnerability can be implemented from unprivileged +applications running on hosts or guests. + +As for MDS, the attacker has no control over the memory addresses that can +be leaked. Only the victim is responsible for bringing data to the CPU. As +a result, the malicious actor has to sample as much data as possible and +then postprocess it to try to infer any useful information from it.
+ +A potential attacker only has read access to the data. Also, there is no direct +privilege escalation by using this technique. + + +.. _tsx_async_abort_sys_info: + +TAA system information +----------------------- + +The Linux kernel provides a sysfs interface to enumerate the current TAA status +of mitigated systems. The relevant sysfs file is: + +/sys/devices/system/cpu/vulnerabilities/tsx_async_abort + +The possible values in this file are: + +.. list-table:: + + * - 'Vulnerable' + - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied. + * - 'Vulnerable: Clear CPU buffers attempted, no microcode' + - The system tries to clear the buffers but the microcode might not support the operation. + * - 'Mitigation: Clear CPU buffers' + - The microcode has been updated to clear the buffers. TSX is still enabled. + * - 'Mitigation: TSX disabled' + - TSX is disabled. + * - 'Not affected' + - The CPU is not affected by this issue. + +.. _ucode_needed: + +Best effort mitigation mode +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If the processor is vulnerable, but the availability of the microcode-based +mitigation mechanism is not advertised via CPUID the kernel selects a best +effort mitigation mode. This mode invokes the mitigation instructions +without a guarantee that they clear the CPU buffers. + +This is done to address virtualization scenarios where the host has the +microcode update applied, but the hypervisor is not yet updated to expose the +CPUID to the guest. If the host has updated microcode the protection takes +effect; otherwise a few CPU cycles are wasted pointlessly. + +The state in the tsx_async_abort sysfs file reflects this situation +accordingly. + + +Mitigation mechanism +-------------------- + +The kernel detects the affected CPUs and the presence of the microcode which is +required. If a CPU is affected and the microcode is available, then the kernel +enables the mitigation by default. 
+ + +The mitigation can be controlled at boot time via a kernel command line option. +See :ref:`taa_mitigation_control_command_line`. + +.. _virt_mechanism: + +Virtualization mitigation +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Affected systems where the host has TAA microcode and TAA is mitigated by +having disabled TSX previously, are not vulnerable regardless of the status +of the VMs. + +In all other cases, if the host either does not have the TAA microcode or +the kernel is not mitigated, the system might be vulnerable. + + +.. _taa_mitigation_control_command_line: + +Mitigation control on the kernel command line +--------------------------------------------- + +The kernel command line allows to control the TAA mitigations at boot time with +the option "tsx_async_abort=". The valid arguments for this option are: + + ============ ============================================================= + off This option disables the TAA mitigation on affected platforms. + If the system has TSX enabled (see next parameter) and the CPU + is affected, the system is vulnerable. + + full TAA mitigation is enabled. If TSX is enabled, on an affected + system it will clear CPU buffers on ring transitions. On + systems which are MDS-affected and deploy MDS mitigation, + TAA is also mitigated. Specifying this option on those + systems will have no effect. + + full,nosmt The same as tsx_async_abort=full, with SMT disabled on + vulnerable CPUs that have TSX enabled. This is the complete + mitigation. When TSX is disabled, SMT is not disabled because + CPU is not vulnerable to cross-thread TAA attacks. + ============ ============================================================= + +Not specifying this option is equivalent to "tsx_async_abort=full". + +The kernel command line also allows to control the TSX feature using the +parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used +to control the TSX feature and the enumeration of the TSX feature bits (RTM +and HLE) in CPUID. 
+ +The valid options are: + + ============ ============================================================= + off Disables TSX on the system. + + Note that this option takes effect only on newer CPUs which are + not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 + and which get the new IA32_TSX_CTRL MSR through a microcode + update. This new MSR allows for the reliable deactivation of + the TSX functionality. + + on Enables TSX. + + Although there are mitigations for all known security + vulnerabilities, TSX has been known to be an accelerator for + several previous speculation-related CVEs, and so there may be + unknown security risks associated with leaving it enabled. + + auto Disables TSX if X86_BUG_TAA is present, otherwise enables TSX + on the system. + ============ ============================================================= + +Not specifying this option is equivalent to "tsx=off". + +The following combinations of the "tsx_async_abort" and "tsx" are possible. For +affected platforms tsx=auto is equivalent to tsx=off and the result will be: + + ========= ========================== ========================================= + tsx=on tsx_async_abort=full The system will use VERW to clear CPU + buffers. Cross-thread attacks are still + possible on SMT machines. + tsx=on tsx_async_abort=full,nosmt As above, cross-thread attacks on SMT + mitigated. + tsx=on tsx_async_abort=off The system is vulnerable. + tsx=off tsx_async_abort=full TSX might be disabled if microcode + provides a TSX control MSR. If so, + system is not vulnerable. + tsx=off tsx_async_abort=full,nosmt Ditto + tsx=off tsx_async_abort=off ditto + ========= ========================== ========================================= + + +For unaffected platforms "tsx=on" and "tsx_async_abort=full" does not clear CPU +buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0) +"tsx" command line argument has no effect. 
+ +For the affected platforms below table indicates the mitigation status for the +combinations of CPUID bit MD_CLEAR and IA32_ARCH_CAPABILITIES MSR bits MDS_NO +and TSX_CTRL_MSR. + + ======= ========= ============= ======================================== + MDS_NO MD_CLEAR TSX_CTRL_MSR Status + ======= ========= ============= ======================================== + 0 0 0 Vulnerable (needs microcode) + 0 1 0 MDS and TAA mitigated via VERW + 1 1 0 MDS fixed, TAA vulnerable if TSX enabled + because MD_CLEAR has no meaning and + VERW is not guaranteed to clear buffers + 1 X 1 MDS fixed, TAA can be mitigated by + VERW or TSX_CTRL_MSR + ======= ========= ============= ======================================== + +Mitigation selection guide +-------------------------- + +1. Trusted userspace and guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If all user space applications are from a trusted source and do not execute +untrusted code which is supplied externally, then the mitigation can be +disabled. The same applies to virtualized environments with trusted guests. + + +2. Untrusted userspace and guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If there are untrusted applications or guests on the system, enabling TSX +might allow a malicious actor to leak data from the host or from other +processes running on the same physical core. + +If the microcode is available and the TSX is disabled on the host, attacks +are prevented in a virtualized environment as well, even if the VMs do not +explicitly enable the mitigation. + + +.. _taa_default_mitigations: + +Default mitigations +------------------- + +The kernel's default action for vulnerable processors is: + + - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off). 
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a84a83f8881e..8dee8f68fe15 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2055,6 +2055,25 @@ KVM MMU at runtime. Default is 0 (off) + kvm.nx_huge_pages= + [KVM] Controls the software workaround for the + X86_BUG_ITLB_MULTIHIT bug. + force : Always deploy workaround. + off : Never deploy workaround. + auto : Deploy workaround based on the presence of + X86_BUG_ITLB_MULTIHIT. + + Default is 'auto'. + + If the software workaround is enabled for the host, + guests do not need to enable it for nested guests. + + kvm.nx_huge_pages_recovery_ratio= + [KVM] Controls how many 4KiB pages are periodically zapped + back to huge pages. 0 disables the recovery, otherwise if + the value is N KVM will zap 1/Nth of the 4KiB pages every + minute. The default is 60. + kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. Default is 1 (enabled) @@ -2636,6 +2655,13 @@ ssbd=force-off [ARM64] l1tf=off [X86] mds=off [X86] + tsx_async_abort=off [X86] + kvm.nx_huge_pages=off [X86] + + Exceptions: + This does not have any effect on + kvm.nx_huge_pages when + kvm.nx_huge_pages=force. auto (default) Mitigate all CPU vulnerabilities, but leave SMT @@ -2651,6 +2677,7 @@ be fully mitigated, even if it means losing SMT. Equivalent to: l1tf=flush,nosmt [X86] mds=full,nosmt [X86] + tsx_async_abort=full,nosmt [X86] mminit_loglevel= [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this @@ -4848,6 +4875,71 @@ interruptions from clocksource watchdog are not acceptable). + tsx= [X86] Control Transactional Synchronization + Extensions (TSX) feature in Intel processors that + support TSX control. + + This parameter controls the TSX feature. The options are: + + on - Enable TSX on the system.
Although there are + mitigations for all known security vulnerabilities, + TSX has been known to be an accelerator for + several previous speculation-related CVEs, and + so there may be unknown security risks associated + with leaving it enabled. + + off - Disable TSX on the system. (Note that this + option takes effect only on newer CPUs which are + not vulnerable to MDS, i.e., have + MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get + the new IA32_TSX_CTRL MSR through a microcode + update. This new MSR allows for the reliable + deactivation of the TSX functionality.) + + auto - Disable TSX if X86_BUG_TAA is present, + otherwise enable TSX on the system. + + Not specifying this option is equivalent to tsx=off. + + See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst + for more details. + + tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async + Abort (TAA) vulnerability. + + Similar to Micro-architectural Data Sampling (MDS) + certain CPUs that support Transactional + Synchronization Extensions (TSX) are vulnerable to an + exploit against CPU internal buffers which can forward + information to a disclosure gadget under certain + conditions. + + In vulnerable processors, the speculatively forwarded + data can be used in a cache side channel attack, to + access data to which the attacker does not have direct + access. + + This parameter controls the TAA mitigation. The + options are: + + full - Enable TAA mitigation on vulnerable CPUs + if TSX is enabled. + + full,nosmt - Enable TAA mitigation and disable SMT on + vulnerable CPUs. If TSX is disabled, SMT + is not disabled because CPU is not + vulnerable to cross-thread TAA attacks. + off - Unconditionally disable TAA mitigation + + Not specifying this option is equivalent to + tsx_async_abort=full. On CPUs which are MDS affected + and deploy MDS mitigation, TAA mitigation is not + required and doesn't provide any additional + mitigation. 
+ + For details see: + Documentation/admin-guide/hw-vuln/tsx_async_abort.rst + turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface Format: diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index af64c4bb4447..a8de2fbc1caa 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -27,6 +27,7 @@ x86-specific Documentation mds microcode resctrl_ui + tsx_async_abort usb-legacy-support i386/index x86_64/index diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst new file mode 100644 index 000000000000..583ddc185ba2 --- /dev/null +++ b/Documentation/x86/tsx_async_abort.rst @@ -0,0 +1,117 @@ +.. SPDX-License-Identifier: GPL-2.0 + +TSX Async Abort (TAA) mitigation +================================ + +.. _tsx_async_abort: + +Overview +-------- + +TSX Async Abort (TAA) is a side channel attack on internal buffers in some +Intel processors similar to Microarchitectural Data Sampling (MDS). In this +case certain loads may speculatively pass invalid data to dependent operations +when an asynchronous abort condition is pending in a Transactional +Synchronization Extensions (TSX) transaction. This includes loads with no +fault or assist condition. Such loads may speculatively expose stale data from +the same uarch data structures as in MDS, with same scope of exposure i.e. +same-thread and cross-thread. This issue affects all current processors that +support TSX. + +Mitigation strategy +------------------- + +a) TSX disable - one of the mitigations is to disable TSX. A new MSR +IA32_TSX_CTRL will be available in future and current processors after +microcode update which can be used to disable TSX. In addition, it +controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID. + +b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this +vulnerability. More details on this approach can be found in +:ref:`Documentation/admin-guide/hw-vuln/mds.rst `.
+ +Kernel internal mitigation modes +-------------------------------- + + ============= ============================================================ + off Mitigation is disabled. Either the CPU is not affected or + tsx_async_abort=off is supplied on the kernel command line. + + tsx disabled Mitigation is enabled. TSX feature is disabled by default at + bootup on processors that support TSX control. + + verw Mitigation is enabled. CPU is affected and MD_CLEAR is + advertised in CPUID. + + ucode needed Mitigation is enabled. CPU is affected and MD_CLEAR is not + advertised in CPUID. That is mainly for virtualization + scenarios where the host has the updated microcode but the + hypervisor does not expose MD_CLEAR in CPUID. It's a best + effort approach without guarantee. + ============= ============================================================ + +If the CPU is affected and the "tsx_async_abort" kernel command line parameter is +not provided then the kernel selects an appropriate mitigation depending on the +status of RTM and MD_CLEAR CPUID bits. + +Below tables indicate the impact of tsx=on|off|auto cmdline options on state of +TAA mitigation, VERW behavior and TSX feature for various combinations of +MSR_IA32_ARCH_CAPABILITIES bits. + +1. 
"tsx=off" + +========= ========= ============ ============ ============== =================== ====================== +MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=off +---------------------------------- ------------------------------------------------------------------------- +TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation + after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full +========= ========= ============ ============ ============== =================== ====================== + 0 0 0 HW default Yes Same as MDS Same as MDS + 0 0 1 Invalid case Invalid case Invalid case Invalid case + 0 1 0 HW default No Need ucode update Need ucode update + 0 1 1 Disabled Yes TSX disabled TSX disabled + 1 X 1 Disabled X None needed None needed +========= ========= ============ ============ ============== =================== ====================== + +2. "tsx=on" + +========= ========= ============ ============ ============== =================== ====================== +MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=on +---------------------------------- ------------------------------------------------------------------------- +TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation + after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full +========= ========= ============ ============ ============== =================== ====================== + 0 0 0 HW default Yes Same as MDS Same as MDS + 0 0 1 Invalid case Invalid case Invalid case Invalid case + 0 1 0 HW default No Need ucode update Need ucode update + 0 1 1 Enabled Yes None Same as MDS + 1 X 1 Enabled X None needed None needed +========= ========= ============ ============ ============== =================== ====================== + +3. 
"tsx=auto" + +========= ========= ============ ============ ============== =================== ====================== +MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=auto +---------------------------------- ------------------------------------------------------------------------- +TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation + after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full +========= ========= ============ ============ ============== =================== ====================== + 0 0 0 HW default Yes Same as MDS Same as MDS + 0 0 1 Invalid case Invalid case Invalid case Invalid case + 0 1 0 HW default No Need ucode update Need ucode update + 0 1 1 Disabled Yes TSX disabled TSX disabled + 1 X 1 Enabled X None needed None needed +========= ========= ============ ============ ============== =================== ====================== + +In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that +indicates whether MSR_IA32_TSX_CTRL is supported. + +There are two control bits in IA32_TSX_CTRL MSR: + + Bit 0: When set it disables the Restricted Transactional Memory (RTM) + sub-feature of TSX (will force all transactions to abort on the + XBEGIN instruction). + + Bit 1: When set it disables the enumeration of the RTM and HLE feature + (i.e. it will make CPUID(EAX=7).EBX{bit4} and + CPUID(EAX=7).EBX{bit11} read as 0). 
diff --git a/MAINTAINERS b/MAINTAINERS index 39681b34f8e3..760049454a23 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3268,7 +3268,6 @@ S: Maintained F: drivers/cpufreq/bmips-cpufreq.c BROADCOM BMIPS MIPS ARCHITECTURE -M: Kevin Cernekee M: Florian Fainelli L: bcm-kernel-feedback-list@broadcom.com L: linux-mips@vger.kernel.org @@ -3745,7 +3744,6 @@ F: drivers/crypto/cavium/cpt/ CAVIUM THUNDERX2 ARM64 SOC M: Robert Richter -M: Jayachandran C L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm64/boot/dts/cavium/thunder2-99xx* diff --git a/Makefile b/Makefile index b37d0e8fc61d..42bfda209cb8 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 4 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Kleptomaniac Octopus # *DOCUMENTATION* @@ -917,6 +917,9 @@ ifeq ($(CONFIG_RELR),y) LDFLAGS_vmlinux += --pack-dyn-relocs=relr endif +# make the checker run with the right architecture +CHECKFLAGS += --arch=$(ARCH) + # insure the checker run with the right endianness CHECKFLAGS += $(if $(CONFIG_CPU_BIG_ENDIAN),-mbig-endian,-mlittle-endian) diff --git a/arch/arm/boot/dts/imx6-logicpd-baseboard.dtsi b/arch/arm/boot/dts/imx6-logicpd-baseboard.dtsi index 2a6ce87071f9..9e027b9a5f91 100644 --- a/arch/arm/boot/dts/imx6-logicpd-baseboard.dtsi +++ b/arch/arm/boot/dts/imx6-logicpd-baseboard.dtsi @@ -328,6 +328,10 @@ pinctrl-0 = <&pinctrl_pwm3>; }; +&snvs_pwrkey { + status = "okay"; +}; + &ssi2 { status = "okay"; }; diff --git a/arch/arm/boot/dts/imx6qdl-sabreauto.dtsi b/arch/arm/boot/dts/imx6qdl-sabreauto.dtsi index f3404dd10537..cf628465cd0a 100644 --- a/arch/arm/boot/dts/imx6qdl-sabreauto.dtsi +++ b/arch/arm/boot/dts/imx6qdl-sabreauto.dtsi @@ -230,6 +230,8 @@ accelerometer@1c { compatible = "fsl,mma8451"; reg = <0x1c>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_mma8451_int>; interrupt-parent = <&gpio6>; interrupts = <31 IRQ_TYPE_LEVEL_LOW>; }; @@ -628,6 +630,12 @@ >; }; + pinctrl_mma8451_int: 
mma8451intgrp { + fsl,pins = < + MX6QDL_PAD_EIM_BCLK__GPIO6_IO31 0xb0b1 + >; + }; + pinctrl_pwm3: pwm1grp { fsl,pins = < MX6QDL_PAD_SD4_DAT1__PWM3_OUT 0x1b0b1 diff --git a/arch/arm/boot/dts/stm32mp157c-ev1.dts b/arch/arm/boot/dts/stm32mp157c-ev1.dts index 89d29b50c3f4..91fc0a315c49 100644 --- a/arch/arm/boot/dts/stm32mp157c-ev1.dts +++ b/arch/arm/boot/dts/stm32mp157c-ev1.dts @@ -183,14 +183,12 @@ ov5640: camera@3c { compatible = "ovti,ov5640"; - pinctrl-names = "default"; - pinctrl-0 = <&ov5640_pins>; reg = <0x3c>; clocks = <&clk_ext_camera>; clock-names = "xclk"; DOVDD-supply = <&v2v8>; - powerdown-gpios = <&stmfx_pinctrl 18 GPIO_ACTIVE_HIGH>; - reset-gpios = <&stmfx_pinctrl 19 GPIO_ACTIVE_LOW>; + powerdown-gpios = <&stmfx_pinctrl 18 (GPIO_ACTIVE_HIGH | GPIO_PUSH_PULL)>; + reset-gpios = <&stmfx_pinctrl 19 (GPIO_ACTIVE_LOW | GPIO_PUSH_PULL)>; rotation = <180>; status = "okay"; @@ -223,15 +221,8 @@ joystick_pins: joystick { pins = "gpio0", "gpio1", "gpio2", "gpio3", "gpio4"; - drive-push-pull; bias-pull-down; }; - - ov5640_pins: camera { - pins = "agpio2", "agpio3"; /* stmfx pins 18 & 19 */ - drive-push-pull; - output-low; - }; }; }; }; diff --git a/arch/arm/boot/dts/stm32mp157c.dtsi b/arch/arm/boot/dts/stm32mp157c.dtsi index 9b11654a0a39..f98e0370c0bc 100644 --- a/arch/arm/boot/dts/stm32mp157c.dtsi +++ b/arch/arm/boot/dts/stm32mp157c.dtsi @@ -932,7 +932,7 @@ interrupt-names = "int0", "int1"; clocks = <&rcc CK_HSE>, <&rcc FDCAN_K>; clock-names = "hclk", "cclk"; - bosch,mram-cfg = <0x1400 0 0 32 0 0 2 2>; + bosch,mram-cfg = <0x0 0 0 32 0 0 2 2>; status = "disabled"; }; @@ -945,7 +945,7 @@ interrupt-names = "int0", "int1"; clocks = <&rcc CK_HSE>, <&rcc FDCAN_K>; clock-names = "hclk", "cclk"; - bosch,mram-cfg = <0x0 0 0 32 0 0 2 2>; + bosch,mram-cfg = <0x1400 0 0 32 0 0 2 2>; status = "disabled"; }; diff --git a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts index 568b90ece342..3bec3e0a81b2 100644 --- 
a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts +++ b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts @@ -192,6 +192,7 @@ vqmmc-supply = <&reg_dldo1>; non-removable; wakeup-source; + keep-power-in-suspend; status = "okay"; brcmf: wifi@1 { diff --git a/arch/arm/mach-sunxi/mc_smp.c b/arch/arm/mach-sunxi/mc_smp.c index 239084cf8192..26cbce135338 100644 --- a/arch/arm/mach-sunxi/mc_smp.c +++ b/arch/arm/mach-sunxi/mc_smp.c @@ -481,14 +481,18 @@ static void sunxi_mc_smp_cpu_die(unsigned int l_cpu) static int sunxi_cpu_powerdown(unsigned int cpu, unsigned int cluster) { u32 reg; + int gating_bit = cpu; pr_debug("%s: cluster %u cpu %u\n", __func__, cluster, cpu); if (cpu >= SUNXI_CPUS_PER_CLUSTER || cluster >= SUNXI_NR_CLUSTERS) return -EINVAL; + if (is_a83t && cpu == 0) + gating_bit = 4; + /* gate processor power */ reg = readl(prcm_base + PRCM_PWROFF_GATING_REG(cluster)); - reg |= PRCM_PWROFF_GATING_REG_CORE(cpu); + reg |= PRCM_PWROFF_GATING_REG_CORE(gating_bit); writel(reg, prcm_base + PRCM_PWROFF_GATING_REG(cluster)); udelay(20); diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts index d98346da01df..078a5010228c 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts @@ -127,7 +127,7 @@ status = "okay"; i2c-mux@77 { - compatible = "nxp,pca9847"; + compatible = "nxp,pca9547"; reg = <0x77>; #address-cells = <1>; #size-cells = <0>; diff --git a/arch/arm64/boot/dts/freescale/imx8mm.dtsi b/arch/arm64/boot/dts/freescale/imx8mm.dtsi index 58b8cd06cae7..23c8fad7932b 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm.dtsi @@ -394,7 +394,7 @@ }; sdma2: dma-controller@302c0000 { - compatible = "fsl,imx8mm-sdma", "fsl,imx7d-sdma"; + compatible = "fsl,imx8mm-sdma", "fsl,imx8mq-sdma"; reg = <0x302c0000 0x10000>; interrupts = ; clocks = <&clk IMX8MM_CLK_SDMA2_ROOT>, @@ -405,7 +405,7 @@ }; sdma3: dma-controller@302b0000 { - 
compatible = "fsl,imx8mm-sdma", "fsl,imx7d-sdma"; + compatible = "fsl,imx8mm-sdma", "fsl,imx8mq-sdma"; reg = <0x302b0000 0x10000>; interrupts = ; clocks = <&clk IMX8MM_CLK_SDMA3_ROOT>, @@ -737,7 +737,7 @@ }; sdma1: dma-controller@30bd0000 { - compatible = "fsl,imx8mm-sdma", "fsl,imx7d-sdma"; + compatible = "fsl,imx8mm-sdma", "fsl,imx8mq-sdma"; reg = <0x30bd0000 0x10000>; interrupts = ; clocks = <&clk IMX8MM_CLK_SDMA1_ROOT>, diff --git a/arch/arm64/boot/dts/freescale/imx8mn.dtsi b/arch/arm64/boot/dts/freescale/imx8mn.dtsi index 98496f570720..43c4db312146 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mn.dtsi @@ -288,7 +288,7 @@ }; sdma3: dma-controller@302b0000 { - compatible = "fsl,imx8mn-sdma", "fsl,imx7d-sdma"; + compatible = "fsl,imx8mn-sdma", "fsl,imx8mq-sdma"; reg = <0x302b0000 0x10000>; interrupts = ; clocks = <&clk IMX8MN_CLK_SDMA3_ROOT>, @@ -299,7 +299,7 @@ }; sdma2: dma-controller@302c0000 { - compatible = "fsl,imx8mn-sdma", "fsl,imx7d-sdma"; + compatible = "fsl,imx8mn-sdma", "fsl,imx8mq-sdma"; reg = <0x302c0000 0x10000>; interrupts = ; clocks = <&clk IMX8MN_CLK_SDMA2_ROOT>, @@ -612,7 +612,7 @@ }; sdma1: dma-controller@30bd0000 { - compatible = "fsl,imx8mn-sdma", "fsl,imx7d-sdma"; + compatible = "fsl,imx8mn-sdma", "fsl,imx8mq-sdma"; reg = <0x30bd0000 0x10000>; interrupts = ; clocks = <&clk IMX8MN_CLK_SDMA1_ROOT>, diff --git a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi index 087b5b6ebe89..32ce14936b01 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mq-zii-ultra.dtsi @@ -88,7 +88,7 @@ regulator-name = "0V9_ARM"; regulator-min-microvolt = <900000>; regulator-max-microvolt = <1000000>; - gpios = <&gpio3 19 GPIO_ACTIVE_HIGH>; + gpios = <&gpio3 16 GPIO_ACTIVE_HIGH>; states = <1000000 0x1 900000 0x0>; regulator-always-on; diff --git a/arch/arm64/include/asm/vdso/vsyscall.h 
b/arch/arm64/include/asm/vdso/vsyscall.h index 0c731bfc7c8c..0c20a7c1bee5 100644 --- a/arch/arm64/include/asm/vdso/vsyscall.h +++ b/arch/arm64/include/asm/vdso/vsyscall.h @@ -30,13 +30,6 @@ int __arm64_get_clock_mode(struct timekeeper *tk) } #define __arch_get_clock_mode __arm64_get_clock_mode -static __always_inline -int __arm64_use_vsyscall(struct vdso_data *vdata) -{ - return !vdata[CS_HRES_COARSE].clock_mode; -} -#define __arch_use_vsyscall __arm64_use_vsyscall - static __always_inline void __arm64_update_vsyscall(struct vdso_data *vdata, struct timekeeper *tk) { diff --git a/arch/mips/include/asm/vdso/vsyscall.h b/arch/mips/include/asm/vdso/vsyscall.h index 195314732233..00d41b94ba31 100644 --- a/arch/mips/include/asm/vdso/vsyscall.h +++ b/arch/mips/include/asm/vdso/vsyscall.h @@ -28,13 +28,6 @@ int __mips_get_clock_mode(struct timekeeper *tk) } #define __arch_get_clock_mode __mips_get_clock_mode -static __always_inline -int __mips_use_vsyscall(struct vdso_data *vdata) -{ - return (vdata[CS_HRES_COARSE].clock_mode != VDSO_CLOCK_NONE); -} -#define __arch_use_vsyscall __mips_use_vsyscall - /* The asm-generic header needs to be included after the definitions above */ #include diff --git a/arch/mips/sgi-ip27/Kconfig b/arch/mips/sgi-ip27/Kconfig index ef3847e7aee0..e5b6cadbec85 100644 --- a/arch/mips/sgi-ip27/Kconfig +++ b/arch/mips/sgi-ip27/Kconfig @@ -38,10 +38,3 @@ config REPLICATE_KTEXT Say Y here to enable replicating the kernel text across multiple nodes in a NUMA cluster. This trades memory for speed. -config REPLICATE_EXHANDLERS - bool "Exception handler replication support" - depends on SGI_IP27 - help - Say Y here to enable replicating the kernel exception handlers - across multiple nodes in a NUMA cluster. This trades memory for - speed. 
diff --git a/arch/mips/sgi-ip27/ip27-init.c b/arch/mips/sgi-ip27/ip27-init.c index 59d5375c9021..79a52c472782 100644 --- a/arch/mips/sgi-ip27/ip27-init.c +++ b/arch/mips/sgi-ip27/ip27-init.c @@ -69,23 +69,14 @@ static void per_hub_init(cnodeid_t cnode) hub_rtc_init(cnode); -#ifdef CONFIG_REPLICATE_EXHANDLERS - /* - * If this is not a headless node initialization, - * copy over the caliased exception handlers. - */ - if (get_compact_nodeid() == cnode) { - extern char except_vec2_generic, except_vec3_generic; - extern void build_tlb_refill_handler(void); - - memcpy((void *)(CKSEG0 + 0x100), &except_vec2_generic, 0x80); - memcpy((void *)(CKSEG0 + 0x180), &except_vec3_generic, 0x80); - build_tlb_refill_handler(); - memcpy((void *)(CKSEG0 + 0x100), (void *) CKSEG0, 0x80); - memcpy((void *)(CKSEG0 + 0x180), &except_vec3_generic, 0x100); + if (nasid) { + /* copy exception handlers from first node to current node */ + memcpy((void *)NODE_OFFSET_TO_K0(nasid, 0), + (void *)CKSEG0, 0x200); __flush_cache_all(); + /* switch to node local exception handlers */ + REMOTE_HUB_S(nasid, PI_CALIAS_SIZE, PI_CALIAS_SIZE_8K); } -#endif } void per_cpu_init(void) diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index fb077a947575..8624a885d95b 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -332,11 +332,7 @@ static void __init mlreset(void) * thinks it is a node 0 address. 
*/ REMOTE_HUB_S(nasid, PI_REGION_PRESENT, (region_mask | 1)); -#ifdef CONFIG_REPLICATE_EXHANDLERS - REMOTE_HUB_S(nasid, PI_CALIAS_SIZE, PI_CALIAS_SIZE_8K); -#else REMOTE_HUB_S(nasid, PI_CALIAS_SIZE, PI_CALIAS_SIZE_0); -#endif #ifdef LATER /* diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index 324a23947585..997ffe46e953 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -65,14 +65,14 @@ $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(SPARC_REG_CFLAGS # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. # -CFLAGS_REMOVE_vdso-note.o = -pg CFLAGS_REMOVE_vclock_gettime.o = -pg +CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg $(obj)/%.so: OBJCOPYFLAGS := -S $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) -CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) +CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m elf32_sparc -soname linux-gate.so.1 #This makes sure the $(obj) subdirectory exists even though vdso32/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d6e1faa28c58..8ef85139553f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1940,6 +1940,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS If unsure, say y. +choice + prompt "TSX enable mode" + depends on CPU_SUP_INTEL + default X86_INTEL_TSX_MODE_OFF + help + Intel's TSX (Transactional Synchronization Extensions) feature + allows to optimize locking protocols through lock elision which + can lead to a noticeable performance boost. + + On the other hand it has been shown that TSX can be exploited + to form side channel attacks (e.g. TAA) and chances are there + will be more of those attacks discovered in the future. + + Therefore TSX is not enabled by default (aka tsx=off). An admin + might override this decision by tsx=on the command line parameter. 
+ Even with TSX enabled, the kernel will attempt to enable the best + possible TAA mitigation setting depending on the microcode available + for the particular machine. + + This option allows to set the default tsx mode between tsx=on, =off + and =auto. See Documentation/admin-guide/kernel-parameters.txt for more + details. + + Say off if not sure, auto if TSX is in use but it should be used on safe + platforms or on if TSX is in use and the security aspect of tsx is not + relevant. + +config X86_INTEL_TSX_MODE_OFF + bool "off" + help + TSX is disabled if possible - equals the tsx=off command line parameter. + +config X86_INTEL_TSX_MODE_ON + bool "on" + help + TSX is always enabled on TSX capable HW - equals the tsx=on command + line parameter. + +config X86_INTEL_TSX_MODE_AUTO + bool "auto" + help + TSX is enabled on TSX capable HW that is believed to be safe against + side channel attacks - equals the tsx=auto command line parameter. +endchoice + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0652d3eed9bd..c4fbe379cc0b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -399,5 +399,7 @@ #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ +#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 24d6598dea29..4fc61483919a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -312,9 +312,12 @@ struct kvm_rmap_head { struct 
kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head lpage_disallowed_link; + bool unsync; u8 mmu_valid_gen; bool mmio_cached; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* * The following two entries are used to key the shadow page in the @@ -859,6 +862,7 @@ struct kvm_arch { */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; + struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; @@ -933,6 +937,7 @@ struct kvm_arch { bool exception_payload_enabled; struct kvm_pmu_event_filter *pmu_event_filter; + struct task_struct *nx_lpage_recovery_thread; }; struct kvm_vm_stat { @@ -946,6 +951,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong nx_lpage_splits; ulong max_mmu_page_hash_collisions; }; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 20ce682a2540..6a3124664289 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -93,6 +93,18 @@ * Microarchitectural Data * Sampling (MDS) vulnerabilities. */ +#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* + * The processor is not susceptible to a + * machine check error due to modifying the + * code page size along with either the + * physical address or cache type + * without TLB invalidation. + */ +#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ +#define ARCH_CAP_TAA_NO BIT(8) /* + * Not susceptible to + * TSX Async Abort (TAA) vulnerabilities. 
+ */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -103,6 +115,10 @@ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e +#define MSR_IA32_TSX_CTRL 0x00000122 +#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ +#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 80bc209c0708..5c24a7b35166 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); #include /** - * mds_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * This uses the otherwise unused and obsolete VERW instruction in * combination with microcode which triggers a CPU buffer flush when the @@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void) } /** - * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * Clear CPU buffers if the corresponding static key is enabled */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6e0a3b43d027..54f5d54280f6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -988,4 +988,11 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 9e2dd2b296cd..2b0faf86da1b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1586,9 +1586,6 @@ static void setup_local_APIC(void) { int cpu = 
smp_processor_id(); unsigned int value; -#ifdef CONFIG_X86_32 - int logical_apicid, ldr_apicid; -#endif if (disable_apic) { disable_ioapic_support(); @@ -1626,16 +1623,21 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches the - * actual value. - */ - logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid); - /* always use the value from LDR */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + if (apic->dest_logical) { + int logical_apicid, ldr_apicid; + + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches + * the actual value. + */ + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + if (logical_apicid != BAD_APICID) + WARN_ON(logical_apicid != ldr_apicid); + /* Always use the value from LDR. 
*/ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + } #endif /* diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d7a1e5a9331c..890f60083eca 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,7 +30,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o +obj-y += intel.o intel_pconfig.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 91c2561b905f..4c7b0fa15a19 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -39,6 +39,7 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); +static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. 
*/ u64 x86_spec_ctrl_base; @@ -105,6 +106,7 @@ void __init check_bugs(void) ssb_select_mitigation(); l1tf_select_mitigation(); mds_select_mitigation(); + taa_select_mitigation(); arch_smt_update(); @@ -268,6 +270,100 @@ static int __init mds_cmdline(char *str) } early_param("mds", mds_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + goto out; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */ + if (taa_mitigation == TAA_MITIGATION_OFF) + goto out; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. 
+ */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + +out: + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt @@ -786,13 +882,10 @@ static void update_mds_branch_idle(void) } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" void cpu_bugs_smt_update(void) { - /* Enhanced IBRS implies STIBP. No update required. 
*/ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) - return; - mutex_lock(&spec_ctrl_mutex); switch (spectre_v2_user) { @@ -819,6 +912,17 @@ void cpu_bugs_smt_update(void) break; } + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1149,6 +1253,9 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -1304,11 +1411,24 @@ static ssize_t l1tf_show_state(char *buf) l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (itlb_multihit_kvm_mitigation) + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sprintf(buf, "KVM: Vulnerable\n"); +} #else static ssize_t l1tf_show_state(char *buf) { return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} #endif static ssize_t mds_show_state(char *buf) @@ -1328,6 +1448,21 @@ static ssize_t mds_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? 
"vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1398,6 +1533,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MDS: return mds_show_state(buf); + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + default: break; } @@ -1434,4 +1575,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_MDS); } + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9ae7d1bcd4f4..fffe21945374 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1016,13 +1016,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -#define NO_SPECULATION BIT(0) -#define NO_MELTDOWN BIT(1) -#define NO_SSB BIT(2) -#define NO_L1TF BIT(3) -#define NO_MDS BIT(4) -#define MSBDS_ONLY BIT(5) -#define NO_SWAPGS BIT(6) +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -1043,27 +1044,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), - 
VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF 
| NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1073,15 +1074,17 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), {} }; @@ -1092,19 +1095,30 @@ static bool __init cpu_matches(unsigned long which) return m && !!(m->driver_data & which); } -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +u64 x86_read_arch_cap_msr(void) { u64 ia32_cap = 0; + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + return ia32_cap; +} + +static void __init 
cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1121,6 +1135,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (!cpu_matches(NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. + * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. 
+ */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + if (cpu_matches(NO_MELTDOWN)) return; @@ -1554,6 +1583,8 @@ void __init identify_boot_cpu(void) #endif cpu_detect_tlb(&boot_cpu_data); setup_cr_pinning(); + + tsx_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c0e2407abdd6..38ab6e115eac 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -44,6 +44,22 @@ struct _tlb_table { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +extern void tsx_enable(void); +extern void tsx_disable(void); +#else +static inline void tsx_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); @@ -62,4 +78,6 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern u64 x86_read_arch_cap_msr(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c2fdc00df163..11d5c5950e2d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -762,6 +762,11 @@ static void init_intel(struct cpuinfo_x86 *c) detect_tme(c); init_intel_misc_features(c); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index efbd54cc4e69..055c8613b531 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c 
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -522,6 +522,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } md.priv = of->kn->priv; resid = md.u.rid; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a46dee8e78db..2e3b06d6bbc6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -461,10 +461,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } rdtgrp = rdtgroup_kn_lock_live(of->kn); - rdt_last_cmd_clear(); if (!rdtgrp) { ret = -ENOENT; - rdt_last_cmd_puts("Directory was removed\n"); goto unlock; } @@ -2648,10 +2646,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, int ret; prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); - rdt_last_cmd_clear(); if (!prdtgrp) { ret = -ENODEV; - rdt_last_cmd_puts("Directory was removed\n"); goto out_unlock; } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 000000000000..3e20d322bc98 --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Pawan Gupta + */ + +#include + +#include + +#include "cpu.h" + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. 
+ */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static bool __init tsx_ctrl_is_supported(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_CTRL_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. 
+ */ + return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); +} + +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + +void __init tsx_init(void) +{ + char arg[5] = {}; + int ret; + + if (!tsx_ctrl_is_supported()) + return; + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + tsx_ctrl_state = x86_get_tsx_auto_mode(); + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("tsx: invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the + * RTM CPUID bit. Clear it here since it is now + * expected to be not set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the + * RTM CPUID bit. Force it here since it is now + * expected to be set. 
+ */ + setup_force_cpu_cap(X86_FEATURE_RTM); + } +} diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 753b8cfe8b8a..87b97897a881 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -94,6 +94,13 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + /* + * Handle the case where stack trace is collected _before_ + * cea_exception_stacks had been initialized. + */ + if (!begin) + return false; + end = begin + sizeof(struct cea_exception_stacks); /* Bail if @stack is outside the exception stack area. */ if (stk < begin || stk >= end) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6f6b1d04dadf..4cba91ec8049 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -710,6 +710,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_INTEL, 0x3ec4, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c59454c382fd..7e322e2daaf5 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1505,6 +1505,9 @@ void __init tsc_init(void) return; } + if (tsc_clocksource_reliable || no_tsc_watchdog) + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 24c23c66b226..2ce9da58611e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,35 @@ #include #include "trace.h" +extern bool itlb_multihit_kvm_mitigation; + +static int 
__read_mostly nx_huge_pages = -1; +#ifdef CONFIG_PREEMPT_RT +/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ +static uint __read_mostly nx_huge_pages_recovery_ratio = 0; +#else +static uint __read_mostly nx_huge_pages_recovery_ratio = 60; +#endif + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); + +static struct kernel_param_ops nx_huge_pages_ops = { + .set = set_nx_huge_pages, + .get = param_get_bool, +}; + +static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { + .set = set_nx_huge_pages_recovery_ratio, + .get = param_get_uint, +}; + +module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); +__MODULE_PARM_TYPE(nx_huge_pages, "bool"); +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, + &nx_huge_pages_recovery_ratio, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -352,6 +382,11 @@ static inline bool spte_ad_need_write_protect(u64 spte) return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; } +static bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + static inline u64 spte_shadow_accessed_mask(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); @@ -1190,6 +1225,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } +static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + if (sp->lpage_disallowed) + return; + + ++kvm->stat.nx_lpage_splits; + list_add_tail(&sp->lpage_disallowed_link, + &kvm->arch.lpage_disallowed_mmu_pages); + sp->lpage_disallowed = true; +} + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -1207,6 +1253,13 @@ static void 
unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } +static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + --kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = false; + list_del(&sp->lpage_disallowed_link); +} + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -2792,6 +2845,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, kvm_reload_remote_mmus(kvm); } + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + sp->role.invalid = 1; return list_unstable; } @@ -3013,6 +3069,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); + if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else @@ -3233,9 +3294,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +{ + int level = *levelp; + u64 spte = *it.sptep; + + if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + is_nx_huge_page_enabled() && + is_shadow_present_pte(spte) && + !is_large_pte(spte)) { + /* + * A small SPTE exists for this pfn, but FNAME(fetch) + * and __direct_map would like to create a large PTE + * instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of + * the address. 
+ */ + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + *pfnp |= gfn & page_mask; + (*levelp)--; + } +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, int map_writable, int level, kvm_pfn_t pfn, - bool prefault) + bool prefault, bool lpage_disallowed) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3248,6 +3332,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &level); + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) break; @@ -3258,6 +3348,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -3306,7 +3398,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * here. 
*/ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; @@ -3550,11 +3642,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level = false; + bool force_pt_level; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + force_pt_level = lpage_disallowed; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { /* @@ -3588,7 +3683,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, + prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4174,6 +4270,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); @@ -4184,8 +4282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); + force_pt_level = + lpage_disallowed || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { if (level > PT_DIRECTORY_LEVEL && @@ -4214,7 +4313,8 @@ static int tdp_page_fault(struct 
kvm_vcpu *vcpu, gva_t gpa, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -5914,9 +6014,9 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && + PageTransCompoundMap(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -6155,10 +6255,59 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } +static bool get_nx_auto_mode(void) +{ + /* Return true when CPU has the bug, and mitigations are ON */ + return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); +} + +static void __set_nx_huge_pages(bool val) +{ + nx_huge_pages = itlb_multihit_kvm_mitigation = val; +} + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) +{ + bool old_val = nx_huge_pages; + bool new_val; + + /* In "auto" mode deploy workaround only if CPU has the bug. 
*/ + if (sysfs_streq(val, "off")) + new_val = 0; + else if (sysfs_streq(val, "force")) + new_val = 1; + else if (sysfs_streq(val, "auto")) + new_val = get_nx_auto_mode(); + else if (strtobool(val, &new_val) < 0) + return -EINVAL; + + __set_nx_huge_pages(new_val); + + if (new_val != old_val) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + mutex_lock(&kvm->slots_lock); + kvm_mmu_zap_all_fast(kvm); + mutex_unlock(&kvm->slots_lock); + + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + } + mutex_unlock(&kvm_lock); + } + + return 0; +} + int kvm_mmu_module_init(void) { int ret = -ENOMEM; + if (nx_huge_pages == -1) + __set_nx_huge_pages(get_nx_auto_mode()); + /* * MMU roles use union aliasing which is, generally speaking, an * undefined behavior. However, we supposedly know how compilers behave @@ -6238,3 +6387,116 @@ void kvm_mmu_module_exit(void) unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); } + +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +{ + unsigned int old_val; + int err; + + old_val = nx_huge_pages_recovery_ratio; + err = param_set_uint(val, kp); + if (err) + return err; + + if (READ_ONCE(nx_huge_pages) && + !old_val && nx_huge_pages_recovery_ratio) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + + mutex_unlock(&kvm_lock); + } + + return err; +} + +static void kvm_recover_nx_lpages(struct kvm *kvm) +{ + int rcu_idx; + struct kvm_mmu_page *sp; + unsigned int ratio; + LIST_HEAD(invalid_list); + ulong to_zap; + + rcu_idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + to_zap = ratio ? 
DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + /* + * We use a separate list instead of just using active_mmu_pages + * because the number of lpage_disallowed pages is expected to + * be relatively small compared to the total. + */ + sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + + if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + if (to_zap) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +} + +static long get_nx_lpage_recovery_timeout(u64 start_time) +{ + return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) + ? start_time + 60 * HZ - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; +} + +static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +{ + u64 start_time; + long remaining_time; + + while (true) { + start_time = get_jiffies_64(); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop() && remaining_time > 0) { + schedule_timeout(remaining_time); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + + if (kthread_should_stop()) + return 0; + + kvm_recover_nx_lpages(kvm); + } +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + int err; + + err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + "kvm-nx-lpage-recovery", + &kvm->arch.nx_lpage_recovery_thread); + if (!err) + kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + + return err; +} + +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.nx_lpage_recovery_thread) 
+ kthread_stop(kvm->arch.nx_lpage_recovery_thread); +} diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 11f8ec89433b..d55674f44a18 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); + +int kvm_mmu_post_init_vm(struct kvm *kvm); +void kvm_mmu_pre_destroy_vm(struct kvm *kvm); + #endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7d5cdb3af594..97b21e7fd013 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -614,13 +614,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - kvm_pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable, bool prefault, + bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; - gfn_t base_gfn; + gfn_t gfn, base_gfn; direct_access = gw->pte_access; @@ -665,13 +666,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - base_gfn = gw->gfn; + /* + * FNAME(page_fault) might have clobbered the bottom bits of + * gw->gfn, restore them from the virtual address. + */ + gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); + base_gfn = gfn; trace_kvm_mmu_spte_requested(addr, gw->level, pfn); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. 
+ */ + disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == hlevel) break; @@ -683,6 +696,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -759,9 +774,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + bool force_pt_level = lpage_disallowed; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -851,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!force_pt_level) transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault); + level, pfn, map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5d21a4ab28cf..04a8212704c1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1268,6 +1268,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; + /* + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block is the one expected to change PID.NDST and the + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up + * correctly. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + goto after_clear_sn; + } + /* The full case. 
*/ do { old.control = new.control = pi_desc->control; @@ -1283,6 +1295,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); +after_clear_sn: + /* * Clear SN before reading the bitmap. The VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 in the @@ -1291,7 +1305,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) */ smp_mb__after_atomic(); - if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + if (!pi_is_pir_empty(pi_desc)) pi_set_on(pi_desc); } @@ -6137,7 +6151,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); @@ -6167,7 +6181,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) { - return pi_test_on(vcpu_to_pi_desc(vcpu)); + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bee16687dc0b..5a0f34b1e226 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -355,6 +355,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + static inline void pi_set_sn(struct pi_desc *pi_desc) { set_bit(POSTED_INTR_SN, @@ -373,6 +378,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } 
+static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff395f812719..5d530521f11d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -213,6 +213,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages, .mode = 0444) }, + { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } @@ -1132,13 +1133,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * - * This list is modified at module load time to reflect the + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) + * extract the supported MSRs from the related const lists. + * msrs_to_save is selected from the msrs_to_save_all to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs * may depend on host virtualization features rather than host cpu features. 
*/ -static u32 msrs_to_save[] = { +static const u32 msrs_to_save_all[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1179,9 +1182,10 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, }; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { +static const u32 emulated_msrs_all[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, @@ -1220,7 +1224,7 @@ static u32 emulated_msrs[] = { * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. * We always support the "true" VMX control MSRs, even if the host * processor does not, so I am putting these registers here rather - * than in msrs_to_save. + * than in msrs_to_save_all. */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, @@ -1239,13 +1243,14 @@ static u32 emulated_msrs[] = { MSR_KVM_POLL_CONTROL, }; +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; static unsigned num_emulated_msrs; /* * List of msr numbers which are used to expose MSR-based features that * can be used by a hypervisor to validate requested CPU features. */ -static u32 msr_based_features[] = { +static const u32 msr_based_features_all[] = { MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, MSR_IA32_VMX_PINBASED_CTLS, @@ -1270,6 +1275,7 @@ static u32 msr_based_features[] = { MSR_IA32_ARCH_CAPABILITIES, }; +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; static u64 kvm_get_arch_capabilities(void) @@ -1279,6 +1285,14 @@ static u64 kvm_get_arch_capabilities(void) if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); + /* + * If nx_huge_pages is enabled, KVM's shadow paging will ensure that + * the nested hypervisor runs with NX huge pages. 
If it is not, + * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other + * L1 guests, so it need not worry about its own (L2) guests. + */ + data |= ARCH_CAP_PSCHANGE_MC_NO; + /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. @@ -1298,6 +1312,25 @@ static u64 kvm_get_arch_capabilities(void) if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; + /* + * On TAA affected systems, export MDS_NO=0 when: + * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1. + * - Updated microcode is present. This is detected by + * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures + * that VERW clears CPU buffers. + * + * When MDS_NO=0 is exported, guests deploy clear CPU buffer + * mitigation and don't complain: + * + * "Vulnerable: Clear CPU buffers attempted, no microcode" + * + * If TSX is disabled on the system, guests are also mitigated against + * TAA and clear CPU buffer mitigation is not required for guests. + */ + if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) && + (data & ARCH_CAP_TSX_CTRL_MSR)) + data &= ~ARCH_CAP_MDS_NO; + return data; } @@ -5090,22 +5123,26 @@ static void kvm_init_msr_list(void) { struct x86_pmu_capability x86_pmu; u32 dummy[2]; - unsigned i, j; + unsigned i; BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, - "Please update the fixed PMCs in msrs_to_save[]"); + "Please update the fixed PMCs in msrs_to_saved_all[]"); perf_get_x86_pmu_capability(&x86_pmu); - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + num_msrs_to_save = 0; + num_emulated_msrs = 0; + num_msr_based_features = 0; + + for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { + if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) continue; /* * Even MSRs that are valid in the host may not be exposed * to the guests in some cases. 
*/ - switch (msrs_to_save[i]) { + switch (msrs_to_save_all[i]) { case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) continue; @@ -5133,17 +5170,17 @@ static void kvm_init_msr_list(void) break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { if (!kvm_x86_ops->pt_supported() || - msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >= + msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: - if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >= + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: - if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; } @@ -5151,34 +5188,25 @@ static void kvm_init_msr_list(void) break; } - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; + msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; } - num_msrs_to_save = j; - for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) + for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { + if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i])) continue; - if (j < i) - emulated_msrs[j] = emulated_msrs[i]; - j++; + emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } - num_emulated_msrs = j; - for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) { + for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { struct kvm_msr_entry msr; - msr.index = msr_based_features[i]; + msr.index = msr_based_features_all[i]; if (kvm_get_msr_feature(&msr)) continue; - if (j < i) - msr_based_features[j] = msr_based_features[i]; - j++; + msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; } - num_msr_based_features = j; } static int 
vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -9428,6 +9456,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -9456,6 +9485,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return kvm_x86_ops->vm_init(kvm); } +int kvm_arch_post_init_vm(struct kvm *kvm) +{ + return kvm_mmu_post_init_vm(kvm); +} + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); @@ -9557,6 +9591,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(x86_set_memory_region); +void kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ + kvm_mmu_pre_destroy_vm(kvm); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0319d6339822..0c6214497fcc 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2713,6 +2713,28 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } } + +static +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* + * To prevent bfqq's service guarantees from being violated, + * bfqq may be left busy, i.e., queued for service, even if + * empty (see comments in __bfq_bfqq_expire() for + * details). But, if no process will send requests to bfqq any + * longer, then there is no point in keeping bfqq queued for + * service. In addition, keeping bfqq queued for service, but + * with no process ref any longer, may have caused bfqq to be + * freed when dequeued from service. But this is assumed to + * never happen. 
+ */ + if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && + bfqq != bfqd->in_service_queue) + bfq_del_bfqq_busy(bfqd, bfqq, false); + + bfq_put_queue(bfqq); +} + static void bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) @@ -2783,8 +2805,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, */ new_bfqq->pid = -1; bfqq->bic = NULL; - /* release process reference to bfqq */ - bfq_put_queue(bfqq); + bfq_release_process_ref(bfqd, bfqq); } static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, @@ -4899,7 +4920,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_cooperator(bfqq); - bfq_put_queue(bfqq); /* release process reference */ + bfq_release_process_ref(bfqd, bfqq); } static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) @@ -5001,8 +5022,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { - /* release process reference on this queue */ - bfq_put_queue(bfqq); + bfq_release_process_ref(bfqd, bfqq); bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); bic_set_bfqq(bic, bfqq, false); } @@ -5963,7 +5983,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) bfq_put_cooperator(bfqq); - bfq_put_queue(bfqq); + bfq_release_process_ref(bfqq->bfqd, bfqq); return NULL; } diff --git a/block/bio.c b/block/bio.c index 8f0ed6228fc5..b1170ec18464 100644 --- a/block/bio.c +++ b/block/bio.c @@ -751,7 +751,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return false; - if (bio->bi_vcnt > 0) { + if (bio->bi_vcnt > 0 && !bio_full(bio, len)) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (page_is_mergeable(bv, page, len, off, same_page)) { diff --git a/block/blk-iocost.c b/block/blk-iocost.c index a7ed434eae03..e01267f99183 100644 --- a/block/blk-iocost.c +++ 
b/block/blk-iocost.c @@ -1057,9 +1057,12 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) atomic64_set(&iocg->active_period, cur_period); /* already activated or breaking leaf-only constraint? */ - for (i = iocg->level; i > 0; i--) - if (!list_empty(&iocg->active_list)) + if (!list_empty(&iocg->active_list)) + goto succeed_unlock; + for (i = iocg->level - 1; i > 0; i--) + if (!list_empty(&iocg->ancestors[i]->active_list)) goto fail_unlock; + if (iocg->child_active_sum) goto fail_unlock; @@ -1101,6 +1104,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) ioc_start_period(ioc, now); } +succeed_unlock: spin_unlock_irq(&ioc->lock); return true; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index cc37511de866..6265871a4af2 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -554,12 +554,27 @@ ssize_t __weak cpu_show_mds(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_tsx_async_abort(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +ssize_t __weak cpu_show_itlb_multihit(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); +static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); +static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -568,6 +583,8 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_spec_store_bypass.attr, 
&dev_attr_l1tf.attr, &dev_attr_mds.attr, + &dev_attr_tsx_async_abort.attr, + &dev_attr_itlb_multihit.attr, NULL }; diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 55907c27075b..84c4e1f72cbd 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -872,3 +872,39 @@ int walk_memory_blocks(unsigned long start, unsigned long size, } return ret; } + +struct for_each_memory_block_cb_data { + walk_memory_blocks_func_t func; + void *arg; +}; + +static int for_each_memory_block_cb(struct device *dev, void *data) +{ + struct memory_block *mem = to_memory_block(dev); + struct for_each_memory_block_cb_data *cb_data = data; + + return cb_data->func(mem, cb_data->arg); +} + +/** + * for_each_memory_block - walk through all present memory blocks + * + * @arg: argument passed to func + * @func: callback for each memory block walked + * + * This function walks through all present memory blocks, calling func on + * each memory block. + * + * In case func() returns an error, walking is aborted and the error is + * returned. 
+ */ +int for_each_memory_block(void *arg, walk_memory_blocks_func_t func) +{ + struct for_each_memory_block_cb_data cb_data = { + .func = func, + .arg = arg, + }; + + return bus_for_each_dev(&memory_subsys, NULL, &cb_data, + for_each_memory_block_cb); +} diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 39136675dae5..13527a0b4e44 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2087,7 +2087,7 @@ static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req, struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; struct ceph_osd_data *osd_data; u64 objno; - u8 state, new_state, current_state; + u8 state, new_state, uninitialized_var(current_state); bool has_current_state; void *p; diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 76b73ddf8fd7..10f6368117d8 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -1000,8 +1000,10 @@ static void rsxx_pci_remove(struct pci_dev *dev) cancel_work_sync(&card->event_work); + destroy_workqueue(card->event_wq); rsxx_destroy_dev(card); rsxx_dma_destroy(card); + destroy_workqueue(card->creg_ctrl.creg_wq); spin_lock_irqsave(&card->irq_lock, flags); rsxx_disable_ier_and_isr(card, CR_INTR_ALL); diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 80b850ef1bf6..8d53b8ef545c 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -422,9 +421,7 @@ static int hwrng_fillfn(void *unused) { long rc; - set_freezable(); - - while (!kthread_freezable_should_stop(NULL)) { + while (!kthread_should_stop()) { struct hwrng *rng; rng = get_current_rng(); diff --git a/drivers/char/random.c b/drivers/char/random.c index de434feb873a..01b8868b9bed 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -327,7 +327,6 @@ #include #include #include -#include #include #include #include @@ -2500,8 +2499,7 @@ void 
add_hwgenerator_randomness(const char *buffer, size_t count, * We'll be woken up again once below random_write_wakeup_thresh, * or when the calling thread is about to terminate. */ - wait_event_freezable(random_write_wait, - kthread_should_stop() || + wait_event_interruptible(random_write_wait, kthread_should_stop() || ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits); mix_pool_bytes(poolp, buffer, count); credit_entropy_bits(poolp, entropy); diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c index 354b27d14a19..62812f80b5cc 100644 --- a/drivers/clocksource/sh_mtu2.c +++ b/drivers/clocksource/sh_mtu2.c @@ -328,12 +328,13 @@ static int sh_mtu2_register(struct sh_mtu2_channel *ch, const char *name) return 0; } +static const unsigned int sh_mtu2_channel_offsets[] = { + 0x300, 0x380, 0x000, +}; + static int sh_mtu2_setup_channel(struct sh_mtu2_channel *ch, unsigned int index, struct sh_mtu2_device *mtu) { - static const unsigned int channel_offsets[] = { - 0x300, 0x380, 0x000, - }; char name[6]; int irq; int ret; @@ -356,7 +357,7 @@ static int sh_mtu2_setup_channel(struct sh_mtu2_channel *ch, unsigned int index, return ret; } - ch->base = mtu->mapbase + channel_offsets[index]; + ch->base = mtu->mapbase + sh_mtu2_channel_offsets[index]; ch->index = index; return sh_mtu2_register(ch, dev_name(&mtu->pdev->dev)); @@ -408,7 +409,12 @@ static int sh_mtu2_setup(struct sh_mtu2_device *mtu, } /* Allocate and setup the channels. 
*/ - mtu->num_channels = 3; + ret = platform_irq_count(pdev); + if (ret < 0) + goto err_unmap; + + mtu->num_channels = min_t(unsigned int, ret, + ARRAY_SIZE(sh_mtu2_channel_offsets)); mtu->channels = kcalloc(mtu->num_channels, sizeof(*mtu->channels), GFP_KERNEL); diff --git a/drivers/clocksource/timer-mediatek.c b/drivers/clocksource/timer-mediatek.c index a562f491b0f8..9318edcd8963 100644 --- a/drivers/clocksource/timer-mediatek.c +++ b/drivers/clocksource/timer-mediatek.c @@ -268,15 +268,12 @@ static int __init mtk_syst_init(struct device_node *node) ret = timer_of_init(node, &to); if (ret) - goto err; + return ret; clockevents_config_and_register(&to.clkevt, timer_of_rate(&to), TIMER_SYNC_TICKS, 0xffffffff); return 0; -err: - timer_of_cleanup(&to); - return ret; } static int __init mtk_gpt_init(struct device_node *node) @@ -293,7 +290,7 @@ static int __init mtk_gpt_init(struct device_node *node) ret = timer_of_init(node, &to); if (ret) - goto err; + return ret; /* Configure clock source */ mtk_gpt_setup(&to, TIMER_CLK_SRC, GPT_CTRL_OP_FREERUN); @@ -311,9 +308,6 @@ static int __init mtk_gpt_init(struct device_node *node) mtk_gpt_enable_irq(&to, TIMER_CLK_EVT); return 0; -err: - timer_of_cleanup(&to); - return ret; } TIMER_OF_DECLARE(mtk_mt6577, "mediatek,mt6577-timer", mtk_gpt_init); TIMER_OF_DECLARE(mtk_mt6765, "mediatek,mt6765-timer", mtk_syst_init); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 4d71537a960d..a46090071034 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -950,21 +950,7 @@ static void psp_print_fw_hdr(struct psp_context *psp, struct amdgpu_firmware_info *ucode) { struct amdgpu_device *adev = psp->adev; - const struct sdma_firmware_header_v1_0 *sdma_hdr = - (const struct sdma_firmware_header_v1_0 *) - adev->sdma.instance[ucode->ucode_id - AMDGPU_UCODE_ID_SDMA0].fw->data; - const struct gfx_firmware_header_v1_0 *ce_hdr = - (const struct 
gfx_firmware_header_v1_0 *)adev->gfx.ce_fw->data; - const struct gfx_firmware_header_v1_0 *pfp_hdr = - (const struct gfx_firmware_header_v1_0 *)adev->gfx.pfp_fw->data; - const struct gfx_firmware_header_v1_0 *me_hdr = - (const struct gfx_firmware_header_v1_0 *)adev->gfx.me_fw->data; - const struct gfx_firmware_header_v1_0 *mec_hdr = - (const struct gfx_firmware_header_v1_0 *)adev->gfx.mec_fw->data; - const struct rlc_firmware_header_v2_0 *rlc_hdr = - (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data; - const struct smc_firmware_header_v1_0 *smc_hdr = - (const struct smc_firmware_header_v1_0 *)adev->pm.fw->data; + struct common_firmware_header *hdr; switch (ucode->ucode_id) { case AMDGPU_UCODE_ID_SDMA0: @@ -975,25 +961,33 @@ static void psp_print_fw_hdr(struct psp_context *psp, case AMDGPU_UCODE_ID_SDMA5: case AMDGPU_UCODE_ID_SDMA6: case AMDGPU_UCODE_ID_SDMA7: - amdgpu_ucode_print_sdma_hdr(&sdma_hdr->header); + hdr = (struct common_firmware_header *) + adev->sdma.instance[ucode->ucode_id - AMDGPU_UCODE_ID_SDMA0].fw->data; + amdgpu_ucode_print_sdma_hdr(hdr); break; case AMDGPU_UCODE_ID_CP_CE: - amdgpu_ucode_print_gfx_hdr(&ce_hdr->header); + hdr = (struct common_firmware_header *)adev->gfx.ce_fw->data; + amdgpu_ucode_print_gfx_hdr(hdr); break; case AMDGPU_UCODE_ID_CP_PFP: - amdgpu_ucode_print_gfx_hdr(&pfp_hdr->header); + hdr = (struct common_firmware_header *)adev->gfx.pfp_fw->data; + amdgpu_ucode_print_gfx_hdr(hdr); break; case AMDGPU_UCODE_ID_CP_ME: - amdgpu_ucode_print_gfx_hdr(&me_hdr->header); + hdr = (struct common_firmware_header *)adev->gfx.me_fw->data; + amdgpu_ucode_print_gfx_hdr(hdr); break; case AMDGPU_UCODE_ID_CP_MEC1: - amdgpu_ucode_print_gfx_hdr(&mec_hdr->header); + hdr = (struct common_firmware_header *)adev->gfx.mec_fw->data; + amdgpu_ucode_print_gfx_hdr(hdr); break; case AMDGPU_UCODE_ID_RLC_G: - amdgpu_ucode_print_rlc_hdr(&rlc_hdr->header); + hdr = (struct common_firmware_header *)adev->gfx.rlc_fw->data; + 
amdgpu_ucode_print_rlc_hdr(hdr); break; case AMDGPU_UCODE_ID_SMC: - amdgpu_ucode_print_smc_hdr(&smc_hdr->header); + hdr = (struct common_firmware_header *)adev->pm.fw->data; + amdgpu_ucode_print_smc_hdr(hdr); break; default: break; diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c index 12099760d99e..c002f234ff31 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power.c +++ b/drivers/gpu/drm/i915/display/intel_display_power.c @@ -4896,6 +4896,9 @@ void intel_power_domains_init_hw(struct drm_i915_private *i915, bool resume) power_domains->initializing = true; + /* Must happen before power domain init on VLV/CHV */ + intel_update_rawclk(i915); + if (INTEL_GEN(i915) >= 11) { icl_display_core_init(i915, resume); } else if (IS_CANNONLAKE(i915)) { diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 1cdfe05514c3..e41fd94ae5a9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -319,6 +319,8 @@ static void i915_gem_context_free(struct i915_gem_context *ctx) free_engines(rcu_access_pointer(ctx->engines)); mutex_destroy(&ctx->engines_mutex); + kfree(ctx->jump_whitelist); + if (ctx->timeline) intel_timeline_put(ctx->timeline); @@ -441,6 +443,9 @@ __create_context(struct drm_i915_private *i915) for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++) ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES; + ctx->jump_whitelist = NULL; + ctx->jump_whitelist_cmds = 0; + return ctx; err_free: diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index 260d59cc3de8..00537b9d7006 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -192,6 +192,13 @@ struct i915_gem_context { * per vm, which may be one per context or shared with the global GTT) */ struct 
radix_tree_root handles_vma; + + /** jump_whitelist: Bit array for tracking cmds during cmdparsing + * Guarded by struct_mutex + */ + unsigned long *jump_whitelist; + /** jump_whitelist_cmds: No of cmd slots available */ + u32 jump_whitelist_cmds; }; #endif /* __I915_GEM_CONTEXT_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index b5f6937369ea..e635e1e5f4d3 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -296,7 +296,9 @@ static inline u64 gen8_noncanonical_addr(u64 address) static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) { - return intel_engine_needs_cmd_parser(eb->engine) && eb->batch_len; + return intel_engine_requires_cmd_parser(eb->engine) || + (intel_engine_using_cmd_parser(eb->engine) && + eb->args->batch_len); } static int eb_create(struct i915_execbuffer *eb) @@ -1955,40 +1957,94 @@ static int i915_reset_gen7_sol_offsets(struct i915_request *rq) return 0; } -static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master) +static struct i915_vma * +shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj) +{ + struct drm_i915_private *dev_priv = eb->i915; + struct i915_vma * const vma = *eb->vma; + struct i915_address_space *vm; + u64 flags; + + /* + * PPGTT backed shadow buffers must be mapped RO, to prevent + * post-scan tampering + */ + if (CMDPARSER_USES_GGTT(dev_priv)) { + flags = PIN_GLOBAL; + vm = &dev_priv->ggtt.vm; + } else if (vma->vm->has_read_only) { + flags = PIN_USER; + vm = vma->vm; + i915_gem_object_set_readonly(obj); + } else { + DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n"); + return ERR_PTR(-EINVAL); + } + + return i915_gem_object_pin(obj, vm, NULL, 0, 0, flags); +} + +static struct i915_vma *eb_parse(struct i915_execbuffer *eb) { struct intel_engine_pool_node *pool; struct i915_vma *vma; + u64 batch_start; + u64 
shadow_batch_start; int err; pool = intel_engine_pool_get(&eb->engine->pool, eb->batch_len); if (IS_ERR(pool)) return ERR_CAST(pool); - err = intel_engine_cmd_parser(eb->engine, + vma = shadow_batch_pin(eb, pool->obj); + if (IS_ERR(vma)) + goto err; + + batch_start = gen8_canonical_addr(eb->batch->node.start) + + eb->batch_start_offset; + + shadow_batch_start = gen8_canonical_addr(vma->node.start); + + err = intel_engine_cmd_parser(eb->gem_context, + eb->engine, eb->batch->obj, - pool->obj, + batch_start, eb->batch_start_offset, eb->batch_len, - is_master); + pool->obj, + shadow_batch_start); + if (err) { - if (err == -EACCES) /* unhandled chained batch */ + i915_vma_unpin(vma); + + /* + * Unsafe GGTT-backed buffers can still be submitted safely + * as non-secure. + * For PPGTT backing however, we have no choice but to forcibly + * reject unsafe buffers + */ + if (CMDPARSER_USES_GGTT(eb->i915) && (err == -EACCES)) + /* Execute original buffer non-secure */ vma = NULL; else vma = ERR_PTR(err); goto err; } - vma = i915_gem_object_ggtt_pin(pool->obj, NULL, 0, 0, 0); - if (IS_ERR(vma)) - goto err; - eb->vma[eb->buffer_count] = i915_vma_get(vma); eb->flags[eb->buffer_count] = __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF; vma->exec_flags = &eb->flags[eb->buffer_count]; eb->buffer_count++; + eb->batch_start_offset = 0; + eb->batch = vma; + + if (CMDPARSER_USES_GGTT(eb->i915)) + eb->batch_flags |= I915_DISPATCH_SECURE; + + /* eb->batch_len unchanged */ + vma->private = pool; return vma; @@ -2421,6 +2477,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, struct drm_i915_gem_exec_object2 *exec, struct drm_syncobj **fences) { + struct drm_i915_private *i915 = to_i915(dev); struct i915_execbuffer eb; struct dma_fence *in_fence = NULL; struct dma_fence *exec_fence = NULL; @@ -2432,7 +2489,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & ~__EXEC_OBJECT_UNKNOWN_FLAGS); - eb.i915 = to_i915(dev); + eb.i915 = i915; eb.file = file; 
eb.args = args; if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) @@ -2452,8 +2509,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, eb.batch_flags = 0; if (args->flags & I915_EXEC_SECURE) { + if (INTEL_GEN(i915) >= 11) + return -ENODEV; + + /* Return -EPERM to trigger fallback code on old binaries. */ + if (!HAS_SECURE_BATCHES(i915)) + return -EPERM; + if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) - return -EPERM; + return -EPERM; eb.batch_flags |= I915_DISPATCH_SECURE; } @@ -2530,34 +2594,19 @@ i915_gem_do_execbuffer(struct drm_device *dev, goto err_vma; } + if (eb.batch_len == 0) + eb.batch_len = eb.batch->size - eb.batch_start_offset; + if (eb_use_cmdparser(&eb)) { struct i915_vma *vma; - vma = eb_parse(&eb, drm_is_current_master(file)); + vma = eb_parse(&eb); if (IS_ERR(vma)) { err = PTR_ERR(vma); goto err_vma; } - - if (vma) { - /* - * Batch parsed and accepted: - * - * Set the DISPATCH_SECURE bit to remove the NON_SECURE - * bit from MI_BATCH_BUFFER_START commands issued in - * the dispatch_execbuffer implementations. We - * specifically don't want that set on batches the - * command parser has accepted. - */ - eb.batch_flags |= I915_DISPATCH_SECURE; - eb.batch_start_offset = 0; - eb.batch = vma; - } } - if (eb.batch_len == 0) - eb.batch_len = eb.batch->size - eb.batch_start_offset; - /* * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure * batch" bit. Hence we need to pin secure batches into the global gtt. 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index a82cea95c2f2..9dd8c299cb2d 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -475,12 +475,13 @@ struct intel_engine_cs { struct intel_engine_hangcheck hangcheck; -#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) +#define I915_ENGINE_USING_CMD_PARSER BIT(0) #define I915_ENGINE_SUPPORTS_STATS BIT(1) #define I915_ENGINE_HAS_PREEMPTION BIT(2) #define I915_ENGINE_HAS_SEMAPHORES BIT(3) #define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4) #define I915_ENGINE_IS_VIRTUAL BIT(5) +#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(7) unsigned int flags; /* @@ -541,9 +542,15 @@ struct intel_engine_cs { }; static inline bool -intel_engine_needs_cmd_parser(const struct intel_engine_cs *engine) +intel_engine_using_cmd_parser(const struct intel_engine_cs *engine) { - return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER; + return engine->flags & I915_ENGINE_USING_CMD_PARSER; +} + +static inline bool +intel_engine_requires_cmd_parser(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_REQUIRES_CMD_PARSER; } static inline bool diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c index 1363e069ec83..fac75afed35b 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -38,6 +38,9 @@ static int __gt_unpark(struct intel_wakeref *wf) gt->awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ); GEM_BUG_ON(!gt->awake); + if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) + intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL); + intel_enable_gt_powersave(i915); i915_update_gfx_val(i915); @@ -67,6 +70,11 @@ static int __gt_park(struct intel_wakeref *wf) if (INTEL_GEN(i915) >= 6) gen6_rps_idle(i915); + if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) { + i915_rc6_ctx_wa_check(i915); + intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); + } + 
/* Everything switched off, flush any residual interrupt just in case */ intel_synchronize_irq(i915); diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index 728704bbbe18..cea184a7dde9 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -199,14 +199,6 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { MOCS_ENTRY(15, \ LE_3_WB | LE_TC_1_LLC | LE_LRUM(2) | LE_AOM(1), \ L3_3_WB), \ - /* Bypass LLC - Uncached (EHL+) */ \ - MOCS_ENTRY(16, \ - LE_1_UC | LE_TC_1_LLC | LE_SCF(1), \ - L3_1_UC), \ - /* Bypass LLC - L3 (Read-Only) (EHL+) */ \ - MOCS_ENTRY(17, \ - LE_1_UC | LE_TC_1_LLC | LE_SCF(1), \ - L3_3_WB), \ /* Self-Snoop - L3 + LLC */ \ MOCS_ENTRY(18, \ LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SSE(3), \ @@ -270,7 +262,7 @@ static const struct drm_i915_mocs_entry tigerlake_mocs_table[] = { L3_1_UC), /* HW Special Case (Displayable) */ MOCS_ENTRY(61, - LE_1_UC | LE_TC_1_LLC | LE_SCF(1), + LE_1_UC | LE_TC_1_LLC, L3_3_WB), }; diff --git a/drivers/gpu/drm/i915/gvt/dmabuf.c b/drivers/gpu/drm/i915/gvt/dmabuf.c index 13044c027f27..4bfaefdf548d 100644 --- a/drivers/gpu/drm/i915/gvt/dmabuf.c +++ b/drivers/gpu/drm/i915/gvt/dmabuf.c @@ -498,8 +498,6 @@ int intel_vgpu_get_dmabuf(struct intel_vgpu *vgpu, unsigned int dmabuf_id) goto out_free_gem; } - i915_gem_object_put(obj); - ret = dma_buf_fd(dmabuf, DRM_CLOEXEC | DRM_RDWR); if (ret < 0) { gvt_vgpu_err("create dma-buf fd failed ret:%d\n", ret); @@ -524,6 +522,8 @@ int intel_vgpu_get_dmabuf(struct intel_vgpu *vgpu, unsigned int dmabuf_id) file_count(dmabuf->file), kref_read(&obj->base.refcount)); + i915_gem_object_put(obj); + return dmabuf_fd; out_free_dmabuf: diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 24555102e198..f24096e27bef 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -53,13 +53,11 @@ * granting userspace undue 
privileges. There are three categories of privilege. * * First, commands which are explicitly defined as privileged or which should - * only be used by the kernel driver. The parser generally rejects such - * commands, though it may allow some from the drm master process. + * only be used by the kernel driver. The parser rejects such commands * * Second, commands which access registers. To support correct/enhanced * userspace functionality, particularly certain OpenGL extensions, the parser - * provides a whitelist of registers which userspace may safely access (for both - * normal and drm master processes). + * provides a whitelist of registers which userspace may safely access * * Third, commands which access privileged memory (i.e. GGTT, HWS page, etc). * The parser always rejects such commands. @@ -84,9 +82,9 @@ * in the per-engine command tables. * * Other command table entries map fairly directly to high level categories - * mentioned above: rejected, master-only, register whitelist. The parser - * implements a number of checks, including the privileged memory checks, via a - * general bitmasking mechanism. + * mentioned above: rejected, register whitelist. The parser implements a number + * of checks, including the privileged memory checks, via a general bitmasking + * mechanism. */ /* @@ -104,8 +102,6 @@ struct drm_i915_cmd_descriptor { * CMD_DESC_REJECT: The command is never allowed * CMD_DESC_REGISTER: The command should be checked against the * register whitelist for the appropriate ring - * CMD_DESC_MASTER: The command is allowed if the submitting process - * is the DRM master */ u32 flags; #define CMD_DESC_FIXED (1<<0) @@ -113,7 +109,6 @@ struct drm_i915_cmd_descriptor { #define CMD_DESC_REJECT (1<<2) #define CMD_DESC_REGISTER (1<<3) #define CMD_DESC_BITMASK (1<<4) -#define CMD_DESC_MASTER (1<<5) /* * The command's unique identification bits and the bitmask to get them. 
@@ -194,7 +189,7 @@ struct drm_i915_cmd_table { #define CMD(op, opm, f, lm, fl, ...) \ { \ .flags = (fl) | ((f) ? CMD_DESC_FIXED : 0), \ - .cmd = { (op), ~0u << (opm) }, \ + .cmd = { (op & ~0u << (opm)), ~0u << (opm) }, \ .length = { (lm) }, \ __VA_ARGS__ \ } @@ -209,14 +204,13 @@ struct drm_i915_cmd_table { #define R CMD_DESC_REJECT #define W CMD_DESC_REGISTER #define B CMD_DESC_BITMASK -#define M CMD_DESC_MASTER /* Command Mask Fixed Len Action ---------------------------------------------------------- */ -static const struct drm_i915_cmd_descriptor common_cmds[] = { +static const struct drm_i915_cmd_descriptor gen7_common_cmds[] = { CMD( MI_NOOP, SMI, F, 1, S ), CMD( MI_USER_INTERRUPT, SMI, F, 1, R ), - CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, M ), + CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, R ), CMD( MI_ARB_CHECK, SMI, F, 1, S ), CMD( MI_REPORT_HEAD, SMI, F, 1, S ), CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ), @@ -246,7 +240,7 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = { CMD( MI_BATCH_BUFFER_START, SMI, !F, 0xFF, S ), }; -static const struct drm_i915_cmd_descriptor render_cmds[] = { +static const struct drm_i915_cmd_descriptor gen7_render_cmds[] = { CMD( MI_FLUSH, SMI, F, 1, S ), CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), CMD( MI_PREDICATE, SMI, F, 1, S ), @@ -313,7 +307,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = { CMD( MI_URB_ATOMIC_ALLOC, SMI, F, 1, S ), CMD( MI_SET_APPID, SMI, F, 1, S ), CMD( MI_RS_CONTEXT, SMI, F, 1, S ), - CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ), + CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ), CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W, .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ), @@ -330,7 +324,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = { CMD( GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS, S3D, !F, 0x1FF, S ), }; -static const struct drm_i915_cmd_descriptor video_cmds[] = { +static const struct drm_i915_cmd_descriptor 
gen7_video_cmds[] = { CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), CMD( MI_SET_APPID, SMI, F, 1, S ), CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B, @@ -374,7 +368,7 @@ static const struct drm_i915_cmd_descriptor video_cmds[] = { CMD( MFX_WAIT, SMFX, F, 1, S ), }; -static const struct drm_i915_cmd_descriptor vecs_cmds[] = { +static const struct drm_i915_cmd_descriptor gen7_vecs_cmds[] = { CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), CMD( MI_SET_APPID, SMI, F, 1, S ), CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B, @@ -412,7 +406,7 @@ static const struct drm_i915_cmd_descriptor vecs_cmds[] = { }}, ), }; -static const struct drm_i915_cmd_descriptor blt_cmds[] = { +static const struct drm_i915_cmd_descriptor gen7_blt_cmds[] = { CMD( MI_DISPLAY_FLIP, SMI, !F, 0xFF, R ), CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, B, .bits = {{ @@ -446,10 +440,64 @@ static const struct drm_i915_cmd_descriptor blt_cmds[] = { }; static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = { - CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ), + CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ), CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), }; +/* + * For Gen9 we can still rely on the h/w to enforce cmd security, and only + * need to re-enforce the register access checks. We therefore only need to + * teach the cmdparser how to find the end of each command, and identify + * register accesses. The table doesn't need to reject any commands, and so + * the only commands listed here are: + * 1) Those that touch registers + * 2) Those that do not have the default 8-bit length + * + * Note that the default MI length mask chosen for this table is 0xFF, not + * the 0x3F used on older devices. This is because the vast majority of MI + * cmds on Gen9 use a standard 8-bit Length field. + * All the Gen9 blitter instructions are standard 0xFF length mask, and + * none allow access to non-general registers, so in fact no BLT cmds are + * included in the table at all. 
+ * + */ +static const struct drm_i915_cmd_descriptor gen9_blt_cmds[] = { + CMD( MI_NOOP, SMI, F, 1, S ), + CMD( MI_USER_INTERRUPT, SMI, F, 1, S ), + CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, S ), + CMD( MI_FLUSH, SMI, F, 1, S ), + CMD( MI_ARB_CHECK, SMI, F, 1, S ), + CMD( MI_REPORT_HEAD, SMI, F, 1, S ), + CMD( MI_ARB_ON_OFF, SMI, F, 1, S ), + CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ), + CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, S ), + CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, S ), + CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, S ), + CMD( MI_LOAD_REGISTER_IMM(1), SMI, !F, 0xFF, W, + .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 2 } ), + CMD( MI_UPDATE_GTT, SMI, !F, 0x3FF, S ), + CMD( MI_STORE_REGISTER_MEM_GEN8, SMI, F, 4, W, + .reg = { .offset = 1, .mask = 0x007FFFFC } ), + CMD( MI_FLUSH_DW, SMI, !F, 0x3F, S ), + CMD( MI_LOAD_REGISTER_MEM_GEN8, SMI, F, 4, W, + .reg = { .offset = 1, .mask = 0x007FFFFC } ), + CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W, + .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ), + + /* + * We allow BB_START but apply further checks. We just sanitize the + * basic fields here. 
+ */ +#define MI_BB_START_OPERAND_MASK GENMASK(SMI-1, 0) +#define MI_BB_START_OPERAND_EXPECT (MI_BATCH_PPGTT_HSW | 1) + CMD( MI_BATCH_BUFFER_START_GEN8, SMI, !F, 0xFF, B, + .bits = {{ + .offset = 0, + .mask = MI_BB_START_OPERAND_MASK, + .expected = MI_BB_START_OPERAND_EXPECT, + }}, ), +}; + static const struct drm_i915_cmd_descriptor noop_desc = CMD(MI_NOOP, SMI, F, 1, S); @@ -463,40 +511,44 @@ static const struct drm_i915_cmd_descriptor noop_desc = #undef R #undef W #undef B -#undef M -static const struct drm_i915_cmd_table gen7_render_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { render_cmds, ARRAY_SIZE(render_cmds) }, +static const struct drm_i915_cmd_table gen7_render_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) }, }; -static const struct drm_i915_cmd_table hsw_render_ring_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { render_cmds, ARRAY_SIZE(render_cmds) }, +static const struct drm_i915_cmd_table hsw_render_ring_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) }, { hsw_render_cmds, ARRAY_SIZE(hsw_render_cmds) }, }; -static const struct drm_i915_cmd_table gen7_video_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { video_cmds, ARRAY_SIZE(video_cmds) }, +static const struct drm_i915_cmd_table gen7_video_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_video_cmds, ARRAY_SIZE(gen7_video_cmds) }, }; -static const struct drm_i915_cmd_table hsw_vebox_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { vecs_cmds, ARRAY_SIZE(vecs_cmds) }, +static const struct drm_i915_cmd_table hsw_vebox_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_vecs_cmds, ARRAY_SIZE(gen7_vecs_cmds) }, }; -static const struct drm_i915_cmd_table gen7_blt_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { blt_cmds, ARRAY_SIZE(blt_cmds) }, 
+static const struct drm_i915_cmd_table gen7_blt_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) }, }; -static const struct drm_i915_cmd_table hsw_blt_ring_cmds[] = { - { common_cmds, ARRAY_SIZE(common_cmds) }, - { blt_cmds, ARRAY_SIZE(blt_cmds) }, +static const struct drm_i915_cmd_table hsw_blt_ring_cmd_table[] = { + { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, + { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) }, { hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) }, }; +static const struct drm_i915_cmd_table gen9_blt_cmd_table[] = { + { gen9_blt_cmds, ARRAY_SIZE(gen9_blt_cmds) }, +}; + + /* * Register whitelists, sorted by increasing register offset. */ @@ -612,17 +664,27 @@ static const struct drm_i915_reg_descriptor gen7_blt_regs[] = { REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), }; -static const struct drm_i915_reg_descriptor ivb_master_regs[] = { - REG32(FORCEWAKE_MT), - REG32(DERRMR), - REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_A)), - REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_B)), - REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_C)), -}; - -static const struct drm_i915_reg_descriptor hsw_master_regs[] = { - REG32(FORCEWAKE_MT), - REG32(DERRMR), +static const struct drm_i915_reg_descriptor gen9_blt_regs[] = { + REG64_IDX(RING_TIMESTAMP, RENDER_RING_BASE), + REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE), + REG32(BCS_SWCTRL), + REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), + REG64_IDX(BCS_GPR, 0), + REG64_IDX(BCS_GPR, 1), + REG64_IDX(BCS_GPR, 2), + REG64_IDX(BCS_GPR, 3), + REG64_IDX(BCS_GPR, 4), + REG64_IDX(BCS_GPR, 5), + REG64_IDX(BCS_GPR, 6), + REG64_IDX(BCS_GPR, 7), + REG64_IDX(BCS_GPR, 8), + REG64_IDX(BCS_GPR, 9), + REG64_IDX(BCS_GPR, 10), + REG64_IDX(BCS_GPR, 11), + REG64_IDX(BCS_GPR, 12), + REG64_IDX(BCS_GPR, 13), + REG64_IDX(BCS_GPR, 14), + REG64_IDX(BCS_GPR, 15), }; #undef REG64 @@ -631,28 +693,27 @@ static const struct drm_i915_reg_descriptor hsw_master_regs[] = { struct drm_i915_reg_table { const struct drm_i915_reg_descriptor *regs; int 
num_regs; - bool master; }; static const struct drm_i915_reg_table ivb_render_reg_tables[] = { - { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false }, - { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true }, + { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) }, }; static const struct drm_i915_reg_table ivb_blt_reg_tables[] = { - { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false }, - { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true }, + { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) }, }; static const struct drm_i915_reg_table hsw_render_reg_tables[] = { - { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false }, - { hsw_render_regs, ARRAY_SIZE(hsw_render_regs), false }, - { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true }, + { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) }, + { hsw_render_regs, ARRAY_SIZE(hsw_render_regs) }, }; static const struct drm_i915_reg_table hsw_blt_reg_tables[] = { - { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false }, - { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true }, + { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) }, +}; + +static const struct drm_i915_reg_table gen9_blt_reg_tables[] = { + { gen9_blt_regs, ARRAY_SIZE(gen9_blt_regs) }, }; static u32 gen7_render_get_cmd_length_mask(u32 cmd_header) @@ -710,6 +771,17 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header) return 0; } +static u32 gen9_blt_get_cmd_length_mask(u32 cmd_header) +{ + u32 client = cmd_header >> INSTR_CLIENT_SHIFT; + + if (client == INSTR_MI_CLIENT || client == INSTR_BC_CLIENT) + return 0xFF; + + DRM_DEBUG_DRIVER("CMD: Abnormal blt cmd length! 
0x%08X\n", cmd_header); + return 0; +} + static bool validate_cmds_sorted(const struct intel_engine_cs *engine, const struct drm_i915_cmd_table *cmd_tables, int cmd_table_count) @@ -867,18 +939,19 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) int cmd_table_count; int ret; - if (!IS_GEN(engine->i915, 7)) + if (!IS_GEN(engine->i915, 7) && !(IS_GEN(engine->i915, 9) && + engine->class == COPY_ENGINE_CLASS)) return; switch (engine->class) { case RENDER_CLASS: if (IS_HASWELL(engine->i915)) { - cmd_tables = hsw_render_ring_cmds; + cmd_tables = hsw_render_ring_cmd_table; cmd_table_count = - ARRAY_SIZE(hsw_render_ring_cmds); + ARRAY_SIZE(hsw_render_ring_cmd_table); } else { - cmd_tables = gen7_render_cmds; - cmd_table_count = ARRAY_SIZE(gen7_render_cmds); + cmd_tables = gen7_render_cmd_table; + cmd_table_count = ARRAY_SIZE(gen7_render_cmd_table); } if (IS_HASWELL(engine->i915)) { @@ -888,36 +961,46 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) engine->reg_tables = ivb_render_reg_tables; engine->reg_table_count = ARRAY_SIZE(ivb_render_reg_tables); } - engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask; break; case VIDEO_DECODE_CLASS: - cmd_tables = gen7_video_cmds; - cmd_table_count = ARRAY_SIZE(gen7_video_cmds); + cmd_tables = gen7_video_cmd_table; + cmd_table_count = ARRAY_SIZE(gen7_video_cmd_table); engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; break; case COPY_ENGINE_CLASS: - if (IS_HASWELL(engine->i915)) { - cmd_tables = hsw_blt_ring_cmds; - cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds); + engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask; + if (IS_GEN(engine->i915, 9)) { + cmd_tables = gen9_blt_cmd_table; + cmd_table_count = ARRAY_SIZE(gen9_blt_cmd_table); + engine->get_cmd_length_mask = + gen9_blt_get_cmd_length_mask; + + /* BCS Engine unsafe without parser */ + engine->flags |= I915_ENGINE_REQUIRES_CMD_PARSER; + } else if (IS_HASWELL(engine->i915)) { + cmd_tables = 
hsw_blt_ring_cmd_table; + cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmd_table); } else { - cmd_tables = gen7_blt_cmds; - cmd_table_count = ARRAY_SIZE(gen7_blt_cmds); + cmd_tables = gen7_blt_cmd_table; + cmd_table_count = ARRAY_SIZE(gen7_blt_cmd_table); } - if (IS_HASWELL(engine->i915)) { + if (IS_GEN(engine->i915, 9)) { + engine->reg_tables = gen9_blt_reg_tables; + engine->reg_table_count = + ARRAY_SIZE(gen9_blt_reg_tables); + } else if (IS_HASWELL(engine->i915)) { engine->reg_tables = hsw_blt_reg_tables; engine->reg_table_count = ARRAY_SIZE(hsw_blt_reg_tables); } else { engine->reg_tables = ivb_blt_reg_tables; engine->reg_table_count = ARRAY_SIZE(ivb_blt_reg_tables); } - - engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask; break; case VIDEO_ENHANCEMENT_CLASS: - cmd_tables = hsw_vebox_cmds; - cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds); + cmd_tables = hsw_vebox_cmd_table; + cmd_table_count = ARRAY_SIZE(hsw_vebox_cmd_table); /* VECS can use the same length_mask function as VCS */ engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; break; @@ -943,7 +1026,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) return; } - engine->flags |= I915_ENGINE_NEEDS_CMD_PARSER; + engine->flags |= I915_ENGINE_USING_CMD_PARSER; } /** @@ -955,7 +1038,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) */ void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine) { - if (!intel_engine_needs_cmd_parser(engine)) + if (!intel_engine_using_cmd_parser(engine)) return; fini_hash_table(engine); @@ -1029,22 +1112,16 @@ __find_reg(const struct drm_i915_reg_descriptor *table, int count, u32 addr) } static const struct drm_i915_reg_descriptor * -find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr) +find_reg(const struct intel_engine_cs *engine, u32 addr) { const struct drm_i915_reg_table *table = engine->reg_tables; + const struct drm_i915_reg_descriptor *reg = NULL; int count = engine->reg_table_count; - for (; 
count > 0; ++table, --count) { - if (!table->master || is_master) { - const struct drm_i915_reg_descriptor *reg; + for (; !reg && (count > 0); ++table, --count) + reg = __find_reg(table->regs, table->num_regs, addr); - reg = __find_reg(table->regs, table->num_regs, addr); - if (reg != NULL) - return reg; - } - } - - return NULL; + return reg; } /* Returns a vmap'd pointer to dst_obj, which the caller must unmap */ @@ -1128,8 +1205,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, static bool check_cmd(const struct intel_engine_cs *engine, const struct drm_i915_cmd_descriptor *desc, - const u32 *cmd, u32 length, - const bool is_master) + const u32 *cmd, u32 length) { if (desc->flags & CMD_DESC_SKIP) return true; @@ -1139,12 +1215,6 @@ static bool check_cmd(const struct intel_engine_cs *engine, return false; } - if ((desc->flags & CMD_DESC_MASTER) && !is_master) { - DRM_DEBUG_DRIVER("CMD: Rejected master-only command: 0x%08X\n", - *cmd); - return false; - } - if (desc->flags & CMD_DESC_REGISTER) { /* * Get the distance between individual register offset @@ -1158,7 +1228,7 @@ static bool check_cmd(const struct intel_engine_cs *engine, offset += step) { const u32 reg_addr = cmd[offset] & desc->reg.mask; const struct drm_i915_reg_descriptor *reg = - find_reg(engine, is_master, reg_addr); + find_reg(engine, reg_addr); if (!reg) { DRM_DEBUG_DRIVER("CMD: Rejected register 0x%08X in command: 0x%08X (%s)\n", @@ -1236,16 +1306,112 @@ static bool check_cmd(const struct intel_engine_cs *engine, return true; } +static int check_bbstart(const struct i915_gem_context *ctx, + u32 *cmd, u32 offset, u32 length, + u32 batch_len, + u64 batch_start, + u64 shadow_batch_start) +{ + u64 jump_offset, jump_target; + u32 target_cmd_offset, target_cmd_index; + + /* For igt compatibility on older platforms */ + if (CMDPARSER_USES_GGTT(ctx->i915)) { + DRM_DEBUG("CMD: Rejecting BB_START for ggtt based submission\n"); + return -EACCES; + } + + if (length != 3) { + DRM_DEBUG("CMD: 
Recursive BB_START with bad length(%u)\n", + length); + return -EINVAL; + } + + jump_target = *(u64*)(cmd+1); + jump_offset = jump_target - batch_start; + + /* + * Any underflow of jump_target is guaranteed to be outside the range + * of a u32, so >= test catches both too large and too small + */ + if (jump_offset >= batch_len) { + DRM_DEBUG("CMD: BB_START to 0x%llx jumps out of BB\n", + jump_target); + return -EINVAL; + } + + /* + * This cannot overflow a u32 because we already checked jump_offset + * is within the BB, and the batch_len is a u32 + */ + target_cmd_offset = lower_32_bits(jump_offset); + target_cmd_index = target_cmd_offset / sizeof(u32); + + *(u64*)(cmd + 1) = shadow_batch_start + target_cmd_offset; + + if (target_cmd_index == offset) + return 0; + + if (ctx->jump_whitelist_cmds <= target_cmd_index) { + DRM_DEBUG("CMD: Rejecting BB_START - truncated whitelist array\n"); + return -EINVAL; + } else if (!test_bit(target_cmd_index, ctx->jump_whitelist)) { + DRM_DEBUG("CMD: BB_START to 0x%llx not a previously executed cmd\n", + jump_target); + return -EINVAL; + } + + return 0; +} + +static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len) +{ + const u32 batch_cmds = DIV_ROUND_UP(batch_len, sizeof(u32)); + const u32 exact_size = BITS_TO_LONGS(batch_cmds); + u32 next_size = BITS_TO_LONGS(roundup_pow_of_two(batch_cmds)); + unsigned long *next_whitelist; + + if (CMDPARSER_USES_GGTT(ctx->i915)) + return; + + if (batch_cmds <= ctx->jump_whitelist_cmds) { + bitmap_zero(ctx->jump_whitelist, batch_cmds); + return; + } + +again: + next_whitelist = kcalloc(next_size, sizeof(long), GFP_KERNEL); + if (next_whitelist) { + kfree(ctx->jump_whitelist); + ctx->jump_whitelist = next_whitelist; + ctx->jump_whitelist_cmds = + next_size * BITS_PER_BYTE * sizeof(long); + return; + } + + if (next_size > exact_size) { + next_size = exact_size; + goto again; + } + + DRM_DEBUG("CMD: Failed to extend whitelist. 
BB_START may be disallowed\n"); + bitmap_zero(ctx->jump_whitelist, ctx->jump_whitelist_cmds); + + return; +} + #define LENGTH_BIAS 2 /** * i915_parse_cmds() - parse a submitted batch buffer for privilege violations + * @ctx: the context in which the batch is to execute * @engine: the engine on which the batch is to execute * @batch_obj: the batch buffer in question - * @shadow_batch_obj: copy of the batch buffer in question + * @batch_start: Canonical base address of batch * @batch_start_offset: byte offset in the batch at which execution starts * @batch_len: length of the commands in batch_obj - * @is_master: is the submitting process the drm master? + * @shadow_batch_obj: copy of the batch buffer in question + * @shadow_batch_start: Canonical base address of shadow_batch_obj * * Parses the specified batch buffer looking for privilege violations as * described in the overview. @@ -1253,14 +1419,17 @@ static bool check_cmd(const struct intel_engine_cs *engine, * Return: non-zero if the parser finds violations or otherwise fails; -EACCES * if the batch appears legal but should use hardware parsing */ -int intel_engine_cmd_parser(struct intel_engine_cs *engine, + +int intel_engine_cmd_parser(struct i915_gem_context *ctx, + struct intel_engine_cs *engine, struct drm_i915_gem_object *batch_obj, - struct drm_i915_gem_object *shadow_batch_obj, + u64 batch_start, u32 batch_start_offset, u32 batch_len, - bool is_master) + struct drm_i915_gem_object *shadow_batch_obj, + u64 shadow_batch_start) { - u32 *cmd, *batch_end; + u32 *cmd, *batch_end, offset = 0; struct drm_i915_cmd_descriptor default_desc = noop_desc; const struct drm_i915_cmd_descriptor *desc = &default_desc; bool needs_clflush_after = false; @@ -1274,6 +1443,8 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, return PTR_ERR(cmd); } + init_whitelist(ctx, batch_len); + /* * We use the batch length as size because the shadow object is as * large or larger and copy_batch() will write MI_NOPs to the extra 
@@ -1283,31 +1454,15 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, do { u32 length; - if (*cmd == MI_BATCH_BUFFER_END) { - if (needs_clflush_after) { - void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping); - drm_clflush_virt_range(ptr, - (void *)(cmd + 1) - ptr); - } + if (*cmd == MI_BATCH_BUFFER_END) break; - } desc = find_cmd(engine, *cmd, desc, &default_desc); if (!desc) { DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n", *cmd); ret = -EINVAL; - break; - } - - /* - * If the batch buffer contains a chained batch, return an - * error that tells the caller to abort and dispatch the - * workload as a non-secure batch. - */ - if (desc->cmd.value == MI_BATCH_BUFFER_START) { - ret = -EACCES; - break; + goto err; } if (desc->flags & CMD_DESC_FIXED) @@ -1321,22 +1476,43 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, length, batch_end - cmd); ret = -EINVAL; + goto err; + } + + if (!check_cmd(engine, desc, cmd, length)) { + ret = -EACCES; + goto err; + } + + if (desc->cmd.value == MI_BATCH_BUFFER_START) { + ret = check_bbstart(ctx, cmd, offset, length, + batch_len, batch_start, + shadow_batch_start); + + if (ret) + goto err; break; } - if (!check_cmd(engine, desc, cmd, length, is_master)) { - ret = -EACCES; - break; - } + if (ctx->jump_whitelist_cmds > offset) + set_bit(offset, ctx->jump_whitelist); cmd += length; + offset += length; if (cmd >= batch_end) { DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n"); ret = -EINVAL; - break; + goto err; } } while (1); + if (needs_clflush_after) { + void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping); + + drm_clflush_virt_range(ptr, (void *)(cmd + 1) - ptr); + } + +err: i915_gem_object_unpin_map(shadow_batch_obj); return ret; } @@ -1357,7 +1533,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv) /* If the command parser is not enabled, report 0 - unsupported */ for_each_uabi_engine(engine, dev_priv) { - if (intel_engine_needs_cmd_parser(engine)) { 
+ if (intel_engine_using_cmd_parser(engine)) { active = true; break; } @@ -1382,6 +1558,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv) * the parser enabled. * 9. Don't whitelist or handle oacontrol specially, as ownership * for oacontrol state is moving to i915-perf. + * 10. Support for Gen9 BCS Parsing */ - return 9; + return 10; } diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index bb6f86c7067a..3d717e282908 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -364,9 +364,6 @@ static int i915_driver_modeset_probe(struct drm_device *dev) if (ret) goto cleanup_vga_client; - /* must happen before intel_power_domains_init_hw() on VLV/CHV */ - intel_update_rawclk(dev_priv); - intel_power_domains_init_hw(dev_priv, false); intel_csr_ucode_init(dev_priv); @@ -1850,6 +1847,8 @@ static int i915_drm_suspend_late(struct drm_device *dev, bool hibernation) i915_gem_suspend_late(dev_priv); + i915_rc6_ctx_wa_suspend(dev_priv); + intel_uncore_suspend(&dev_priv->uncore); intel_power_domains_suspend(dev_priv, @@ -2053,6 +2052,8 @@ static int i915_drm_resume_early(struct drm_device *dev) intel_power_domains_resume(dev_priv); + i915_rc6_ctx_wa_resume(dev_priv); + intel_gt_sanitize(&dev_priv->gt, true); enable_rpm_wakeref_asserts(&dev_priv->runtime_pm); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 953e1d12c23c..89b6112bd66b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -593,6 +593,8 @@ struct intel_rps { struct intel_rc6 { bool enabled; + bool ctx_corrupted; + intel_wakeref_t ctx_corrupted_wakeref; u64 prev_hw_residency[4]; u64 cur_residency[4]; }; @@ -2075,9 +2077,16 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915, #define VEBOX_MASK(dev_priv) \ ENGINE_INSTANCES_MASK(dev_priv, VECS0, I915_MAX_VECS) +/* + * The Gen7 cmdparser copies the scanned buffer to the ggtt for execution + * All later gens can run the 
final buffer from the ppgtt + */ +#define CMDPARSER_USES_GGTT(dev_priv) IS_GEN(dev_priv, 7) + #define HAS_LLC(dev_priv) (INTEL_INFO(dev_priv)->has_llc) #define HAS_SNOOP(dev_priv) (INTEL_INFO(dev_priv)->has_snoop) #define HAS_EDRAM(dev_priv) ((dev_priv)->edram_size_mb) +#define HAS_SECURE_BATCHES(dev_priv) (INTEL_GEN(dev_priv) < 6) #define HAS_WT(dev_priv) ((IS_HASWELL(dev_priv) || \ IS_BROADWELL(dev_priv)) && HAS_EDRAM(dev_priv)) @@ -2110,10 +2119,12 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915, /* Early gen2 have a totally busted CS tlb and require pinned batches. */ #define HAS_BROKEN_CS_TLB(dev_priv) (IS_I830(dev_priv) || IS_I845G(dev_priv)) +#define NEEDS_RC6_CTX_CORRUPTION_WA(dev_priv) \ + (IS_BROADWELL(dev_priv) || IS_GEN(dev_priv, 9)) + /* WaRsDisableCoarsePowerGating:skl,cnl */ #define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \ - (IS_CANNONLAKE(dev_priv) || \ - IS_SKL_GT3(dev_priv) || IS_SKL_GT4(dev_priv)) + (IS_CANNONLAKE(dev_priv) || IS_GEN(dev_priv, 9)) #define HAS_GMBUS_IRQ(dev_priv) (INTEL_GEN(dev_priv) >= 4) #define HAS_GMBUS_BURST_READ(dev_priv) (INTEL_GEN(dev_priv) >= 10 || \ @@ -2284,6 +2295,14 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, unsigned long flags); #define I915_GEM_OBJECT_UNBIND_ACTIVE BIT(0) +struct i915_vma * __must_check +i915_gem_object_pin(struct drm_i915_gem_object *obj, + struct i915_address_space *vm, + const struct i915_ggtt_view *view, + u64 size, + u64 alignment, + u64 flags); + void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv); static inline int __must_check @@ -2393,12 +2412,14 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type); int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv); void intel_engine_init_cmd_parser(struct intel_engine_cs *engine); void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine); -int intel_engine_cmd_parser(struct intel_engine_cs *engine, +int intel_engine_cmd_parser(struct i915_gem_context *cxt, + 
struct intel_engine_cs *engine, struct drm_i915_gem_object *batch_obj, - struct drm_i915_gem_object *shadow_batch_obj, + u64 user_batch_start, u32 batch_start_offset, u32 batch_len, - bool is_master); + struct drm_i915_gem_object *shadow_batch_obj, + u64 shadow_batch_start); /* intel_device_info.c */ static inline struct intel_device_info * diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d0f94f239919..98305d987ac1 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -964,6 +964,20 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, { struct drm_i915_private *dev_priv = to_i915(obj->base.dev); struct i915_address_space *vm = &dev_priv->ggtt.vm; + + return i915_gem_object_pin(obj, vm, view, size, alignment, + flags | PIN_GLOBAL); +} + +struct i915_vma * +i915_gem_object_pin(struct drm_i915_gem_object *obj, + struct i915_address_space *vm, + const struct i915_ggtt_view *view, + u64 size, + u64 alignment, + u64 flags) +{ + struct drm_i915_private *dev_priv = to_i915(obj->base.dev); struct i915_vma *vma; int ret; @@ -1038,7 +1052,7 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, return ERR_PTR(ret); } - ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL); + ret = i915_vma_pin(vma, size, alignment, flags); if (ret) return ERR_PTR(ret); diff --git a/drivers/gpu/drm/i915/i915_getparam.c b/drivers/gpu/drm/i915/i915_getparam.c index 5d9101376a3d..9f1517af5b7f 100644 --- a/drivers/gpu/drm/i915/i915_getparam.c +++ b/drivers/gpu/drm/i915/i915_getparam.c @@ -62,7 +62,7 @@ int i915_getparam_ioctl(struct drm_device *dev, void *data, value = !!(i915->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES); break; case I915_PARAM_HAS_SECURE_BATCHES: - value = capable(CAP_SYS_ADMIN); + value = HAS_SECURE_BATCHES(i915) && capable(CAP_SYS_ADMIN); break; case I915_PARAM_CMD_PARSER_VERSION: value = i915_cmd_parser_get_version(i915); diff --git a/drivers/gpu/drm/i915/i915_reg.h 
b/drivers/gpu/drm/i915/i915_reg.h index 2abd199093c5..f8ee9aba3955 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -471,6 +471,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define ECOCHK_PPGTT_WT_HSW (0x2 << 3) #define ECOCHK_PPGTT_WB_HSW (0x3 << 3) +#define GEN8_RC6_CTX_INFO _MMIO(0x8504) + #define GAC_ECO_BITS _MMIO(0x14090) #define ECOBITS_SNB_BIT (1 << 13) #define ECOBITS_PPGTT_CACHE64B (3 << 8) @@ -555,6 +557,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) */ #define BCS_SWCTRL _MMIO(0x22200) +/* There are 16 GPR registers */ +#define BCS_GPR(n) _MMIO(0x22600 + (n) * 8) +#define BCS_GPR_UDW(n) _MMIO(0x22600 + (n) * 8 + 4) + #define GPGPU_THREADS_DISPATCHED _MMIO(0x2290) #define GPGPU_THREADS_DISPATCHED_UDW _MMIO(0x2290 + 4) #define HS_INVOCATION_COUNT _MMIO(0x2300) @@ -7211,6 +7217,10 @@ enum { #define TGL_DMC_DEBUG_DC5_COUNT _MMIO(0x101084) #define TGL_DMC_DEBUG_DC6_COUNT _MMIO(0x101088) +/* Display Internal Timeout Register */ +#define RM_TIMEOUT _MMIO(0x42060) +#define MMIO_TIMEOUT_US(us) ((us) << 0) + /* interrupts */ #define DE_MASTER_IRQ_CONTROL (1 << 31) #define DE_SPRITEB_FLIP_DONE (1 << 29) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 75ee027abb80..2efe1d12d5a9 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -126,6 +126,14 @@ static void bxt_init_clock_gating(struct drm_i915_private *dev_priv) */ I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) | PWM1_GATING_DIS | PWM2_GATING_DIS); + + /* + * Lower the display internal timeout. + * This is needed to avoid any hard hangs when DSI port PLL + * is off and a MMIO access is attempted by any privilege + * application, using batch buffers or any other means. 
+ */ + I915_WRITE(RM_TIMEOUT, MMIO_TIMEOUT_US(950)); } static void glk_init_clock_gating(struct drm_i915_private *dev_priv) @@ -8544,6 +8552,100 @@ static void intel_init_emon(struct drm_i915_private *dev_priv) dev_priv->ips.corr = (lcfuse & LCFUSE_HIV_MASK); } +static bool i915_rc6_ctx_corrupted(struct drm_i915_private *dev_priv) +{ + return !I915_READ(GEN8_RC6_CTX_INFO); +} + +static void i915_rc6_ctx_wa_init(struct drm_i915_private *i915) +{ + if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915)) + return; + + if (i915_rc6_ctx_corrupted(i915)) { + DRM_INFO("RC6 context corrupted, disabling runtime power management\n"); + i915->gt_pm.rc6.ctx_corrupted = true; + i915->gt_pm.rc6.ctx_corrupted_wakeref = + intel_runtime_pm_get(&i915->runtime_pm); + } +} + +static void i915_rc6_ctx_wa_cleanup(struct drm_i915_private *i915) +{ + if (i915->gt_pm.rc6.ctx_corrupted) { + intel_runtime_pm_put(&i915->runtime_pm, + i915->gt_pm.rc6.ctx_corrupted_wakeref); + i915->gt_pm.rc6.ctx_corrupted = false; + } +} + +/** + * i915_rc6_ctx_wa_suspend - system suspend sequence for the RC6 CTX WA + * @i915: i915 device + * + * Perform any steps needed to clean up the RC6 CTX WA before system suspend. + */ +void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915) +{ + if (i915->gt_pm.rc6.ctx_corrupted) + intel_runtime_pm_put(&i915->runtime_pm, + i915->gt_pm.rc6.ctx_corrupted_wakeref); +} + +/** + * i915_rc6_ctx_wa_resume - system resume sequence for the RC6 CTX WA + * @i915: i915 device + * + * Perform any steps needed to re-init the RC6 CTX WA after system resume. 
+ */ +void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915) +{ + if (!i915->gt_pm.rc6.ctx_corrupted) + return; + + if (i915_rc6_ctx_corrupted(i915)) { + i915->gt_pm.rc6.ctx_corrupted_wakeref = + intel_runtime_pm_get(&i915->runtime_pm); + return; + } + + DRM_INFO("RC6 context restored, re-enabling runtime power management\n"); + i915->gt_pm.rc6.ctx_corrupted = false; +} + +static void intel_disable_rc6(struct drm_i915_private *dev_priv); + +/** + * i915_rc6_ctx_wa_check - check for a new RC6 CTX corruption + * @i915: i915 device + * + * Check if an RC6 CTX corruption has happened since the last check and if so + * disable RC6 and runtime power management. + * + * Return false if no context corruption has happened since the last call of + * this function, true otherwise. +*/ +bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915) +{ + if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915)) + return false; + + if (i915->gt_pm.rc6.ctx_corrupted) + return false; + + if (!i915_rc6_ctx_corrupted(i915)) + return false; + + DRM_NOTE("RC6 context corruption, disabling runtime power management\n"); + + intel_disable_rc6(i915); + i915->gt_pm.rc6.ctx_corrupted = true; + i915->gt_pm.rc6.ctx_corrupted_wakeref = + intel_runtime_pm_get_noresume(&i915->runtime_pm); + + return true; +} + void intel_init_gt_powersave(struct drm_i915_private *dev_priv) { struct intel_rps *rps = &dev_priv->gt_pm.rps; @@ -8557,6 +8659,8 @@ void intel_init_gt_powersave(struct drm_i915_private *dev_priv) pm_runtime_get(&dev_priv->drm.pdev->dev); } + i915_rc6_ctx_wa_init(dev_priv); + /* Initialize RPS limits (for userspace) */ if (IS_CHERRYVIEW(dev_priv)) cherryview_init_gt_powersave(dev_priv); @@ -8595,6 +8699,8 @@ void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv) if (IS_VALLEYVIEW(dev_priv)) valleyview_cleanup_gt_powersave(dev_priv); + i915_rc6_ctx_wa_cleanup(dev_priv); + if (!HAS_RC6(dev_priv)) pm_runtime_put(&dev_priv->drm.pdev->dev); } @@ -8623,7 +8729,7 @@ static inline void 
intel_disable_llc_pstate(struct drm_i915_private *i915) i915->gt_pm.llc_pstate.enabled = false; } -static void intel_disable_rc6(struct drm_i915_private *dev_priv) +static void __intel_disable_rc6(struct drm_i915_private *dev_priv) { lockdep_assert_held(&dev_priv->gt_pm.rps.lock); @@ -8642,6 +8748,15 @@ static void intel_disable_rc6(struct drm_i915_private *dev_priv) dev_priv->gt_pm.rc6.enabled = false; } +static void intel_disable_rc6(struct drm_i915_private *dev_priv) +{ + struct intel_rps *rps = &dev_priv->gt_pm.rps; + + mutex_lock(&rps->lock); + __intel_disable_rc6(dev_priv); + mutex_unlock(&rps->lock); +} + static void intel_disable_rps(struct drm_i915_private *dev_priv) { lockdep_assert_held(&dev_priv->gt_pm.rps.lock); @@ -8667,7 +8782,7 @@ void intel_disable_gt_powersave(struct drm_i915_private *dev_priv) { mutex_lock(&dev_priv->gt_pm.rps.lock); - intel_disable_rc6(dev_priv); + __intel_disable_rc6(dev_priv); intel_disable_rps(dev_priv); if (HAS_LLC(dev_priv)) intel_disable_llc_pstate(dev_priv); @@ -8694,6 +8809,9 @@ static void intel_enable_rc6(struct drm_i915_private *dev_priv) if (dev_priv->gt_pm.rc6.enabled) return; + if (dev_priv->gt_pm.rc6.ctx_corrupted) + return; + if (IS_CHERRYVIEW(dev_priv)) cherryview_enable_rc6(dev_priv); else if (IS_VALLEYVIEW(dev_priv)) diff --git a/drivers/gpu/drm/i915/intel_pm.h b/drivers/gpu/drm/i915/intel_pm.h index e3573e1e16e3..0f7390c850ec 100644 --- a/drivers/gpu/drm/i915/intel_pm.h +++ b/drivers/gpu/drm/i915/intel_pm.h @@ -36,6 +36,9 @@ void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv); void intel_sanitize_gt_powersave(struct drm_i915_private *dev_priv); void intel_enable_gt_powersave(struct drm_i915_private *dev_priv); void intel_disable_gt_powersave(struct drm_i915_private *dev_priv); +bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915); +void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915); +void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915); void gen6_rps_busy(struct 
drm_i915_private *dev_priv); void gen6_rps_idle(struct drm_i915_private *dev_priv); void gen6_rps_boost(struct i915_request *rq); diff --git a/drivers/gpu/drm/sun4i/sun4i_tcon.c b/drivers/gpu/drm/sun4i/sun4i_tcon.c index 04c721d0d3b9..b89439ed210d 100644 --- a/drivers/gpu/drm/sun4i/sun4i_tcon.c +++ b/drivers/gpu/drm/sun4i/sun4i_tcon.c @@ -488,7 +488,7 @@ static void sun4i_tcon0_mode_set_rgb(struct sun4i_tcon *tcon, WARN_ON(!tcon->quirks->has_channel_0); - tcon->dclk_min_div = 6; + tcon->dclk_min_div = 1; tcon->dclk_max_div = 127; sun4i_tcon0_mode_set_common(tcon, mode); diff --git a/drivers/hwtracing/intel_th/gth.c b/drivers/hwtracing/intel_th/gth.c index fa9d34af87ac..f72803a02391 100644 --- a/drivers/hwtracing/intel_th/gth.c +++ b/drivers/hwtracing/intel_th/gth.c @@ -626,6 +626,9 @@ static void intel_th_gth_switch(struct intel_th_device *thdev, if (!count) dev_dbg(&thdev->dev, "timeout waiting for CTS Trigger\n"); + /* De-assert the trigger */ + iowrite32(0, gth->base + REG_CTS_CTL); + intel_th_gth_stop(gth, output, false); intel_th_gth_start(gth, output); } diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index fc9f15f36ad4..6d240dfae9d9 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -164,7 +164,7 @@ struct msc { }; static LIST_HEAD(msu_buffer_list); -static struct mutex msu_buffer_mutex; +static DEFINE_MUTEX(msu_buffer_mutex); /** * struct msu_buffer_entry - internal MSU buffer bookkeeping @@ -327,7 +327,7 @@ static size_t msc_win_total_sz(struct msc_window *win) struct msc_block_desc *bdesc = sg_virt(sg); if (msc_block_wrapped(bdesc)) - return win->nr_blocks << PAGE_SHIFT; + return (size_t)win->nr_blocks << PAGE_SHIFT; size += msc_total_sz(bdesc); if (msc_block_last_written(bdesc)) @@ -1848,9 +1848,14 @@ mode_store(struct device *dev, struct device_attribute *attr, const char *buf, len = cp - buf; mode = kstrndup(buf, len, GFP_KERNEL); + if (!mode) + return -ENOMEM; + i = 
match_string(msc_mode, ARRAY_SIZE(msc_mode), mode); - if (i >= 0) + if (i >= 0) { + kfree(mode); goto found; + } /* Buffer sinks only work with a usable IRQ */ if (!msc->do_irq) { diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index 91dfeba62485..03ca5b1bef9f 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -199,6 +199,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x02a6), .driver_data = (kernel_ulong_t)&intel_th_2x, }, + { + /* Comet Lake PCH */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x06a6), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, { /* Ice Lake NNPI */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x45c5), @@ -209,6 +214,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa0a6), .driver_data = (kernel_ulong_t)&intel_th_2x, }, + { + /* Jasper Lake PCH */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x4da6), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, { 0 }, }; diff --git a/drivers/iio/adc/stm32-adc.c b/drivers/iio/adc/stm32-adc.c index 663f8a5012d6..73aee5949b6b 100644 --- a/drivers/iio/adc/stm32-adc.c +++ b/drivers/iio/adc/stm32-adc.c @@ -1399,7 +1399,7 @@ static int stm32_adc_dma_start(struct iio_dev *indio_dev) cookie = dmaengine_submit(desc); ret = dma_submit_error(cookie); if (ret) { - dmaengine_terminate_all(adc->dma_chan); + dmaengine_terminate_sync(adc->dma_chan); return ret; } @@ -1477,7 +1477,7 @@ static void __stm32_adc_buffer_predisable(struct iio_dev *indio_dev) stm32_adc_conv_irq_disable(adc); if (adc->dma_chan) - dmaengine_terminate_all(adc->dma_chan); + dmaengine_terminate_sync(adc->dma_chan); if (stm32_adc_set_trig(indio_dev, NULL)) dev_err(&indio_dev->dev, "Can't clear trigger\n"); diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c index b99d73887c9f..8743b2f376e2 100644 --- a/drivers/iio/imu/adis16480.c +++ b/drivers/iio/imu/adis16480.c @@ -317,8 +317,11 @@ 
static int adis16480_set_freq(struct iio_dev *indio_dev, int val, int val2) struct adis16480 *st = iio_priv(indio_dev); unsigned int t, reg; + if (val < 0 || val2 < 0) + return -EINVAL; + t = val * 1000 + val2 / 1000; - if (t <= 0) + if (t == 0) return -EINVAL; /* diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c index b17f060b52fc..868281b8adb0 100644 --- a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c +++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c @@ -114,54 +114,63 @@ static const struct inv_mpu6050_hw hw_info[] = { .name = "MPU6050", .reg = ®_set_6050, .config = &chip_config_6050, + .fifo_size = 1024, }, { .whoami = INV_MPU6500_WHOAMI_VALUE, .name = "MPU6500", .reg = ®_set_6500, .config = &chip_config_6050, + .fifo_size = 512, }, { .whoami = INV_MPU6515_WHOAMI_VALUE, .name = "MPU6515", .reg = ®_set_6500, .config = &chip_config_6050, + .fifo_size = 512, }, { .whoami = INV_MPU6000_WHOAMI_VALUE, .name = "MPU6000", .reg = ®_set_6050, .config = &chip_config_6050, + .fifo_size = 1024, }, { .whoami = INV_MPU9150_WHOAMI_VALUE, .name = "MPU9150", .reg = ®_set_6050, .config = &chip_config_6050, + .fifo_size = 1024, }, { .whoami = INV_MPU9250_WHOAMI_VALUE, .name = "MPU9250", .reg = ®_set_6500, .config = &chip_config_6050, + .fifo_size = 512, }, { .whoami = INV_MPU9255_WHOAMI_VALUE, .name = "MPU9255", .reg = ®_set_6500, .config = &chip_config_6050, + .fifo_size = 512, }, { .whoami = INV_ICM20608_WHOAMI_VALUE, .name = "ICM20608", .reg = ®_set_6500, .config = &chip_config_6050, + .fifo_size = 512, }, { .whoami = INV_ICM20602_WHOAMI_VALUE, .name = "ICM20602", .reg = ®_set_icm20602, .config = &chip_config_6050, + .fifo_size = 1008, }, }; diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h index db1c6904388b..51235677c534 100644 --- a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h +++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h @@ -100,12 +100,14 @@ struct inv_mpu6050_chip_config { * 
@name: name of the chip. * @reg: register map of the chip. * @config: configuration of the chip. + * @fifo_size: size of the FIFO in bytes. */ struct inv_mpu6050_hw { u8 whoami; u8 *name; const struct inv_mpu6050_reg_map *reg; const struct inv_mpu6050_chip_config *config; + size_t fifo_size; }; /* diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c index 5f9a5de0bab4..72d8c5790076 100644 --- a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c +++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c @@ -180,9 +180,6 @@ irqreturn_t inv_mpu6050_read_fifo(int irq, void *p) "failed to ack interrupt\n"); goto flush_fifo; } - /* handle fifo overflow by reseting fifo */ - if (int_status & INV_MPU6050_BIT_FIFO_OVERFLOW_INT) - goto flush_fifo; if (!(int_status & INV_MPU6050_BIT_RAW_DATA_RDY_INT)) { dev_warn(regmap_get_device(st->map), "spurious interrupt with status 0x%x\n", int_status); @@ -211,6 +208,18 @@ irqreturn_t inv_mpu6050_read_fifo(int irq, void *p) if (result) goto end_session; fifo_count = get_unaligned_be16(&data[0]); + + /* + * Handle fifo overflow by resetting fifo. + * Reset if there is only 3 data set free remaining to mitigate + * possible delay between reading fifo count and fifo data. 
+ */ + nb = 3 * bytes_per_datum; + if (fifo_count >= st->hw->fifo_size - nb) { + dev_warn(regmap_get_device(st->map), "fifo overflow reset\n"); + goto flush_fifo; + } + /* compute and process all complete datum */ nb = fifo_count / bytes_per_datum; inv_mpu6050_update_period(st, pf->timestamp, nb); diff --git a/drivers/iio/proximity/srf04.c b/drivers/iio/proximity/srf04.c index 8b50d56b0a03..01eb8cc63076 100644 --- a/drivers/iio/proximity/srf04.c +++ b/drivers/iio/proximity/srf04.c @@ -110,7 +110,7 @@ static int srf04_read(struct srf04_data *data) udelay(data->cfg->trigger_pulse_us); gpiod_set_value(data->gpiod_trig, 0); - /* it cannot take more than 20 ms */ + /* it should not take more than 20 ms until echo is rising */ ret = wait_for_completion_killable_timeout(&data->rising, HZ/50); if (ret < 0) { mutex_unlock(&data->lock); @@ -120,7 +120,8 @@ static int srf04_read(struct srf04_data *data) return -ETIMEDOUT; } - ret = wait_for_completion_killable_timeout(&data->falling, HZ/50); + /* it cannot take more than 50 ms until echo is falling */ + ret = wait_for_completion_killable_timeout(&data->falling, HZ/20); if (ret < 0) { mutex_unlock(&data->lock); return ret; @@ -135,19 +136,19 @@ static int srf04_read(struct srf04_data *data) dt_ns = ktime_to_ns(ktime_dt); /* - * measuring more than 3 meters is beyond the capabilities of - * the sensor + * measuring more than 6,45 meters is beyond the capabilities of + * the supported sensors * ==> filter out invalid results for not measuring echos of * another us sensor * * formula: - * distance 3 m - * time = ---------- = --------- = 9404389 ns - * speed 319 m/s + * distance 6,45 * 2 m + * time = ---------- = ------------ = 40438871 ns + * speed 319 m/s * * using a minimum speed at -20 °C of 319 m/s */ - if (dt_ns > 9404389) + if (dt_ns > 40438871) return -EIO; time_ns = dt_ns; @@ -159,20 +160,20 @@ static int srf04_read(struct srf04_data *data) * with Temp in °C * and speed in m/s * - * use 343 m/s as ultrasonic speed at 20 
°C here in absence of the + * use 343,5 m/s as ultrasonic speed at 20 °C here in absence of the * temperature * * therefore: - * time 343 - * distance = ------ * ----- - * 10^6 2 + * time 343,5 time * 106 + * distance = ------ * ------- = ------------ + * 10^6 2 617176 * with time in ns * and distance in mm (one way) * - * because we limit to 3 meters the multiplication with 343 just + * because we limit to 6,45 meters the multiplication with 106 just * fits into 32 bit */ - distance_mm = time_ns * 343 / 2000000; + distance_mm = time_ns * 106 / 617176; return distance_mm; } diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 71cb9525c074..26b792bb1027 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1489,7 +1489,6 @@ static int __init hfi1_mod_init(void) goto bail_dev; } - hfi1_compute_tid_rdma_flow_wt(); /* * These must be called before the driver is registered with * the PCI subsystem. diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 61aa5504d7c3..61362bd6d3ce 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -319,7 +319,9 @@ int pcie_speeds(struct hfi1_devdata *dd) /* * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed */ - if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) { + if (parent && + (dd->pcidev->bus->max_bus_speed == PCIE_SPEED_2_5GT || + dd->pcidev->bus->max_bus_speed == PCIE_SPEED_5_0GT)) { dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n"); dd->link_gen3_capable = 0; } diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 513a8aac9ccd..1a3c647675a7 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2209,15 +2209,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, if (qp->s_flags & RVT_S_WAIT_RNR) goto bail_stop; rdi = ib_to_rvt(qp->ibqp.device); - if (qp->s_rnr_retry 
== 0 && - !((rdi->post_parms[wqe->wr.opcode].flags & - RVT_OPERATION_IGN_RNR_CNT) && - qp->s_rnr_retry_cnt == 0)) { - status = IB_WC_RNR_RETRY_EXC_ERR; - goto class_b; + if (!(rdi->post_parms[wqe->wr.opcode].flags & + RVT_OPERATION_IGN_RNR_CNT)) { + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) + qp->s_rnr_retry--; } - if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) - qp->s_rnr_retry--; /* * The last valid PSN is the previous PSN. For TID RDMA WRITE diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index f21fca3617d5..e53f542b60af 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -107,8 +107,6 @@ static u32 mask_generation(u32 a) * C - Capcode */ -static u32 tid_rdma_flow_wt; - static void tid_rdma_trigger_resume(struct work_struct *work); static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, @@ -136,6 +134,26 @@ static void update_r_next_psn_fecn(struct hfi1_packet *packet, struct tid_rdma_flow *flow, bool fecn); +static void validate_r_tid_ack(struct hfi1_qp_priv *priv) +{ + if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) + priv->r_tid_ack = priv->r_tid_tail; +} + +static void tid_rdma_schedule_ack(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + priv->s_flags |= RVT_S_ACK_PENDING; + hfi1_schedule_tid_send(qp); +} + +static void tid_rdma_trigger_ack(struct rvt_qp *qp) +{ + validate_r_tid_ack(qp->priv); + tid_rdma_schedule_ack(qp); +} + static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { return @@ -3005,10 +3023,7 @@ nak_psn: qpriv->s_nak_state = IB_NAK_PSN_ERROR; /* We are NAK'ing the next expected PSN */ qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn); - qpriv->s_flags |= RVT_S_ACK_PENDING; - if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID) - qpriv->r_tid_ack 
= qpriv->r_tid_tail; - hfi1_schedule_tid_send(qp); + tid_rdma_trigger_ack(qp); } goto unlock; } @@ -3371,18 +3386,17 @@ u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); } -void hfi1_compute_tid_rdma_flow_wt(void) +static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp) { /* * Heuristic for computing the RNR timeout when waiting on the flow * queue. Rather than a computationaly expensive exact estimate of when * a flow will be available, we assume that if a QP is at position N in * the flow queue it has to wait approximately (N + 1) * (number of - * segments between two sync points), assuming PMTU of 4K. The rationale - * for this is that flows are released and recycled at each sync point. + * segments between two sync points). The rationale for this is that + * flows are released and recycled at each sync point. */ - tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) / - TID_RDMA_MAX_SEGMENT_SIZE; + return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT; } static u32 position_in_queue(struct hfi1_qp_priv *qpriv, @@ -3505,7 +3519,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); if (ret) { - to_seg = tid_rdma_flow_wt * + to_seg = hfi1_compute_tid_rdma_flow_wt(qp) * position_in_queue(qpriv, &rcd->flow_queue); break; @@ -3526,7 +3540,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) /* * If overtaking req->acked_tail, send an RNR NAK. 
Because the * QP is not queued in this case, and the issue can only be - * caused due a delay in scheduling the second leg which we + * caused by a delay in scheduling the second leg which we * cannot estimate, we use a rather arbitrary RNR timeout of * (MAX_FLOWS / 2) segments */ @@ -3534,8 +3548,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) MAX_FLOWS)) { ret = -EAGAIN; to_seg = MAX_FLOWS >> 1; - qpriv->s_flags |= RVT_S_ACK_PENDING; - hfi1_schedule_tid_send(qp); + tid_rdma_trigger_ack(qp); break; } @@ -4335,8 +4348,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, req); trace_hfi1_tid_write_rsp_rcv_data(qp); - if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) - priv->r_tid_ack = priv->r_tid_tail; + validate_r_tid_ack(priv); if (opcode == TID_OP(WRITE_DATA_LAST)) { release_rdma_sge_mr(e); @@ -4375,8 +4387,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) } done: - priv->s_flags |= RVT_S_ACK_PENDING; - hfi1_schedule_tid_send(qp); + tid_rdma_schedule_ack(qp); exit: priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; if (fecn) @@ -4388,10 +4399,7 @@ send_nak: if (!priv->s_nak_state) { priv->s_nak_state = IB_NAK_PSN_ERROR; priv->s_nak_psn = flow->flow_state.r_next_psn; - priv->s_flags |= RVT_S_ACK_PENDING; - if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) - priv->r_tid_ack = priv->r_tid_tail; - hfi1_schedule_tid_send(qp); + tid_rdma_trigger_ack(qp); } goto done; } @@ -4939,8 +4947,7 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) qpriv->resync = true; /* RESYNC request always gets a TID RDMA ACK. 
*/ qpriv->s_nak_state = 0; - qpriv->s_flags |= RVT_S_ACK_PENDING; - hfi1_schedule_tid_send(qp); + tid_rdma_trigger_ack(qp); bail: if (fecn) qp->s_flags |= RVT_S_ECN; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 1c536185261e..6e82df2190b7 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -17,6 +17,7 @@ #define TID_RDMA_MIN_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ #define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ #define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT) +#define TID_RDMA_SEGMENT_SHIFT 18 /* * Bit definitions for priv->s_flags. @@ -274,8 +275,6 @@ u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len); -void hfi1_compute_tid_rdma_flow_wt(void); - void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet); u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h index 86783276fb1f..3bb8f78fb7b0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.h +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -59,7 +59,7 @@ enum { #define HNS_ROCE_HEM_CHUNK_LEN \ ((256 - sizeof(struct list_head) - 2 * sizeof(int)) / \ - (sizeof(struct scatterlist))) + (sizeof(struct scatterlist) + sizeof(void *))) #define check_whether_bt_num_3(type, hop_num) \ (type < HEM_TYPE_MTT && hop_num == 2) diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 9591457eb768..43ea2c13b212 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -376,7 +376,7 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, srq->max = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1); srq->max_gs = srq_init_attr->attr.max_sge; - srq_desc_size = max(16, 16 * srq->max_gs); + srq_desc_size = 
roundup_pow_of_two(max(16, 16 * srq->max_gs)); srq->wqe_shift = ilog2(srq_desc_size); diff --git a/drivers/input/ff-memless.c b/drivers/input/ff-memless.c index 1cb40c7475af..8229a9006917 100644 --- a/drivers/input/ff-memless.c +++ b/drivers/input/ff-memless.c @@ -489,6 +489,15 @@ static void ml_ff_destroy(struct ff_device *ff) { struct ml_device *ml = ff->private; + /* + * Even though we stop all playing effects when tearing down + * an input device (via input_device_flush() that calls into + * input_ff_flush() that stops and erases all effects), we + * do not actually stop the timer, and therefore we should + * do it here. + */ + del_timer_sync(&ml->timer); + kfree(ml->private); } diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 56fae3472114..704558d449a2 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -177,6 +177,7 @@ static const char * const smbus_pnp_ids[] = { "LEN0096", /* X280 */ "LEN0097", /* X280 -> ALPS trackpoint */ "LEN009b", /* T580 */ + "LEN0402", /* X1 Extreme 2nd Generation */ "LEN200f", /* T450s */ "LEN2054", /* E480 */ "LEN2055", /* E580 */ diff --git a/drivers/input/rmi4/rmi_f11.c b/drivers/input/rmi4/rmi_f11.c index f28a7158b2ef..bbf9ae9f3f0c 100644 --- a/drivers/input/rmi4/rmi_f11.c +++ b/drivers/input/rmi4/rmi_f11.c @@ -510,7 +510,6 @@ struct f11_data { struct rmi_2d_sensor_platform_data sensor_pdata; unsigned long *abs_mask; unsigned long *rel_mask; - unsigned long *result_bits; }; enum f11_finger_state { @@ -1057,7 +1056,7 @@ static int rmi_f11_initialize(struct rmi_function *fn) /* ** init instance data, fill in values and create any sysfs files */ - f11 = devm_kzalloc(&fn->dev, sizeof(struct f11_data) + mask_size * 3, + f11 = devm_kzalloc(&fn->dev, sizeof(struct f11_data) + mask_size * 2, GFP_KERNEL); if (!f11) return -ENOMEM; @@ -1076,8 +1075,6 @@ static int rmi_f11_initialize(struct rmi_function *fn) + sizeof(struct f11_data)); f11->rel_mask = (unsigned long 
*)((char *)f11 + sizeof(struct f11_data) + mask_size); - f11->result_bits = (unsigned long *)((char *)f11 - + sizeof(struct f11_data) + mask_size * 2); set_bit(fn->irq_pos, f11->abs_mask); set_bit(fn->irq_pos + 1, f11->rel_mask); @@ -1284,8 +1281,8 @@ static irqreturn_t rmi_f11_attention(int irq, void *ctx) valid_bytes = f11->sensor.attn_size; memcpy(f11->sensor.data_pkt, drvdata->attn_data.data, valid_bytes); - drvdata->attn_data.data += f11->sensor.attn_size; - drvdata->attn_data.size -= f11->sensor.attn_size; + drvdata->attn_data.data += valid_bytes; + drvdata->attn_data.size -= valid_bytes; } else { error = rmi_read_block(rmi_dev, data_base_addr, f11->sensor.data_pkt, diff --git a/drivers/input/rmi4/rmi_f12.c b/drivers/input/rmi4/rmi_f12.c index d20a5d6780d1..7e97944f7616 100644 --- a/drivers/input/rmi4/rmi_f12.c +++ b/drivers/input/rmi4/rmi_f12.c @@ -55,6 +55,9 @@ struct f12_data { const struct rmi_register_desc_item *data15; u16 data15_offset; + + unsigned long *abs_mask; + unsigned long *rel_mask; }; static int rmi_f12_read_sensor_tuning(struct f12_data *f12) @@ -209,8 +212,8 @@ static irqreturn_t rmi_f12_attention(int irq, void *ctx) valid_bytes = sensor->attn_size; memcpy(sensor->data_pkt, drvdata->attn_data.data, valid_bytes); - drvdata->attn_data.data += sensor->attn_size; - drvdata->attn_data.size -= sensor->attn_size; + drvdata->attn_data.data += valid_bytes; + drvdata->attn_data.size -= valid_bytes; } else { retval = rmi_read_block(rmi_dev, f12->data_addr, sensor->data_pkt, sensor->pkt_size); @@ -291,9 +294,18 @@ static int rmi_f12_write_control_regs(struct rmi_function *fn) static int rmi_f12_config(struct rmi_function *fn) { struct rmi_driver *drv = fn->rmi_dev->driver; + struct f12_data *f12 = dev_get_drvdata(&fn->dev); + struct rmi_2d_sensor *sensor; int ret; - drv->set_irq_bits(fn->rmi_dev, fn->irq_mask); + sensor = &f12->sensor; + + if (!sensor->report_abs) + drv->clear_irq_bits(fn->rmi_dev, f12->abs_mask); + else + 
drv->set_irq_bits(fn->rmi_dev, f12->abs_mask); + + drv->clear_irq_bits(fn->rmi_dev, f12->rel_mask); ret = rmi_f12_write_control_regs(fn); if (ret) @@ -315,9 +327,12 @@ static int rmi_f12_probe(struct rmi_function *fn) struct rmi_device_platform_data *pdata = rmi_get_platform_data(rmi_dev); struct rmi_driver_data *drvdata = dev_get_drvdata(&rmi_dev->dev); u16 data_offset = 0; + int mask_size; rmi_dbg(RMI_DEBUG_FN, &fn->dev, "%s\n", __func__); + mask_size = BITS_TO_LONGS(drvdata->irq_count) * sizeof(unsigned long); + ret = rmi_read(fn->rmi_dev, query_addr, &buf); if (ret < 0) { dev_err(&fn->dev, "Failed to read general info register: %d\n", @@ -332,10 +347,19 @@ static int rmi_f12_probe(struct rmi_function *fn) return -ENODEV; } - f12 = devm_kzalloc(&fn->dev, sizeof(struct f12_data), GFP_KERNEL); + f12 = devm_kzalloc(&fn->dev, sizeof(struct f12_data) + mask_size * 2, + GFP_KERNEL); if (!f12) return -ENOMEM; + f12->abs_mask = (unsigned long *)((char *)f12 + + sizeof(struct f12_data)); + f12->rel_mask = (unsigned long *)((char *)f12 + + sizeof(struct f12_data) + mask_size); + + set_bit(fn->irq_pos, f12->abs_mask); + set_bit(fn->irq_pos + 1, f12->rel_mask); + f12->has_dribble = !!(buf & BIT(3)); if (fn->dev.of_node) { diff --git a/drivers/input/rmi4/rmi_f54.c b/drivers/input/rmi4/rmi_f54.c index 710b02595486..897105b9a98b 100644 --- a/drivers/input/rmi4/rmi_f54.c +++ b/drivers/input/rmi4/rmi_f54.c @@ -359,7 +359,7 @@ static const struct vb2_ops rmi_f54_queue_ops = { static const struct vb2_queue rmi_f54_queue = { .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, .io_modes = VB2_MMAP | VB2_USERPTR | VB2_DMABUF | VB2_READ, - .buf_struct_size = sizeof(struct vb2_buffer), + .buf_struct_size = sizeof(struct vb2_v4l2_buffer), .ops = &rmi_f54_queue_ops, .mem_ops = &vb2_vmalloc_memops, .timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC, @@ -601,7 +601,7 @@ static int rmi_f54_config(struct rmi_function *fn) { struct rmi_driver *drv = fn->rmi_dev->driver; - drv->set_irq_bits(fn->rmi_dev, 
fn->irq_mask); + drv->clear_irq_bits(fn->rmi_dev, fn->irq_mask); return 0; } @@ -730,6 +730,7 @@ static void rmi_f54_remove(struct rmi_function *fn) video_unregister_device(&f54->vdev); v4l2_device_unregister(&f54->v4l2); + destroy_workqueue(f54->workqueue); } struct rmi_function_handler rmi_f54_handler = { diff --git a/drivers/input/touchscreen/cyttsp4_core.c b/drivers/input/touchscreen/cyttsp4_core.c index 4b22d49a0f49..6bcffc930384 100644 --- a/drivers/input/touchscreen/cyttsp4_core.c +++ b/drivers/input/touchscreen/cyttsp4_core.c @@ -1990,11 +1990,6 @@ static int cyttsp4_mt_probe(struct cyttsp4 *cd) /* get sysinfo */ md->si = &cd->sysinfo; - if (!md->si) { - dev_err(dev, "%s: Fail get sysinfo pointer from core p=%p\n", - __func__, md->si); - goto error_get_sysinfo; - } rc = cyttsp4_setup_input_device(cd); if (rc) @@ -2004,8 +1999,6 @@ static int cyttsp4_mt_probe(struct cyttsp4 *cd) error_init_input: input_free_device(md->input); -error_get_sysinfo: - input_set_drvdata(md->input, NULL); error_alloc_failed: dev_err(dev, "%s failed.\n", __func__); return rc; diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index 7b971228df38..c498796adc07 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -405,8 +405,12 @@ void icc_set_tag(struct icc_path *path, u32 tag) if (!path) return; + mutex_lock(&icc_lock); + for (i = 0; i < path->num_nodes; i++) path->reqs[i].tag = tag; + + mutex_unlock(&icc_lock); } EXPORT_SYMBOL_GPL(icc_set_tag); diff --git a/drivers/interconnect/qcom/qcs404.c b/drivers/interconnect/qcom/qcs404.c index 910081d6ddc0..b4966d8f3348 100644 --- a/drivers/interconnect/qcom/qcs404.c +++ b/drivers/interconnect/qcom/qcs404.c @@ -433,7 +433,8 @@ static int qnoc_probe(struct platform_device *pdev) if (!qp) return -ENOMEM; - data = devm_kcalloc(dev, num_nodes, sizeof(*node), GFP_KERNEL); + data = devm_kzalloc(dev, struct_size(data, nodes, num_nodes), + GFP_KERNEL); if (!data) return -ENOMEM; diff --git 
a/drivers/interconnect/qcom/sdm845.c b/drivers/interconnect/qcom/sdm845.c index 57955596bb59..502a6c22b41e 100644 --- a/drivers/interconnect/qcom/sdm845.c +++ b/drivers/interconnect/qcom/sdm845.c @@ -790,7 +790,8 @@ static int qnoc_probe(struct platform_device *pdev) if (!qp) return -ENOMEM; - data = devm_kcalloc(&pdev->dev, num_nodes, sizeof(*node), GFP_KERNEL); + data = devm_kzalloc(&pdev->dev, struct_size(data, nodes, num_nodes), + GFP_KERNEL); if (!data) return -ENOMEM; diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index e7d1920729fb..0ae986c42bc8 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -358,7 +358,7 @@ static int sdhci_at91_probe(struct platform_device *pdev) pm_runtime_use_autosuspend(&pdev->dev); /* HS200 is broken at this moment */ - host->quirks2 = SDHCI_QUIRK2_BROKEN_HS200; + host->quirks2 |= SDHCI_QUIRK2_BROKEN_HS200; ret = sdhci_add_host(host); if (ret) diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index bb6032211043..0a9f42e5fedf 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -617,6 +617,7 @@ err_free_chan: sl->tty = NULL; tty->disc_data = NULL; clear_bit(SLF_INUSE, &sl->flags); + free_netdev(sl->dev); err_exit: rtnl_unlock(); diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c index 073cbd0bb91b..d838c174dc0d 100644 --- a/drivers/net/dsa/mv88e6xxx/ptp.c +++ b/drivers/net/dsa/mv88e6xxx/ptp.c @@ -273,6 +273,19 @@ static int mv88e6352_ptp_enable_extts(struct mv88e6xxx_chip *chip, int pin; int err; + /* Reject requests with unsupported flags */ + if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | + PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS)) + return -EOPNOTSUPP; + + /* Reject requests to enable time stamping on both edges. 
*/ + if ((rq->extts.flags & PTP_STRICT_FLAGS) && + (rq->extts.flags & PTP_ENABLE_FEATURE) && + (rq->extts.flags & PTP_EXTTS_EDGES) == PTP_EXTTS_EDGES) + return -EOPNOTSUPP; + pin = ptp_find_pin(chip->ptp_clock, PTP_PF_EXTTS, rq->extts.index); if (pin < 0) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 77f3511b97de..ca3aa1250dd1 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -6280,6 +6280,10 @@ static int tg3_ptp_enable(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_PEROUT: + /* Reject requests with unsupported flags */ + if (rq->perout.flags) + return -EOPNOTSUPP; + if (rq->perout.index != 0) return -EINVAL; diff --git a/drivers/net/ethernet/cirrus/ep93xx_eth.c b/drivers/net/ethernet/cirrus/ep93xx_eth.c index f1a0c4dceda0..f37c9a08c4cf 100644 --- a/drivers/net/ethernet/cirrus/ep93xx_eth.c +++ b/drivers/net/ethernet/cirrus/ep93xx_eth.c @@ -763,6 +763,7 @@ static int ep93xx_eth_remove(struct platform_device *pdev) { struct net_device *dev; struct ep93xx_priv *ep; + struct resource *mem; dev = platform_get_drvdata(pdev); if (dev == NULL) @@ -778,8 +779,8 @@ static int ep93xx_eth_remove(struct platform_device *pdev) iounmap(ep->base_addr); if (ep->res != NULL) { - release_resource(ep->res); - kfree(ep->res); + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + release_mem_region(mem->start, resource_size(mem)); } free_netdev(dev); diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index e736ce2c58ca..a8f4c69252ff 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -2524,6 +2524,7 @@ static int gemini_ethernet_port_remove(struct platform_device *pdev) struct gemini_ethernet_port *port = platform_get_drvdata(pdev); gemini_port_remove(port); + free_netdev(port->netdev); return 0; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index c26c0a7cbb6b..11538e59f587 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -2260,8 +2260,16 @@ err_set_cdan: err_service_reg: free_channel(priv, channel); err_alloc_ch: - if (err == -EPROBE_DEFER) + if (err == -EPROBE_DEFER) { + for (i = 0; i < priv->num_channels; i++) { + channel = priv->channel[i]; + nctx = &channel->nctx; + dpaa2_io_service_deregister(channel->dpio, nctx, dev); + free_channel(priv, channel); + } + priv->num_channels = 0; return err; + } if (cpumask_empty(&priv->dpio_cpumask)) { dev_err(dev, "No cpu with an affine DPIO/DPCON\n"); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index b104d3c3b757..6e0212b79438 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -70,11 +70,6 @@ static const struct hns3_stats hns3_rxq_stats[] = { #define HNS3_NIC_LB_TEST_TX_CNT_ERR 2 #define HNS3_NIC_LB_TEST_RX_CNT_ERR 3 -struct hns3_link_mode_mapping { - u32 hns3_link_mode; - u32 ethtool_link_mode; -}; - static int hns3_lp_setup(struct net_device *ndev, enum hnae3_loop loop, bool en) { struct hnae3_handle *h = hns3_get_handle(ndev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c index 49ad8483723d..d6c3952aba04 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -124,7 +124,7 @@ static int hclge_ets_validate(struct hclge_dev *hdev, struct ieee_ets *ets, if (ret) return ret; - for (i = 0; i < HNAE3_MAX_TC; i++) { + for (i = 0; i < hdev->tc_max; i++) { switch (ets->tc_tsa[i]) { case IEEE_8021QAZ_TSA_STRICT: if (hdev->tm_info.tc_info[i].tc_sch_mode != @@ -318,6 +318,7 @@ static int hclge_ieee_setpfc(struct hnae3_handle *h, struct ieee_pfc *pfc) 
struct net_device *netdev = h->kinfo.netdev; struct hclge_dev *hdev = vport->back; u8 i, j, pfc_map, *prio_tc; + int ret; if (!(hdev->dcbx_cap & DCB_CAP_DCBX_VER_IEEE) || hdev->flag & HCLGE_FLAG_MQPRIO_ENABLE) @@ -347,7 +348,21 @@ static int hclge_ieee_setpfc(struct hnae3_handle *h, struct ieee_pfc *pfc) hclge_tm_pfc_info_update(hdev); - return hclge_pause_setup_hw(hdev, false); + ret = hclge_pause_setup_hw(hdev, false); + if (ret) + return ret; + + ret = hclge_notify_client(hdev, HNAE3_DOWN_CLIENT); + if (ret) + return ret; + + ret = hclge_buffer_alloc(hdev); + if (ret) { + hclge_notify_client(hdev, HNAE3_UP_CLIENT); + return ret; + } + + return hclge_notify_client(hdev, HNAE3_UP_CLIENT); } /* DCBX configuration */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index eb14c43e4a90..7c7038676d6d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6366,11 +6366,23 @@ static int hclge_config_switch_param(struct hclge_dev *hdev, int vfid, func_id = hclge_get_port_number(HOST_PORT, 0, vfid, 0); req = (struct hclge_mac_vlan_switch_cmd *)desc.data; + + /* read current config parameter */ hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MAC_VLAN_SWITCH_PARAM, - false); + true); req->roce_sel = HCLGE_MAC_VLAN_NIC_SEL; req->func_id = cpu_to_le32(func_id); - req->switch_param = switch_param; + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "read mac vlan switch parameter fail, ret = %d\n", ret); + return ret; + } + + /* modify and write new config parameter */ + hclge_cmd_reuse_desc(&desc, false); + req->switch_param = (req->switch_param & param_mask) | switch_param; req->param_mask = param_mask; ret = hclge_cmd_send(&hdev->hw, &desc, 1); diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index fd3071f55bd3..c39e921757ba 100644 --- 
a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -521,6 +521,19 @@ static int igb_ptp_feature_enable_i210(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: + /* Reject requests with unsupported flags */ + if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | + PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS)) + return -EOPNOTSUPP; + + /* Reject requests failing to enable both edges. */ + if ((rq->extts.flags & PTP_STRICT_FLAGS) && + (rq->extts.flags & PTP_ENABLE_FEATURE) && + (rq->extts.flags & PTP_EXTTS_EDGES) != PTP_EXTTS_EDGES) + return -EOPNOTSUPP; + if (on) { pin = ptp_find_pin(igb->ptp_clock, PTP_PF_EXTTS, rq->extts.index); @@ -551,6 +564,10 @@ static int igb_ptp_feature_enable_i210(struct ptp_clock_info *ptp, return 0; case PTP_CLK_REQ_PEROUT: + /* Reject requests with unsupported flags */ + if (rq->perout.flags) + return -EOPNOTSUPP; + if (on) { pin = ptp_find_pin(igb->ptp_clock, PTP_PF_PEROUT, rq->perout.index); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h index 096a04a2f3e1..9343bf39cfac 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Marvell OcteonTx2 CGX driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 CGX driver * * Copyright (C) 2018 Marvell International Ltd. * diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx_fw_if.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx_fw_if.h index fb3ba4968a9b..473d9751601f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx_fw_if.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx_fw_if.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Marvell OcteonTx2 CGX driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 CGX driver * * Copyright (C) 2018 Marvell International Ltd. 
* diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h index baec832962df..784207bae5f8 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/common.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Marvell OcteonTx2 RVU Admin Function driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 RVU Admin Function driver * * Copyright (C) 2018 Marvell International Ltd. * diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index f143d7b31d73..a589748f1240 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Marvell OcteonTx2 RVU Admin Function driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 RVU Admin Function driver * * Copyright (C) 2018 Marvell International Ltd. * diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/npc.h index 168fb4ea6ab6..3803af9231c6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Marvell OcteonTx2 RVU Admin Function driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 RVU Admin Function driver * * Copyright (C) 2018 Marvell International Ltd. 
* diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h index 832810ad6b02..aa2727e6211a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Marvell OcteonTx2 RVU Admin Function driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 RVU Admin Function driver * * Copyright (C) 2018 Marvell International Ltd. * diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index b252d8683aa7..51c206f4fe6f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Marvell OcteonTx2 RVU Admin Function driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 RVU Admin Function driver * * Copyright (C) 2018 Marvell International Ltd. * diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h index be758c19648d..7ca599b973c0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Marvell OcteonTx2 RVU Admin Function driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 RVU Admin Function driver * * Copyright (C) 2018 Marvell International Ltd. 
* diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h index f6a260d419fd..9d8942acc232 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Marvell OcteonTx2 RVU Admin Function driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 RVU Admin Function driver * * Copyright (C) 2018 Marvell International Ltd. * diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 0059b290e095..43f97601b500 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -236,6 +236,19 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, if (!MLX5_PPS_CAP(mdev)) return -EOPNOTSUPP; + /* Reject requests with unsupported flags */ + if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | + PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS)) + return -EOPNOTSUPP; + + /* Reject requests to enable time stamping on both edges. 
*/ + if ((rq->extts.flags & PTP_STRICT_FLAGS) && + (rq->extts.flags & PTP_ENABLE_FEATURE) && + (rq->extts.flags & PTP_EXTTS_EDGES) == PTP_EXTTS_EDGES) + return -EOPNOTSUPP; + if (rq->extts.index >= clock->ptp_info.n_pins) return -EINVAL; @@ -290,6 +303,10 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if (!MLX5_PPS_CAP(mdev)) return -EOPNOTSUPP; + /* Reject requests with unsupported flags */ + if (rq->perout.flags) + return -EOPNOTSUPP; + if (rq->perout.index >= clock->ptp_info.n_pins) return -EINVAL; diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index e177b1ae03f1..afe52463dc57 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -492,6 +492,10 @@ static int lan743x_ptp_perout(struct lan743x_adapter *adapter, int on, unsigned int index = perout_request->index; struct lan743x_ptp_perout *perout = &ptp->perout[index]; + /* Reject requests with unsupported flags */ + if (perout_request->flags) + return -EOPNOTSUPP; + if (on) { perout_pin = ptp_find_pin(ptp->ptp_clock, PTP_PF_PEROUT, perout_request->index); diff --git a/drivers/net/ethernet/renesas/ravb.h b/drivers/net/ethernet/renesas/ravb.h index a9c89d5d8898..9f88b5db4f89 100644 --- a/drivers/net/ethernet/renesas/ravb.h +++ b/drivers/net/ethernet/renesas/ravb.h @@ -955,6 +955,8 @@ enum RAVB_QUEUE { #define NUM_RX_QUEUE 2 #define NUM_TX_QUEUE 2 +#define RX_BUF_SZ (2048 - ETH_FCS_LEN + sizeof(__sum16)) + /* TX descriptors per packet */ #define NUM_TX_DESC_GEN2 2 #define NUM_TX_DESC_GEN3 1 @@ -1018,7 +1020,6 @@ struct ravb_private { u32 dirty_rx[NUM_RX_QUEUE]; /* Producer ring indices */ u32 cur_tx[NUM_TX_QUEUE]; u32 dirty_tx[NUM_TX_QUEUE]; - u32 rx_buf_sz; /* Based on MTU+slack. */ struct napi_struct napi[NUM_RX_QUEUE]; struct work_struct work; /* MII transceiver section. 
*/ diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 5ea14b5fbed8..4b13a184bfc7 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -230,7 +230,7 @@ static void ravb_ring_free(struct net_device *ndev, int q) le32_to_cpu(desc->dptr))) dma_unmap_single(ndev->dev.parent, le32_to_cpu(desc->dptr), - priv->rx_buf_sz, + RX_BUF_SZ, DMA_FROM_DEVICE); } ring_size = sizeof(struct ravb_ex_rx_desc) * @@ -293,9 +293,9 @@ static void ravb_ring_format(struct net_device *ndev, int q) for (i = 0; i < priv->num_rx_ring[q]; i++) { /* RX descriptor */ rx_desc = &priv->rx_ring[q][i]; - rx_desc->ds_cc = cpu_to_le16(priv->rx_buf_sz); + rx_desc->ds_cc = cpu_to_le16(RX_BUF_SZ); dma_addr = dma_map_single(ndev->dev.parent, priv->rx_skb[q][i]->data, - priv->rx_buf_sz, + RX_BUF_SZ, DMA_FROM_DEVICE); /* We just set the data size to 0 for a failed mapping which * should prevent DMA from happening... @@ -342,9 +342,6 @@ static int ravb_ring_init(struct net_device *ndev, int q) int ring_size; int i; - priv->rx_buf_sz = (ndev->mtu <= 1492 ? PKT_BUF_SZ : ndev->mtu) + - ETH_HLEN + VLAN_HLEN + sizeof(__sum16); - /* Allocate RX and TX skb rings */ priv->rx_skb[q] = kcalloc(priv->num_rx_ring[q], sizeof(*priv->rx_skb[q]), GFP_KERNEL); @@ -354,7 +351,7 @@ static int ravb_ring_init(struct net_device *ndev, int q) goto error; for (i = 0; i < priv->num_rx_ring[q]; i++) { - skb = netdev_alloc_skb(ndev, priv->rx_buf_sz + RAVB_ALIGN - 1); + skb = netdev_alloc_skb(ndev, RX_BUF_SZ + RAVB_ALIGN - 1); if (!skb) goto error; ravb_set_buffer_align(skb); @@ -584,7 +581,7 @@ static bool ravb_rx(struct net_device *ndev, int *quota, int q) skb = priv->rx_skb[q][entry]; priv->rx_skb[q][entry] = NULL; dma_unmap_single(ndev->dev.parent, le32_to_cpu(desc->dptr), - priv->rx_buf_sz, + RX_BUF_SZ, DMA_FROM_DEVICE); get_ts &= (q == RAVB_NC) ? 
RAVB_RXTSTAMP_TYPE_V2_L2_EVENT : @@ -617,11 +614,11 @@ static bool ravb_rx(struct net_device *ndev, int *quota, int q) for (; priv->cur_rx[q] - priv->dirty_rx[q] > 0; priv->dirty_rx[q]++) { entry = priv->dirty_rx[q] % priv->num_rx_ring[q]; desc = &priv->rx_ring[q][entry]; - desc->ds_cc = cpu_to_le16(priv->rx_buf_sz); + desc->ds_cc = cpu_to_le16(RX_BUF_SZ); if (!priv->rx_skb[q][entry]) { skb = netdev_alloc_skb(ndev, - priv->rx_buf_sz + + RX_BUF_SZ + RAVB_ALIGN - 1); if (!skb) break; /* Better luck next round. */ @@ -1801,10 +1798,15 @@ static int ravb_do_ioctl(struct net_device *ndev, struct ifreq *req, int cmd) static int ravb_change_mtu(struct net_device *ndev, int new_mtu) { - if (netif_running(ndev)) - return -EBUSY; + struct ravb_private *priv = netdev_priv(ndev); ndev->mtu = new_mtu; + + if (netif_running(ndev)) { + synchronize_irq(priv->emac_irq); + ravb_emac_init(ndev); + } + netdev_update_features(ndev); return 0; diff --git a/drivers/net/ethernet/renesas/ravb_ptp.c b/drivers/net/ethernet/renesas/ravb_ptp.c index 9a42580693cb..6984bd5b7da9 100644 --- a/drivers/net/ethernet/renesas/ravb_ptp.c +++ b/drivers/net/ethernet/renesas/ravb_ptp.c @@ -182,6 +182,13 @@ static int ravb_ptp_extts(struct ptp_clock_info *ptp, struct net_device *ndev = priv->ndev; unsigned long flags; + /* Reject requests with unsupported flags */ + if (req->flags & ~(PTP_ENABLE_FEATURE | + PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS)) + return -EOPNOTSUPP; + if (req->index) return -EINVAL; @@ -211,6 +218,10 @@ static int ravb_ptp_perout(struct ptp_clock_info *ptp, unsigned long flags; int error = 0; + /* Reject requests with unsupported flags */ + if (req->flags) + return -EOPNOTSUPP; + if (req->index) return -EINVAL; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index eefb06d918c8..1c8d84ed8410 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ 
b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1227,7 +1227,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) dwmac_mux: sun8i_dwmac_unset_syscon(gmac); dwmac_exit: - sun8i_dwmac_exit(pdev, plat_dat->bsp_priv); + stmmac_pltfr_remove(pdev); return ret; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h index 775db776b3cc..23fecf68f781 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ // Copyright (c) 2017 Synopsys, Inc. and/or its affiliates. // stmmac Support for 5.xx Ethernet QoS cores diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h index e908d80a1d6f..3b6e559aa0b9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ /* * Copyright (c) 2018 Synopsys, Inc. and/or its affiliates. * stmmac XGMAC definitions. diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 509daeefdb79..aa5b917398fe 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ // Copyright (c) 2018 Synopsys, Inc. and/or its affiliates. 
// stmmac HW Interface Callbacks diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index df638b18b72c..0989e2bb6ee3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -140,6 +140,10 @@ static int stmmac_enable(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_PEROUT: + /* Reject requests with unsupported flags */ + if (rq->perout.flags) + return -EOPNOTSUPP; + cfg = &priv->pps[rq->perout.index]; cfg->start.tv_sec = rq->perout.start.sec; diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index 6580094161a9..8f241b57fcf6 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -469,6 +469,19 @@ static int ptp_dp83640_enable(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: + /* Reject requests with unsupported flags */ + if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | + PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS)) + return -EOPNOTSUPP; + + /* Reject requests to enable time stamping on both edges. 
*/ + if ((rq->extts.flags & PTP_STRICT_FLAGS) && + (rq->extts.flags & PTP_ENABLE_FEATURE) && + (rq->extts.flags & PTP_EXTTS_EDGES) == PTP_EXTTS_EDGES) + return -EOPNOTSUPP; + index = rq->extts.index; if (index >= N_EXT_TS) return -EINVAL; @@ -491,6 +504,9 @@ static int ptp_dp83640_enable(struct ptp_clock_info *ptp, return 0; case PTP_CLK_REQ_PEROUT: + /* Reject requests with unsupported flags */ + if (rq->perout.flags) + return -EOPNOTSUPP; if (rq->perout.index >= N_PER_OUT) return -EINVAL; return periodic_output(clock, rq, on, rq->perout.index); diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 2e29ab841b4d..35876562e32a 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -64,11 +64,12 @@ static int mdiobus_register_reset(struct mdio_device *mdiodev) if (mdiodev->dev.of_node) reset = devm_reset_control_get_exclusive(&mdiodev->dev, "phy"); - if (PTR_ERR(reset) == -ENOENT || - PTR_ERR(reset) == -ENOTSUPP) - reset = NULL; - else if (IS_ERR(reset)) - return PTR_ERR(reset); + if (IS_ERR(reset)) { + if (PTR_ERR(reset) == -ENOENT || PTR_ERR(reset) == -ENOSYS) + reset = NULL; + else + return PTR_ERR(reset); + } mdiodev->reset_ctrl = reset; diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index cac64b96d545..4d479e3c817d 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -855,6 +855,7 @@ err_free_chan: sl->tty = NULL; tty->disc_data = NULL; clear_bit(SLF_INUSE, &sl->flags); + free_netdev(sl->dev); err_exit: rtnl_unlock(); diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c index 011bd4cb546e..af3994e0853b 100644 --- a/drivers/net/usb/ax88172a.c +++ b/drivers/net/usb/ax88172a.c @@ -196,7 +196,7 @@ static int ax88172a_bind(struct usbnet *dev, struct usb_interface *intf) /* Get the MAC address */ ret = asix_read_cmd(dev, AX_CMD_READ_NODE_ID, 0, 0, ETH_ALEN, buf, 0); - if (ret < 0) { + if (ret < ETH_ALEN) { netdev_err(dev->net, "Failed to read MAC address: %d\n", ret); goto free; 
} diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index a245597a3902..c2c82e6391b4 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -579,7 +579,7 @@ static void cdc_ncm_set_dgram_size(struct usbnet *dev, int new_size) err = usbnet_read_cmd(dev, USB_CDC_GET_MAX_DATAGRAM_SIZE, USB_TYPE_CLASS | USB_DIR_IN | USB_RECIP_INTERFACE, 0, iface_no, &max_datagram_size, sizeof(max_datagram_size)); - if (err < sizeof(max_datagram_size)) { + if (err != sizeof(max_datagram_size)) { dev_dbg(&dev->intf->dev, "GET_MAX_DATAGRAM_SIZE failed\n"); goto out; } diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 56d334b9ad45..4196c0e32740 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1371,6 +1371,8 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x2c7c, 0x0191, 4)}, /* Quectel EG91 */ {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0104, 4)}, /* Fibocom NL678 series */ + {QMI_FIXED_INTF(0x0489, 0xe0b4, 0)}, /* Foxconn T77W968 LTE */ + {QMI_FIXED_INTF(0x0489, 0xe0b5, 0)}, /* Foxconn T77W968 LTE with eSIM support*/ /* 4. 
Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 4c9c78d1ba98..8323fa7c0762 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -251,27 +251,23 @@ static int iwl_pcie_gen2_build_amsdu(struct iwl_trans *trans, struct ieee80211_hdr *hdr = (void *)skb->data; unsigned int snap_ip_tcp_hdrlen, ip_hdrlen, total_len, hdr_room; unsigned int mss = skb_shinfo(skb)->gso_size; - u16 length, iv_len, amsdu_pad; + u16 length, amsdu_pad; u8 *start_hdr; struct iwl_tso_hdr_page *hdr_page; struct page **page_ptr; struct tso_t tso; - /* if the packet is protected, then it must be CCMP or GCMP */ - iv_len = ieee80211_has_protected(hdr->frame_control) ? - IEEE80211_CCMP_HDR_LEN : 0; - trace_iwlwifi_dev_tx(trans->dev, skb, tfd, sizeof(*tfd), &dev_cmd->hdr, start_len, 0); ip_hdrlen = skb_transport_header(skb) - skb_network_header(skb); snap_ip_tcp_hdrlen = 8 + ip_hdrlen + tcp_hdrlen(skb); - total_len = skb->len - snap_ip_tcp_hdrlen - hdr_len - iv_len; + total_len = skb->len - snap_ip_tcp_hdrlen - hdr_len; amsdu_pad = 0; /* total amount of header we may need for this A-MSDU */ hdr_room = DIV_ROUND_UP(total_len, mss) * - (3 + snap_ip_tcp_hdrlen + sizeof(struct ethhdr)) + iv_len; + (3 + snap_ip_tcp_hdrlen + sizeof(struct ethhdr)); /* Our device supports 9 segments at most, it will fit in 1 page */ hdr_page = get_page_hdr(trans, hdr_room); @@ -282,14 +278,12 @@ static int iwl_pcie_gen2_build_amsdu(struct iwl_trans *trans, start_hdr = hdr_page->pos; page_ptr = (void *)((u8 *)skb->cb + trans_pcie->page_offs); *page_ptr = hdr_page->page; - memcpy(hdr_page->pos, skb->data + hdr_len, iv_len); - hdr_page->pos += iv_len; /* - * Pull the ieee80211 header + IV to be able to use TSO core, + * Pull the ieee80211 header to be able to use TSO core, * we will restore it for the 
tx_status flow. */ - skb_pull(skb, hdr_len + iv_len); + skb_pull(skb, hdr_len); /* * Remove the length of all the headers that we don't actually @@ -364,8 +358,8 @@ static int iwl_pcie_gen2_build_amsdu(struct iwl_trans *trans, } } - /* re -add the WiFi header and IV */ - skb_push(skb, hdr_len + iv_len); + /* re -add the WiFi header */ + skb_push(skb, hdr_len); return 0; diff --git a/drivers/nfc/nxp-nci/i2c.c b/drivers/nfc/nxp-nci/i2c.c index 307bd2afbe05..4d1909aecd6c 100644 --- a/drivers/nfc/nxp-nci/i2c.c +++ b/drivers/nfc/nxp-nci/i2c.c @@ -220,8 +220,10 @@ static irqreturn_t nxp_nci_i2c_irq_thread_fn(int irq, void *phy_id) if (r == -EREMOTEIO) { phy->hard_fault = r; - skb = NULL; - } else if (r < 0) { + if (info->mode == NXP_NCI_MODE_FW) + nxp_nci_fw_recv_frame(phy->ndev, NULL); + } + if (r < 0) { nfc_err(&client->dev, "Read failed with error %d\n", r); goto exit_irq_handled; } diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c index c6251eac8946..2c419fa5d1c1 100644 --- a/drivers/pinctrl/intel/pinctrl-cherryview.c +++ b/drivers/pinctrl/intel/pinctrl-cherryview.c @@ -147,6 +147,7 @@ struct chv_pin_context { * @pctldesc: Pin controller description * @pctldev: Pointer to the pin controller device * @chip: GPIO chip in this pin controller + * @irqchip: IRQ chip in this pin controller * @regs: MMIO registers * @intr_lines: Stores mapping between 16 HW interrupt wires and GPIO * offset (in GPIO number space) @@ -162,6 +163,7 @@ struct chv_pinctrl { struct pinctrl_desc pctldesc; struct pinctrl_dev *pctldev; struct gpio_chip chip; + struct irq_chip irqchip; void __iomem *regs; unsigned intr_lines[16]; const struct chv_community *community; @@ -1466,16 +1468,6 @@ static int chv_gpio_irq_type(struct irq_data *d, unsigned int type) return 0; } -static struct irq_chip chv_gpio_irqchip = { - .name = "chv-gpio", - .irq_startup = chv_gpio_irq_startup, - .irq_ack = chv_gpio_irq_ack, - .irq_mask = chv_gpio_irq_mask, - .irq_unmask 
= chv_gpio_irq_unmask, - .irq_set_type = chv_gpio_irq_type, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - static void chv_gpio_irq_handler(struct irq_desc *desc) { struct gpio_chip *gc = irq_desc_get_handler_data(desc); @@ -1559,7 +1551,7 @@ static void chv_init_irq_valid_mask(struct gpio_chip *chip, intsel >>= CHV_PADCTRL0_INTSEL_SHIFT; if (intsel >= community->nirqs) - clear_bit(i, valid_mask); + clear_bit(desc->number, valid_mask); } } @@ -1625,7 +1617,15 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq) } } - ret = gpiochip_irqchip_add(chip, &chv_gpio_irqchip, 0, + pctrl->irqchip.name = "chv-gpio"; + pctrl->irqchip.irq_startup = chv_gpio_irq_startup; + pctrl->irqchip.irq_ack = chv_gpio_irq_ack; + pctrl->irqchip.irq_mask = chv_gpio_irq_mask; + pctrl->irqchip.irq_unmask = chv_gpio_irq_unmask; + pctrl->irqchip.irq_set_type = chv_gpio_irq_type; + pctrl->irqchip.flags = IRQCHIP_SKIP_SET_WAKE; + + ret = gpiochip_irqchip_add(chip, &pctrl->irqchip, 0, handle_bad_irq, IRQ_TYPE_NONE); if (ret) { dev_err(pctrl->dev, "failed to add IRQ chip\n"); @@ -1642,7 +1642,7 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq) } } - gpiochip_set_chained_irqchip(chip, &chv_gpio_irqchip, irq, + gpiochip_set_chained_irqchip(chip, &pctrl->irqchip, irq, chv_gpio_irq_handler); return 0; } diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index bc013599a9a3..83981ad66a71 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -52,6 +52,7 @@ #define PADCFG0_GPIROUTNMI BIT(17) #define PADCFG0_PMODE_SHIFT 10 #define PADCFG0_PMODE_MASK GENMASK(13, 10) +#define PADCFG0_PMODE_GPIO 0 #define PADCFG0_GPIORXDIS BIT(9) #define PADCFG0_GPIOTXDIS BIT(8) #define PADCFG0_GPIORXSTATE BIT(1) @@ -332,7 +333,7 @@ static void intel_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, cfg1 = readl(intel_get_padcfg(pctrl, pin, PADCFG1)); mode = (cfg0 & PADCFG0_PMODE_MASK) >> PADCFG0_PMODE_SHIFT; - if 
(!mode) + if (mode == PADCFG0_PMODE_GPIO) seq_puts(s, "GPIO "); else seq_printf(s, "mode %d ", mode); @@ -458,6 +459,11 @@ static void __intel_gpio_set_direction(void __iomem *padcfg0, bool input) writel(value, padcfg0); } +static int intel_gpio_get_gpio_mode(void __iomem *padcfg0) +{ + return (readl(padcfg0) & PADCFG0_PMODE_MASK) >> PADCFG0_PMODE_SHIFT; +} + static void intel_gpio_set_gpio_mode(void __iomem *padcfg0) { u32 value; @@ -491,7 +497,20 @@ static int intel_gpio_request_enable(struct pinctrl_dev *pctldev, } padcfg0 = intel_get_padcfg(pctrl, pin, PADCFG0); + + /* + * If pin is already configured in GPIO mode, we assume that + * firmware provides correct settings. In such case we avoid + * potential glitches on the pin. Otherwise, for the pin in + * alternative mode, consumer has to supply respective flags. + */ + if (intel_gpio_get_gpio_mode(padcfg0) == PADCFG0_PMODE_GPIO) { + raw_spin_unlock_irqrestore(&pctrl->lock, flags); + return 0; + } + intel_gpio_set_gpio_mode(padcfg0); + /* Disable TX buffer and enable RX (this will be input) */ __intel_gpio_set_direction(padcfg0, true); diff --git a/drivers/pinctrl/pinctrl-stmfx.c b/drivers/pinctrl/pinctrl-stmfx.c index 564660028fcc..ccdf0bb21414 100644 --- a/drivers/pinctrl/pinctrl-stmfx.c +++ b/drivers/pinctrl/pinctrl-stmfx.c @@ -585,19 +585,6 @@ static int stmfx_pinctrl_gpio_function_enable(struct stmfx_pinctrl *pctl) return stmfx_function_enable(pctl->stmfx, func); } -static int stmfx_pinctrl_gpio_init_valid_mask(struct gpio_chip *gc, - unsigned long *valid_mask, - unsigned int ngpios) -{ - struct stmfx_pinctrl *pctl = gpiochip_get_data(gc); - u32 n; - - for_each_clear_bit(n, &pctl->gpio_valid_mask, ngpios) - clear_bit(n, valid_mask); - - return 0; -} - static int stmfx_pinctrl_probe(struct platform_device *pdev) { struct stmfx *stmfx = dev_get_drvdata(pdev->dev.parent); @@ -660,7 +647,6 @@ static int stmfx_pinctrl_probe(struct platform_device *pdev) pctl->gpio_chip.ngpio = pctl->pctl_desc.npins; 
pctl->gpio_chip.can_sleep = true; pctl->gpio_chip.of_node = np; - pctl->gpio_chip.init_valid_mask = stmfx_pinctrl_gpio_init_valid_mask; ret = devm_gpiochip_add_data(pctl->dev, &pctl->gpio_chip, pctl); if (ret) { diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 67d0199840fd..9d72ab593f13 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -149,11 +149,21 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EFAULT; break; } - if (((req.extts.flags & ~PTP_EXTTS_VALID_FLAGS) || - req.extts.rsv[0] || req.extts.rsv[1]) && - cmd == PTP_EXTTS_REQUEST2) { - err = -EINVAL; - break; + if (cmd == PTP_EXTTS_REQUEST2) { + /* Tell the drivers to check the flags carefully. */ + req.extts.flags |= PTP_STRICT_FLAGS; + /* Make sure no reserved bit is set. */ + if ((req.extts.flags & ~PTP_EXTTS_VALID_FLAGS) || + req.extts.rsv[0] || req.extts.rsv[1]) { + err = -EINVAL; + break; + } + /* Ensure one of the rising/falling edge bits is set. */ + if ((req.extts.flags & PTP_ENABLE_FEATURE) && + (req.extts.flags & PTP_EXTTS_EDGES) == 0) { + err = -EINVAL; + break; + } } else if (cmd == PTP_EXTTS_REQUEST) { req.extts.flags &= PTP_EXTTS_V1_VALID_FLAGS; req.extts.rsv[0] = 0; diff --git a/drivers/reset/core.c b/drivers/reset/core.c index 213ff40dda11..3c9a64c1b7a8 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -76,7 +76,6 @@ static const char *rcdev_name(struct reset_controller_dev *rcdev) * of_reset_simple_xlate - translate reset_spec to the reset line number * @rcdev: a pointer to the reset controller device * @reset_spec: reset line specifier as found in the device tree - * @flags: a flags pointer to fill in (optional) * * This simple translation function should be used for reset controllers * with 1:1 mapping, where reset lines can be indexed by number without gaps. 
@@ -748,6 +747,7 @@ static void reset_control_array_put(struct reset_control_array *resets) for (i = 0; i < resets->num_rstcs; i++) __reset_control_put_internal(resets->rstc[i]); mutex_unlock(&reset_list_mutex); + kfree(resets); } /** @@ -825,9 +825,10 @@ int __device_reset(struct device *dev, bool optional) } EXPORT_SYMBOL_GPL(__device_reset); -/** +/* * APIs to manage an array of reset controls. */ + /** * of_reset_control_get_count - Count number of resets available with a device * diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index 6afad68e5ba2..238240984bc1 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -76,9 +76,11 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha) * ensures no active vp_list traversal while the vport is removed * from the queue) */ - for (i = 0; i < 10 && atomic_read(&vha->vref_count); i++) - wait_event_timeout(vha->vref_waitq, - atomic_read(&vha->vref_count), HZ); + for (i = 0; i < 10; i++) { + if (wait_event_timeout(vha->vref_waitq, + !atomic_read(&vha->vref_count), HZ) > 0) + break; + } spin_lock_irqsave(&ha->vport_slock, flags); if (atomic_read(&vha->vref_count)) { diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 337162ac3a77..726ad4cbf4a6 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1119,9 +1119,11 @@ qla2x00_wait_for_sess_deletion(scsi_qla_host_t *vha) qla2x00_mark_all_devices_lost(vha, 0); - for (i = 0; i < 10; i++) - wait_event_timeout(vha->fcport_waitQ, test_fcport_count(vha), - HZ); + for (i = 0; i < 10; i++) { + if (wait_event_timeout(vha->fcport_waitQ, + test_fcport_count(vha), HZ) > 0) + break; + } flush_workqueue(vha->hw->wq); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 5447738906ac..91c007d26c1e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1883,7 +1883,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) { unsigned int cmd_size, sgl_size; - 
sgl_size = scsi_mq_inline_sgl_size(shost); + sgl_size = max_t(unsigned int, sizeof(struct scatterlist), + scsi_mq_inline_sgl_size(shost)); cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size; if (scsi_host_get_prot(shost)) cmd_size += sizeof(struct scsi_data_buffer) + diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index de4019dc0f0b..1efc69e194f8 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -263,25 +263,16 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, int result = cmd->result; struct request *rq = cmd->request; - switch (req_op(rq)) { - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_RESET_ALL: - - if (result && - sshdr->sense_key == ILLEGAL_REQUEST && - sshdr->asc == 0x24) - /* - * INVALID FIELD IN CDB error: reset of a conventional - * zone was attempted. Nothing to worry about, so be - * quiet about the error. - */ - rq->rq_flags |= RQF_QUIET; - break; - - case REQ_OP_WRITE: - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: - break; + if (req_op(rq) == REQ_OP_ZONE_RESET && + result && + sshdr->sense_key == ILLEGAL_REQUEST && + sshdr->asc == 0x24) { + /* + * INVALID FIELD IN CDB error: reset of a conventional + * zone was attempted. Nothing to worry about, so be + * quiet about the error. 
+ */ + rq->rq_flags |= RQF_QUIET; } } diff --git a/drivers/soc/imx/gpc.c b/drivers/soc/imx/gpc.c index d9231bd3c691..98b9d9a902ae 100644 --- a/drivers/soc/imx/gpc.c +++ b/drivers/soc/imx/gpc.c @@ -249,13 +249,13 @@ static struct genpd_power_state imx6_pm_domain_pu_state = { }; static struct imx_pm_domain imx_gpc_domains[] = { - [GPC_PGC_DOMAIN_ARM] { + [GPC_PGC_DOMAIN_ARM] = { .base = { .name = "ARM", .flags = GENPD_FLAG_ALWAYS_ON, }, }, - [GPC_PGC_DOMAIN_PU] { + [GPC_PGC_DOMAIN_PU] = { .base = { .name = "PU", .power_off = imx6_pm_domain_power_off, @@ -266,7 +266,7 @@ static struct imx_pm_domain imx_gpc_domains[] = { .reg_offs = 0x260, .cntr_pdn_bit = 0, }, - [GPC_PGC_DOMAIN_DISPLAY] { + [GPC_PGC_DOMAIN_DISPLAY] = { .base = { .name = "DISPLAY", .power_off = imx6_pm_domain_power_off, @@ -275,7 +275,7 @@ static struct imx_pm_domain imx_gpc_domains[] = { .reg_offs = 0x240, .cntr_pdn_bit = 4, }, - [GPC_PGC_DOMAIN_PCI] { + [GPC_PGC_DOMAIN_PCI] = { .base = { .name = "PCI", .power_off = imx6_pm_domain_power_off, diff --git a/drivers/soundwire/Kconfig b/drivers/soundwire/Kconfig index f518273cfbe3..c8c80df090d1 100644 --- a/drivers/soundwire/Kconfig +++ b/drivers/soundwire/Kconfig @@ -5,6 +5,7 @@ menuconfig SOUNDWIRE tristate "SoundWire support" + depends on ACPI || OF help SoundWire is a 2-Pin interface with data and clock line ratified by the MIPI Alliance. 
SoundWire is used for transporting data diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c index f1e38a293967..13c54eac0cc3 100644 --- a/drivers/soundwire/intel.c +++ b/drivers/soundwire/intel.c @@ -900,7 +900,7 @@ static int intel_register_dai(struct sdw_intel *sdw) /* Create PCM DAIs */ stream = &cdns->pcm; - ret = intel_create_dai(cdns, dais, INTEL_PDI_IN, stream->num_in, + ret = intel_create_dai(cdns, dais, INTEL_PDI_IN, cdns->pcm.num_in, off, stream->num_ch_in, true); if (ret) return ret; @@ -931,7 +931,7 @@ static int intel_register_dai(struct sdw_intel *sdw) if (ret) return ret; - off += cdns->pdm.num_bd; + off += cdns->pdm.num_out; ret = intel_create_dai(cdns, dais, INTEL_PDI_BD, cdns->pdm.num_bd, off, stream->num_ch_bd, false); if (ret) diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c index 48a63ca130d2..6473fa602f82 100644 --- a/drivers/soundwire/slave.c +++ b/drivers/soundwire/slave.c @@ -128,7 +128,8 @@ int sdw_of_find_slaves(struct sdw_bus *bus) struct device_node *node; for_each_child_of_node(bus->dev->of_node, node) { - int link_id, sdw_version, ret, len; + int link_id, ret, len; + unsigned int sdw_version; const char *compat = NULL; struct sdw_slave_id id; const __be32 *addr; diff --git a/drivers/thunderbolt/nhi_ops.c b/drivers/thunderbolt/nhi_ops.c index 61cd09cef943..6795851aac95 100644 --- a/drivers/thunderbolt/nhi_ops.c +++ b/drivers/thunderbolt/nhi_ops.c @@ -80,7 +80,6 @@ static void icl_nhi_lc_mailbox_cmd(struct tb_nhi *nhi, enum icl_lc_mailbox_cmd c { u32 data; - pci_read_config_dword(nhi->pdev, VS_CAP_19, &data); data = (cmd << VS_CAP_19_CMD_SHIFT) & VS_CAP_19_CMD_MASK; pci_write_config_dword(nhi->pdev, VS_CAP_19, data | VS_CAP_19_VALID); } diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 410bf1bceeee..5ea8db667e83 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -896,12 +896,13 @@ int tb_dp_port_set_hops(struct tb_port *port, unsigned int video, 
*/ bool tb_dp_port_is_enabled(struct tb_port *port) { - u32 data; + u32 data[2]; - if (tb_port_read(port, &data, TB_CFG_PORT, port->cap_adap, 1)) + if (tb_port_read(port, data, TB_CFG_PORT, port->cap_adap, + ARRAY_SIZE(data))) return false; - return !!(data & (TB_DP_VIDEO_EN | TB_DP_AUX_EN)); + return !!(data[0] & (TB_DP_VIDEO_EN | TB_DP_AUX_EN)); } /** @@ -914,19 +915,21 @@ bool tb_dp_port_is_enabled(struct tb_port *port) */ int tb_dp_port_enable(struct tb_port *port, bool enable) { - u32 data; + u32 data[2]; int ret; - ret = tb_port_read(port, &data, TB_CFG_PORT, port->cap_adap, 1); + ret = tb_port_read(port, data, TB_CFG_PORT, port->cap_adap, + ARRAY_SIZE(data)); if (ret) return ret; if (enable) - data |= TB_DP_VIDEO_EN | TB_DP_AUX_EN; + data[0] |= TB_DP_VIDEO_EN | TB_DP_AUX_EN; else - data &= ~(TB_DP_VIDEO_EN | TB_DP_AUX_EN); + data[0] &= ~(TB_DP_VIDEO_EN | TB_DP_AUX_EN); - return tb_port_write(port, &data, TB_CFG_PORT, port->cap_adap, 1); + return tb_port_write(port, data, TB_CFG_PORT, port->cap_adap, + ARRAY_SIZE(data)); } /* switch utility functions */ @@ -1031,13 +1034,6 @@ static int tb_switch_set_authorized(struct tb_switch *sw, unsigned int val) if (sw->authorized) goto unlock; - /* - * Make sure there is no PCIe rescan ongoing when a new PCIe - * tunnel is created. Otherwise the PCIe rescan code might find - * the new tunnel too early. 
- */ - pci_lock_rescan_remove(); - switch (val) { /* Approve switch */ case 1: @@ -1057,8 +1053,6 @@ static int tb_switch_set_authorized(struct tb_switch *sw, unsigned int val) break; } - pci_unlock_rescan_remove(); - if (!ret) { sw->authorized = val; /* Notify status change to the userspace */ diff --git a/drivers/watchdog/bd70528_wdt.c b/drivers/watchdog/bd70528_wdt.c index b0152fef4fc7..bc60e036627a 100644 --- a/drivers/watchdog/bd70528_wdt.c +++ b/drivers/watchdog/bd70528_wdt.c @@ -288,3 +288,4 @@ module_platform_driver(bd70528_wdt); MODULE_AUTHOR("Matti Vaittinen "); MODULE_DESCRIPTION("BD70528 watchdog driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:bd70528-wdt"); diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c index 9393be584e72..808eeb4779e4 100644 --- a/drivers/watchdog/cpwd.c +++ b/drivers/watchdog/cpwd.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -473,6 +474,11 @@ static long cpwd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +static long cpwd_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return cpwd_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} + static ssize_t cpwd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -497,7 +503,7 @@ static ssize_t cpwd_read(struct file *file, char __user *buffer, static const struct file_operations cpwd_fops = { .owner = THIS_MODULE, .unlocked_ioctl = cpwd_ioctl, - .compat_ioctl = compat_ptr_ioctl, + .compat_ioctl = cpwd_compat_ioctl, .open = cpwd_open, .write = cpwd_write, .read = cpwd_read, diff --git a/drivers/watchdog/imx_sc_wdt.c b/drivers/watchdog/imx_sc_wdt.c index 7ea5cf54e94a..8ed89f032ebf 100644 --- a/drivers/watchdog/imx_sc_wdt.c +++ b/drivers/watchdog/imx_sc_wdt.c @@ -99,8 +99,14 @@ static int imx_sc_wdt_set_pretimeout(struct watchdog_device *wdog, { struct arm_smccc_res res; + /* + * SCU firmware calculates pretimeout based on current time + * stamp 
instead of watchdog timeout stamp, need to convert + * the pretimeout to SCU firmware's timeout value. + */ arm_smccc_smc(IMX_SIP_TIMER, IMX_SIP_TIMER_SET_PRETIME_WDOG, - pretimeout * 1000, 0, 0, 0, 0, 0, &res); + (wdog->timeout - pretimeout) * 1000, 0, 0, 0, + 0, 0, &res); if (res.a0) return -EACCES; diff --git a/drivers/watchdog/meson_gxbb_wdt.c b/drivers/watchdog/meson_gxbb_wdt.c index d17c1a6ed723..5a9ca10fbcfa 100644 --- a/drivers/watchdog/meson_gxbb_wdt.c +++ b/drivers/watchdog/meson_gxbb_wdt.c @@ -89,8 +89,8 @@ static unsigned int meson_gxbb_wdt_get_timeleft(struct watchdog_device *wdt_dev) reg = readl(data->reg_base + GXBB_WDT_TCNT_REG); - return ((reg >> GXBB_WDT_TCNT_CNT_SHIFT) - - (reg & GXBB_WDT_TCNT_SETUP_MASK)) / 1000; + return ((reg & GXBB_WDT_TCNT_SETUP_MASK) - + (reg >> GXBB_WDT_TCNT_CNT_SHIFT)) / 1000; } static const struct watchdog_ops meson_gxbb_wdt_ops = { diff --git a/drivers/watchdog/pm8916_wdt.c b/drivers/watchdog/pm8916_wdt.c index 2d3652004e39..1213179f863c 100644 --- a/drivers/watchdog/pm8916_wdt.c +++ b/drivers/watchdog/pm8916_wdt.c @@ -163,9 +163,17 @@ static int pm8916_wdt_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq > 0) { - if (devm_request_irq(dev, irq, pm8916_wdt_isr, 0, "pm8916_wdt", - wdt)) - irq = 0; + err = devm_request_irq(dev, irq, pm8916_wdt_isr, 0, + "pm8916_wdt", wdt); + if (err) + return err; + + wdt->wdev.info = &pm8916_wdt_pt_ident; + } else { + if (irq == -EPROBE_DEFER) + return -EPROBE_DEFER; + + wdt->wdev.info = &pm8916_wdt_ident; } /* Configure watchdog to hard-reset mode */ @@ -177,7 +185,6 @@ static int pm8916_wdt_probe(struct platform_device *pdev) return err; } - wdt->wdev.info = (irq > 0) ? 
&pm8916_wdt_pt_ident : &pm8916_wdt_ident, wdt->wdev.ops = &pm8916_wdt_ops, wdt->wdev.parent = dev; wdt->wdev.min_timeout = PM8916_WDT_MIN_TIMEOUT; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index cc12772d0a4d..497f979018c2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -803,7 +803,12 @@ success: continue; if (cookie->inodes[i]) { - afs_vnode_commit_status(&fc, AFS_FS_I(cookie->inodes[i]), + struct afs_vnode *iv = AFS_FS_I(cookie->inodes[i]); + + if (test_bit(AFS_VNODE_UNSET, &iv->flags)) + continue; + + afs_vnode_commit_status(&fc, iv, scb->cb_break, NULL, scb); continue; } diff --git a/fs/aio.c b/fs/aio.c index 01e0fb9ae45a..0d9a559d488c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2179,7 +2179,7 @@ SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id, #ifdef CONFIG_COMPAT struct __compat_aio_sigset { - compat_sigset_t __user *sigmask; + compat_uptr_t sigmask; compat_size_t sigsetsize; }; @@ -2193,7 +2193,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, struct old_timespec32 __user *, timeout, const struct __compat_aio_sigset __user *, usig) { - struct __compat_aio_sigset ksig = { NULL, }; + struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; @@ -2204,7 +2204,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize); + ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; @@ -2228,7 +2228,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, struct __kernel_timespec __user *, timeout, const struct __compat_aio_sigset __user *, usig) { - struct __compat_aio_sigset ksig = { NULL, }; + struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; @@ -2239,7 +2239,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize); + ret = 
set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 2866fabf497f..91f5787dae7c 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -459,9 +459,10 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, */ how &= ~AUTOFS_EXP_LEAVES; found = should_expire(expired, mnt, timeout, how); - if (!found || found != expired) - /* Something has changed, continue */ + if (found != expired) { // something has changed, continue + dput(found); goto next; + } if (expired != dentry) dput(dentry); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c3f386b7cc0b..015910079e73 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -474,6 +474,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) u64 start = async_chunk->start; u64 end = async_chunk->end; u64 actual_end; + u64 i_size; int ret = 0; struct page **pages = NULL; unsigned long nr_pages; @@ -488,7 +489,19 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, SZ_16K); - actual_end = min_t(u64, i_size_read(inode), end + 1); + /* + * We need to save i_size before now because it could change in between + * us evaluating the size and assigning it. This is because we lock and + * unlock the page in truncate and fallocate, and then modify the i_size + * later on. + * + * The barriers are to emulate READ_ONCE, remove that once i_size_read + * does that for us. 
+ */ + barrier(); + i_size = i_size_read(inode); + barrier(); + actual_end = min_t(u64, i_size, end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; @@ -9731,6 +9744,18 @@ out_fail: commit_transaction = true; } if (commit_transaction) { + /* + * We may have set commit_transaction when logging the new name + * in the destination root, in which case we left the source + * root context in the list of log contexts. So make sure we + * remove it to avoid invalid memory accesses, since the context + * was allocated in our stack frame. + */ + if (sync_log_root) { + mutex_lock(&root->log_mutex); + list_del_init(&ctx_root.list); + mutex_unlock(&root->log_mutex); + } ret = btrfs_commit_transaction(trans); } else { int ret2; @@ -9744,6 +9769,9 @@ out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); + ASSERT(list_empty(&ctx_root.list)); + ASSERT(list_empty(&ctx_dest.list)); + return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7c145a41decd..23272d9154f3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4195,9 +4195,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - btrfs_warn(root->fs_info, - "START_SYNC ioctl is deprecated and will be removed in kernel 5.7"); - trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) @@ -4225,9 +4222,6 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, { u64 transid; - btrfs_warn(fs_info, - "WAIT_SYNC ioctl is deprecated and will be removed in kernel 5.7"); - if (argp) { if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 98dc092a905e..e8a4b0ebe97f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -893,6 +893,15 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, while (ticket->bytes > 0 && ticket->error == 0) { ret = 
prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); if (ret) { + /* + * Delete us from the list. After we unlock the space + * info, we don't want the async reclaim job to reserve + * space for this ticket. If that would happen, then the + * ticket's task would not know that space was reserved + * despite getting an error, resulting in a space leak + * (bytes_may_use counter of our space_info). + */ + list_del_init(&ticket->list); ticket->error = -EINTR; break; } @@ -945,12 +954,24 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); ret = ticket->error; if (ticket->bytes || ticket->error) { + /* + * Need to delete here for priority tickets. For regular tickets + * either the async reclaim job deletes the ticket from the list + * or we delete it ourselves at wait_reserve_ticket(). + */ list_del_init(&ticket->list); if (!ret) ret = -ENOSPC; } spin_unlock(&space_info->lock); ASSERT(list_empty(&ticket->list)); + /* + * Check that we can't have an error set if the reservation succeeded, + * as that would confuse tasks and lead them to error out without + * releasing reserved space (if an error happens the expectation is that + * space wasn't reserved at all). 
+ */ + ASSERT(!(ticket->bytes == 0 && ticket->error)); return ret; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43e488f5d063..076d5b8014fb 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -686,9 +686,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot, static int check_dev_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { - struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dev_item *ditem; - u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK); if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { dev_item_err(leaf, slot, @@ -696,12 +694,6 @@ static int check_dev_item(struct extent_buffer *leaf, key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } - if (key->offset > max_devid) { - dev_item_err(leaf, slot, - "invalid devid: has=%llu expect=[0, %llu]", - key->offset, max_devid); - return -EUCLEAN; - } ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (btrfs_device_id(leaf, ditem) != key->offset) { dev_item_err(leaf, slot, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bdfe4493e43a..e04409f85063 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4967,6 +4967,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; + devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); } else { btrfs_err(info, "invalid chunk type 0x%llx requested", type); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bd77adb64bfd..8de633964dc3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -753,6 +753,9 @@ static void ceph_aio_complete(struct inode *inode, if (!atomic_dec_and_test(&aio_req->pending_reqs)) return; + if (aio_req->iocb->ki_flags & IOCB_DIRECT) + inode_dio_end(inode); + ret = aio_req->error; if (!ret) ret = aio_req->total_len; @@ -1091,6 +1094,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter 
*iter, CEPH_CAP_FILE_RD); list_splice(&aio_req->osd_reqs, &osd_reqs); + inode_dio_begin(inode); while (!list_empty(&osd_reqs)) { req = list_first_entry(&osd_reqs, struct ceph_osd_request, @@ -1264,14 +1268,24 @@ again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_start_io_direct(inode); + else + ceph_start_io_read(inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); - if (ret < 0) + if (ret < 0) { + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); return ret; + } if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_flags & IOCB_DIRECT) || @@ -1283,16 +1297,12 @@ again: if (ci->i_inline_version == CEPH_INLINE_NONE) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { - ceph_start_io_direct(inode); ret = ceph_direct_read_write(iocb, to, NULL, NULL); - ceph_end_io_direct(inode); if (ret >= 0 && ret < len) retry_op = CHECK_EOF; } else { - ceph_start_io_read(inode); ret = ceph_sync_read(iocb, to, &retry_op); - ceph_end_io_read(inode); } } else { retry_op = READ_INLINE; @@ -1303,11 +1313,10 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); - ceph_start_io_read(inode); ret = generic_file_read_iter(iocb, to); - ceph_end_io_read(inode); ceph_del_rw_context(fi, &rw_ctx); } + dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); if (pinned_page) { @@ -1315,6 +1324,12 @@ again: pinned_page = NULL; } ceph_put_cap_refs(ci, got); + + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); + if (retry_op > HAVE_RETRIED && ret >= 0) { int statret; struct page *page 
= NULL; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index ea735d59c36e..0abfde6d0b05 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -838,6 +838,7 @@ struct create_durable_handle_reconnect_v2 { struct create_context ccontext; __u8 Name[8]; struct durable_reconnect_context_v2 dcontext; + __u8 Pad[4]; } __packed; /* See MS-SMB2 2.2.13.2.5 */ diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index dc5dbf6a81d7..cb61467478ca 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -101,7 +101,7 @@ static int create_link(struct config_item *parent_item, } target_sd->s_links++; spin_unlock(&configfs_dirent_lock); - ret = configfs_get_target_path(item, item, body); + ret = configfs_get_target_path(parent_item, item, body); if (!ret) ret = configfs_create_link(target_sd, parent_item->ci_dentry, dentry, body); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 18426f4855f1..e23752d9a79f 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -128,13 +128,20 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, struct inode *inode) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); struct dentry *lower_dir_dentry; + struct inode *lower_dir_inode; int rc; - dget(lower_dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); + lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); + lower_dir_inode = d_inode(lower_dir_dentry); + inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + dget(lower_dentry); // don't even try to make the lower negative + if (lower_dentry->d_parent != lower_dir_dentry) + rc = -EINVAL; + else if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -142,10 +149,11 @@ static int ecryptfs_do_unlink(struct 
inode *dir, struct dentry *dentry, fsstack_copy_attr_times(dir, lower_dir_inode); set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); inode->i_ctime = dir->i_ctime; - d_drop(dentry); out_unlock: - unlock_dir(lower_dir_dentry); dput(lower_dentry); + inode_unlock(lower_dir_inode); + if (!rc) + d_drop(dentry); return rc; } @@ -311,9 +319,9 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry) { - struct inode *inode, *lower_inode = d_inode(lower_dentry); + struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); + struct inode *inode, *lower_inode; struct ecryptfs_dentry_info *dentry_info; - struct vfsmount *lower_mnt; int rc = 0; dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); @@ -322,16 +330,23 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, return ERR_PTR(-ENOMEM); } - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); fsstack_copy_attr_atime(d_inode(dentry->d_parent), - d_inode(lower_dentry->d_parent)); + d_inode(path->dentry)); BUG_ON(!d_count(lower_dentry)); ecryptfs_set_dentry_private(dentry, dentry_info); - dentry_info->lower_path.mnt = lower_mnt; + dentry_info->lower_path.mnt = mntget(path->mnt); dentry_info->lower_path.dentry = lower_dentry; - if (d_really_is_negative(lower_dentry)) { + /* + * negative dentry can go positive under us here - its parent is not + * locked. That's OK and that could happen just as we return from + * ecryptfs_lookup() anyway. Just need to be careful and fetch + * ->d_inode only once - it's not stable here. 
+ */ + lower_inode = READ_ONCE(lower_dentry->d_inode); + + if (!lower_inode) { /* We want to add because we couldn't find in lower */ d_add(dentry, NULL); return NULL; @@ -512,22 +527,30 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) { struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct inode *lower_dir_inode; int rc; lower_dentry = ecryptfs_dentry_to_lower(dentry); - dget(dentry); - lower_dir_dentry = lock_parent(lower_dentry); - dget(lower_dentry); - rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); - dput(lower_dentry); - if (!rc && d_really_is_positive(dentry)) + lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); + lower_dir_inode = d_inode(lower_dir_dentry); + + inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + dget(lower_dentry); // don't even try to make the lower negative + if (lower_dentry->d_parent != lower_dir_dentry) + rc = -EINVAL; + else if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_rmdir(lower_dir_inode, lower_dentry); + if (!rc) { clear_nlink(d_inode(dentry)); - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); - unlock_dir(lower_dir_dentry); + fsstack_copy_attr_times(dir, lower_dir_inode); + set_nlink(dir, lower_dir_inode->i_nlink); + } + dput(lower_dentry); + inode_unlock(lower_dir_inode); if (!rc) d_drop(dentry); - dput(dentry); return rc; } @@ -565,20 +588,22 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dentry; struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; - struct dentry *trap = NULL; + struct dentry *trap; struct inode *target_inode; if (flags) return -EINVAL; + lower_old_dir_dentry = ecryptfs_dentry_to_lower(old_dentry->d_parent); + lower_new_dir_dentry = ecryptfs_dentry_to_lower(new_dentry->d_parent); + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); - 
dget(lower_old_dentry); - dget(lower_new_dentry); - lower_old_dir_dentry = dget_parent(lower_old_dentry); - lower_new_dir_dentry = dget_parent(lower_new_dentry); + target_inode = d_inode(new_dentry); + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + dget(lower_new_dentry); rc = -EINVAL; if (lower_old_dentry->d_parent != lower_old_dir_dentry) goto out_lock; @@ -606,11 +631,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir != old_dir) fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); out_lock: - unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); - dput(lower_new_dir_dentry); - dput(lower_old_dir_dentry); dput(lower_new_dentry); - dput(lower_old_dentry); + unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); return rc; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 09bc68708d28..2dd55b172d57 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -519,26 +519,33 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, * inode is actually connected to the parent. */ err = exportfs_get_name(mnt, target_dir, nbuf, result); - if (!err) { - inode_lock(target_dir->d_inode); - nresult = lookup_one_len(nbuf, target_dir, - strlen(nbuf)); - inode_unlock(target_dir->d_inode); - if (!IS_ERR(nresult)) { - if (nresult->d_inode) { - dput(result); - result = nresult; - } else - dput(nresult); - } + if (err) { + dput(target_dir); + goto err_result; } + inode_lock(target_dir->d_inode); + nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); + if (!IS_ERR(nresult)) { + if (unlikely(nresult->d_inode != result->d_inode)) { + dput(nresult); + nresult = ERR_PTR(-ESTALE); + } + } + inode_unlock(target_dir->d_inode); /* * At this point we are done with the parent, but it's pinned * by the child dentry anyway. 
*/ dput(target_dir); + if (IS_ERR(nresult)) { + err = PTR_ERR(nresult); + goto err_result; + } + dput(result); + result = nresult; + /* * And finally make sure the dentry is actually acceptable * to NFSD. diff --git a/fs/io_uring.c b/fs/io_uring.c index f9a38998f2fc..2c819c3c855d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -326,6 +326,7 @@ struct io_kiocb { #define REQ_F_TIMEOUT 1024 /* timeout request */ #define REQ_F_ISREG 2048 /* regular file */ #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ +#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ u64 user_data; u32 result; u32 sequence; @@ -453,9 +454,13 @@ static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); - if (req && !__io_sequence_defer(ctx, req)) { - list_del_init(&req->list); - return req; + if (req) { + if (req->flags & REQ_F_TIMEOUT_NOSEQ) + return NULL; + if (!__io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } } return NULL; @@ -1225,7 +1230,7 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, } } - return 0; + return len; } static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, @@ -1941,18 +1946,24 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; + req->flags |= REQ_F_TIMEOUT; + /* * sqe->off holds how many events that need to occur for this - * timeout event to be satisfied. + * timeout event to be satisfied. If it isn't set, then this is + * a pure timeout request, sequence isn't used. 
*/ count = READ_ONCE(sqe->off); - if (!count) - count = 1; + if (!count) { + req->flags |= REQ_F_TIMEOUT_NOSEQ; + spin_lock_irq(&ctx->completion_lock); + entry = ctx->timeout_list.prev; + goto add; + } req->sequence = ctx->cached_sq_head + count - 1; /* reuse it to store the count */ req->submit.sequence = count; - req->flags |= REQ_F_TIMEOUT; /* * Insertion sort, ensuring the first entry in the list is always @@ -1964,6 +1975,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned nxt_sq_head; long long tmp, tmp_nxt; + if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) + continue; + /* * Since cached_sq_head + count - 1 can overflow, use type long * long to store it. @@ -1990,6 +2004,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) nxt->sequence++; } req->sequence -= span; +add: list_add(&req->list, entry); spin_unlock_irq(&ctx->completion_lock); @@ -2283,6 +2298,7 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe) switch (op) { case IORING_OP_NOP: case IORING_OP_POLL_REMOVE: + case IORING_OP_TIMEOUT: return false; default: return true; diff --git a/fs/namespace.c b/fs/namespace.c index fe0e9e1410fe..2adfe7b166a3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2478,8 +2478,10 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * time64_to_tm(sb->s_time_max, 0, &tm); - pr_warn("Mounted %s file system at %s supports timestamps until %04ld (0x%llx)\n", - sb->s_type->name, mntpath, + pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n", + sb->s_type->name, + is_mounted(mnt) ? 
"remounted" : "mounted", + mntpath, tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); @@ -2764,14 +2766,11 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, if (IS_ERR(mnt)) return PTR_ERR(mnt); - error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); - if (error < 0) { - mntput(mnt); - return error; - } - mnt_warn_timestamp_expiry(mountpoint, mnt); + error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + if (error < 0) + mntput(mnt); return error; } diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index e94b19782c92..ce4103208619 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -25,13 +25,6 @@ static __always_inline int __arch_get_clock_mode(struct timekeeper *tk) } #endif /* __arch_get_clock_mode */ -#ifndef __arch_use_vsyscall -static __always_inline int __arch_use_vsyscall(struct vdso_data *vdata) -{ - return 1; -} -#endif /* __arch_use_vsyscall */ - #ifndef __arch_update_vsyscall static __always_inline void __arch_update_vsyscall(struct vdso_data *vdata, struct timekeeper *tk) diff --git a/include/linux/can/core.h b/include/linux/can/core.h index 8339071ab08b..e20a0cd09ba5 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -65,5 +65,6 @@ extern void can_rx_unregister(struct net *net, struct net_device *dev, void *data); extern int can_send(struct sk_buff *skb, int loop); +void can_sock_destruct(struct sock *sk); #endif /* !_CAN_CORE_H */ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d0633ebdaa9c..bc6c879bd110 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -59,6 +59,11 @@ extern ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_tsx_async_abort(struct device *dev, + struct device_attribute *attr, + char 
*buf); +extern ssize_t cpu_show_itlb_multihit(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, @@ -213,28 +218,7 @@ static inline int cpuhp_smt_enable(void) { return 0; } static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } #endif -/* - * These are used for a global "mitigations=" cmdline option for toggling - * optional CPU mitigations. - */ -enum cpu_mitigations { - CPU_MITIGATIONS_OFF, - CPU_MITIGATIONS_AUTO, - CPU_MITIGATIONS_AUTO_NOSMT, -}; - -extern enum cpu_mitigations cpu_mitigations; - -/* mitigations=off */ -static inline bool cpu_mitigations_off(void) -{ - return cpu_mitigations == CPU_MITIGATIONS_OFF; -} - -/* mitigations=auto,nosmt */ -static inline bool cpu_mitigations_auto_nosmt(void) -{ - return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; -} +extern bool cpu_mitigations_off(void); +extern bool cpu_mitigations_auto_nosmt(void); #endif /* _LINUX_CPU_H_ */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 719fc3e15ea4..d41c521a39da 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -966,6 +966,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); bool kvm_is_reserved_pfn(kvm_pfn_t pfn); +bool kvm_is_zone_device_pfn(kvm_pfn_t pfn); struct kvm_irq_ack_notifier { struct hlist_node link; @@ -1382,4 +1383,10 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ +typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data); + +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, + uintptr_t data, const char *name, + struct task_struct **thread_ptr); + #endif diff --git a/include/linux/memory.h b/include/linux/memory.h index 0ebb105eb261..4c75dae8dd29 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -119,6 +119,7 
@@ extern struct memory_block *find_memory_block(struct mem_section *); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); +extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<sk_v6_rcv_saddr, sk->sk_v6_daddr); ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s\n", + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->state)) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b558ea88b766..ae37fd4d194a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -421,10 +421,11 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_FAILED, /* u8 0 or 1 */ + DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, /* u64 */ + DEVLINK_ATTR_NETNS_FD, /* u32 */ DEVLINK_ATTR_NETNS_PID, /* u32 */ DEVLINK_ATTR_NETNS_ID, /* u32 */ - /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 59e89a1bc3bb..9dc9d0079e98 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -31,13 +31,16 @@ #define PTP_ENABLE_FEATURE (1<<0) #define PTP_RISING_EDGE (1<<1) #define PTP_FALLING_EDGE (1<<2) +#define PTP_STRICT_FLAGS (1<<3) +#define PTP_EXTTS_EDGES (PTP_RISING_EDGE | PTP_FALLING_EDGE) /* * flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl. */ #define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \ PTP_RISING_EDGE | \ - PTP_FALLING_EDGE) + PTP_FALLING_EDGE | \ + PTP_STRICT_FLAGS) /* * flag fields valid for the original PTP_EXTTS_REQUEST ioctl. 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1f31c2f1e6fc..4508d5e0cf69 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -351,12 +351,12 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d->d_sb->s_dev; watch->ino = d_backing_inode(d)->i_ino; } + inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 080561bb8a4b..ef4242e5d4bc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2119,11 +2119,12 @@ int cgroup_do_get_tree(struct fs_context *fc) nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); - fc->root = nsdentry; if (IS_ERR(nsdentry)) { - ret = PTR_ERR(nsdentry); deactivate_locked_super(sb); + ret = PTR_ERR(nsdentry); + nsdentry = NULL; } + fc->root = nsdentry; } if (!ctx->kfc.new_sb_created) diff --git a/kernel/cpu.c b/kernel/cpu.c index fc28e17940e0..e2cad3ee2ead 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2373,7 +2373,18 @@ void __init boot_cpu_hotplug_init(void) this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; +/* + * These are used for a global "mitigations=" cmdline option for toggling + * optional CPU mitigations. 
+ */ +enum cpu_mitigations { + CPU_MITIGATIONS_OFF, + CPU_MITIGATIONS_AUTO, + CPU_MITIGATIONS_AUTO_NOSMT, +}; + +static enum cpu_mitigations cpu_mitigations __ro_after_init = + CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) { @@ -2390,3 +2401,17 @@ static int __init mitigations_parse_cmdline(char *arg) return 0; } early_param("mitigations", mitigations_parse_cmdline); + +/* mitigations=off */ +bool cpu_mitigations_off(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_OFF; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_off); + +/* mitigations=auto,nosmt */ +bool cpu_mitigations_auto_nosmt(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); diff --git a/kernel/events/core.c b/kernel/events/core.c index aec8dba2bea4..00a014670ed0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1031,7 +1031,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { } -void +static inline void perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } @@ -10535,6 +10535,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + /* + * Disallow uncore-cgroup events, they don't make sense as the cgroup will + * be different on other CPUs in the uncore mask. + */ + if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + err = -EINVAL; + goto err_pmu; + } + if (event->attr.aux_output && !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { err = -EOPNOTSUPP; @@ -11323,8 +11332,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, int err; /* - * Get the target context (task or percpu): + * Grouping is not supported for kernel events, neither is 'AUX', + * make sure the caller's intentions are adjusted. 
*/ + if (attr->aux_output) + return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); @@ -11336,6 +11348,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); @@ -11787,7 +11802,7 @@ inherit_event(struct perf_event *parent_event, GFP_KERNEL); if (!child_ctx->task_ctx_data) { free_event(child_event); - return NULL; + return ERR_PTR(-ENOMEM); } } @@ -11890,7 +11905,7 @@ static int inherit_group(struct perf_event *parent_event, if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); - if (sub->aux_event == parent_event && + if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 132672b74e4b..dd822fd8a7d5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); * @type: Type of irqchip_fwnode. See linux/irqdomain.h * @name: Optional user provided domain name * @id: Optional user provided id if name != NULL - * @data: Optional user-provided data + * @pa: Optional user-provided physical address * * Allocate a struct irqchip_fwid, and return a poiner to the embedded * fwnode_handle (or NULL on failure). 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd05a378631a..0f2eb3629070 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1073,6 +1073,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) task_rq_unlock(rq, p, &rf); } +#ifdef CONFIG_UCLAMP_TASK_GROUP static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css, unsigned int clamps) @@ -1091,7 +1092,6 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css, css_task_iter_end(&it); } -#ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css); static void uclamp_update_root_tg(void) { @@ -3929,13 +3929,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: +#ifdef CONFIG_SMP /* - * Ensure that we put DL/RT tasks before the pick loop, such that they - * can PULL higher prio tasks when we lower the RQ 'priority'. + * We must do the balancing pass before put_next_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. 
*/ - prev->sched_class->put_prev_task(rq, prev, rf); - if (!rq->nr_running) - newidle_balance(rq, rf); + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); for_each_class(class) { p = class->pick_next_task(rq, NULL, NULL); @@ -6201,7 +6210,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) for_each_class(class) { next = class->pick_next_task(rq, NULL, NULL); if (next) { - next->sched_class->put_prev_task(rq, next, NULL); + next->sched_class->put_prev_task(rq, next); return next; } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2dc48720f189..a8a08030a8f7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1691,6 +1691,22 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. 
+ */ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1758,45 +1774,28 @@ static struct task_struct * pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; + struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; - struct dl_rq *dl_rq; WARN_ON_ONCE(prev || rf); - dl_rq = &rq->dl; - - if (unlikely(!dl_rq->dl_nr_running)) + if (!sched_dl_runnable(rq)) return NULL; dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); - p = dl_task_of(dl_se); - set_next_task_dl(rq, p); - return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet started the picking loop. 
- */ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - } } /* @@ -2442,6 +2441,7 @@ const struct sched_class dl_sched_class = { .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP + .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 682a754ea3e1..22a2fed29054 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6570,6 +6570,15 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } + +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (rq->nr_running) + return 1; + + return newidle_balance(rq, rf) != 0; +} #endif /* CONFIG_SMP */ static unsigned long wakeup_gran(struct sched_entity *se) @@ -6746,7 +6755,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - if (!cfs_rq->nr_running) + if (!sched_fair_runnable(rq)) goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6884,7 +6893,7 @@ idle: /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@ -10414,11 +10423,11 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, - .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP + .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8dad5aa600ea..f65ef1e2f204 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -365,6 +365,12 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); 
/* IDLE tasks as never migrated */ } + +static int +balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return WARN_ON_ONCE(1); +} #endif /* @@ -375,7 +381,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } @@ -460,6 +466,7 @@ const struct sched_class idle_sched_class = { .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP + .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ebaa4e619684..9b8adc01be3d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1469,6 +1469,22 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. 
+ */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1552,21 +1568,18 @@ static struct task_struct * pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *p; - struct rt_rq *rt_rq = &rq->rt; WARN_ON_ONCE(prev || rf); - if (!rt_rq->rt_queued) + if (!sched_rt_runnable(rq)) return NULL; p = _pick_next_task_rt(rq); - set_next_task_rt(rq, p); - return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); @@ -1578,18 +1591,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_fla */ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet started the picking loop. 
- */ - rq_unpin_lock(rq, rf); - pull_rt_task(rq); - rq_repin_lock(rq, rf); - } } #ifdef CONFIG_SMP @@ -2366,8 +2367,8 @@ const struct sched_class rt_sched_class = { .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP + .balance = balance_rt, .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0db2c1b3361e..c8870c5bd7df 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1727,10 +1727,11 @@ struct sched_class { struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -1773,7 +1774,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev, NULL); + prev->sched_class->put_prev_task(rq, prev); } static inline void set_next_task(struct rq *rq, struct task_struct *next) @@ -1787,8 +1788,12 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) #else #define sched_class_highest (&dl_sched_class) #endif + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = class->next) + #define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) + for_class_range(class, sched_class_highest, NULL) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -1796,6 +1801,25 @@ extern 
const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; +static inline bool sched_stop_runnable(struct rq *rq) +{ + return rq->stop && task_on_rq_queued(rq->stop); +} + +static inline bool sched_dl_runnable(struct rq *rq) +{ + return rq->dl.dl_nr_running > 0; +} + +static inline bool sched_rt_runnable(struct rq *rq) +{ + return rq->rt.rt_queued > 0; +} + +static inline bool sched_fair_runnable(struct rq *rq) +{ + return rq->cfs.nr_running > 0; +} #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7e1cee4e65b2..c0640739e05e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -15,6 +15,12 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } + +static int +balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return sched_stop_runnable(rq); +} #endif /* CONFIG_SMP */ static void @@ -31,16 +37,13 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop) static struct task_struct * pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *stop = rq->stop; - WARN_ON_ONCE(prev || rf); - if (!stop || !task_on_rq_queued(stop)) + if (!sched_stop_runnable(rq)) return NULL; - set_next_task_stop(rq, stop); - - return stop; + set_next_task_stop(rq, rq->stop); + return rq->stop; } static void @@ -60,7 +63,7 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. 
*/ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *curr = rq->curr; u64 delta_exec; @@ -129,6 +132,7 @@ const struct sched_class stop_sched_class = { .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP + .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/signal.c b/kernel/signal.c index c4da1ef56fdf..bcd46f547db3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2205,8 +2205,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t */ preempt_disable(); read_unlock(&tasklist_lock); - preempt_enable_no_resched(); cgroup_enter_frozen(); + preempt_enable_no_resched(); freezable_schedule(); cgroup_leave_frozen(true); } else { diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 6d1f68b7e528..c9ea7eb2cb1a 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -141,7 +141,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, struct stacktrace_cookie c = { .store = store, .size = size, - .skip = skipnr + 1, + /* skip this function if they are tracing us */ + .skip = skipnr + !!(current == tsk), }; if (!try_get_task_stack(tsk)) @@ -298,7 +299,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, struct stack_trace trace = { .entries = store, .max_entries = size, - .skip = skipnr + 1, + /* skip this function if they are tracing us */ + .skip = skipnr + !!(current == task), }; save_stack_trace_tsk(task, &trace); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 65eb796610dc..069ca78fb0bf 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -771,7 +771,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, /* fill PPS status fields */ pps_fill_timex(txc); - txc->time.tv_sec = (time_t)ts->tv_sec; + txc->time.tv_sec = 
ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 4bc37ac3bb05..5ee0f7709410 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -110,8 +110,7 @@ void update_vsyscall(struct timekeeper *tk) nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); - if (__arch_use_vsyscall(vdata)) - update_vdso_data(vdata, tk); + update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); @@ -124,10 +123,8 @@ void update_vsyscall_tz(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); - if (__arch_use_vsyscall(vdata)) { - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; - } + vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; + vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_data(vdata); } diff --git a/lib/Kconfig b/lib/Kconfig index 183f92a297ca..3321d04dfa5a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -447,7 +447,6 @@ config ASSOCIATIVE_ARRAY config HAS_IOMEM bool depends on !NO_IOMEM - select GENERIC_IO default y config HAS_IOPORT_MAP diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index 08c3c8049998..156f26fdc4c9 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -1146,6 +1146,7 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) if (DEC_IS_DYNALLOC(s->dict.mode)) { if (s->dict.allocated < s->dict.size) { + s->dict.allocated = s->dict.size; vfree(s->dict.buf); s->dict.buf = vmalloc(s->dict.size); if (s->dict.buf == NULL) { diff --git a/mm/debug.c b/mm/debug.c index 8345bb6e4769..0461df1207cb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -67,28 +67,31 @@ void __dump_page(struct page *page, const char *reason) */ mapcount = PageSlab(page) ? 
0 : page_mapcount(page); - pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx", - page, page_ref_count(page), mapcount, - page->mapping, page_to_pgoff(page)); if (PageCompound(page)) - pr_cont(" compound_mapcount: %d", compound_mapcount(page)); - pr_cont("\n"); - if (PageAnon(page)) - pr_warn("anon "); - else if (PageKsm(page)) - pr_warn("ksm "); + pr_warn("page:%px refcount:%d mapcount:%d mapping:%px " + "index:%#lx compound_mapcount: %d\n", + page, page_ref_count(page), mapcount, + page->mapping, page_to_pgoff(page), + compound_mapcount(page)); + else + pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx\n", + page, page_ref_count(page), mapcount, + page->mapping, page_to_pgoff(page)); + if (PageKsm(page)) + pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags); + else if (PageAnon(page)) + pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags); else if (mapping) { - pr_warn("%ps ", mapping->a_ops); if (mapping->host && mapping->host->i_dentry.first) { struct dentry *dentry; dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); - pr_warn("name:\"%pd\" ", dentry); - } + pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); + } else + pr_warn("%ps\n", mapping->a_ops); + pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); } BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); - hex_only: print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f1930fa0b445..2ac38bdc18a1 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -196,7 +196,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); - if (!css_tryget_online(&h_cg->css)) { + if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c 
index f05d27b7183d..a8a57bebb5fa 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1602,17 +1602,6 @@ static void collapse_file(struct mm_struct *mm, result = SCAN_FAIL; goto xa_unlocked; } - } else if (!PageUptodate(page)) { - xas_unlock_irq(&xas); - wait_on_page_locked(page); - if (!trylock_page(page)) { - result = SCAN_PAGE_LOCK; - goto xa_unlocked; - } - get_page(page); - } else if (PageDirty(page)) { - result = SCAN_FAIL; - goto xa_locked; } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); @@ -1627,7 +1616,12 @@ static void collapse_file(struct mm_struct *mm, * without racing with truncate. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageUptodate(page), page); + + /* make sure the page is up to date */ + if (unlikely(!PageUptodate(page))) { + result = SCAN_FAIL; + goto out_unlock; + } /* * If file was truncated then extended, or hole-punched, before @@ -1643,6 +1637,16 @@ static void collapse_file(struct mm_struct *mm, goto out_unlock; } + if (!is_shmem && PageDirty(page)) { + /* + * khugepaged only works on read-only fd, so this + * page is dirty because it hasn't been flushed + * since first write. 
+ */ + result = SCAN_FAIL; + goto out_unlock; + } + if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; diff --git a/mm/madvise.c b/mm/madvise.c index 2be9f3fdb05e..94c343b4c968 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -363,8 +363,12 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, ClearPageReferenced(page); test_and_clear_page_young(page); if (pageout) { - if (!isolate_lru_page(page)) - list_add(&page->lru, &page_list); + if (!isolate_lru_page(page)) { + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add(&page->lru, &page_list); + } } else deactivate_page(page); huge_unlock: @@ -441,8 +445,12 @@ regular_page: ClearPageReferenced(page); test_and_clear_page_young(page); if (pageout) { - if (!isolate_lru_page(page)) - list_add(&page->lru, &page_list); + if (!isolate_lru_page(page)) { + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add(&page->lru, &page_list); + } } else deactivate_page(page); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 37592dd7ae32..46ad252e6d6a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -960,7 +960,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) if (unlikely(!memcg)) memcg = root_mem_cgroup; } - } while (!css_tryget_online(&memcg->css)); + } while (!css_tryget(&memcg->css)); rcu_read_unlock(); return memcg; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 07e5c67f48a8..3b62a9ff8ea0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1646,6 +1646,18 @@ static int check_cpu_on_node(pg_data_t *pgdat) return 0; } +static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) +{ + int nid = *(int *)arg; + + /* + * If a memory block belongs to multiple nodes, the stored nid is not + * reliable. However, such blocks are always online (e.g., cannot get + * offlined) and, therefore, are still spanned by the node. + */ + return mem->nid == nid ? 
-EEXIST : 0; +} + /** * try_offline_node * @nid: the node ID @@ -1658,25 +1670,24 @@ static int check_cpu_on_node(pg_data_t *pgdat) void try_offline_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - unsigned long start_pfn = pgdat->node_start_pfn; - unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; - unsigned long pfn; + int rc; - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - - if (!present_section_nr(section_nr)) - continue; - - if (pfn_to_nid(pfn) != nid) - continue; - - /* - * some memory sections of this node are not removed, and we - * can't offline node now. - */ + /* + * If the node still spans pages (especially ZONE_DEVICE), don't + * offline it. A node spans memory after move_pfn_range_to_zone(), + * e.g., after the memory block was onlined. + */ + if (pgdat->node_spanned_pages) + return; + + /* + * Especially offline memory blocks might not be spanned by the + * node. They will get spanned by the node once they get onlined. + * However, they link to the node in sysfs and can get onlined later. + */ + rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb); + if (rc) return; - } if (check_cpu_on_node(pgdat)) return; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ae967bcf954..e08c94170ae4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -672,7 +672,9 @@ static const struct mm_walk_ops queue_pages_walk_ops = { * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * 0 - queue pages successfully or no misplaced page. - * -EIO - there is misplaced page and only MPOL_MF_STRICT was specified. + * errno - i.e. 
misplaced pages with MPOL_MF_STRICT specified (-EIO) or + * memory range specified by nodemask and maxnode points outside + * your accessible address space (-EFAULT) */ static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, @@ -1286,7 +1288,7 @@ static long do_mbind(unsigned long start, unsigned long len, flags | MPOL_MF_INVERT, &pagelist); if (ret < 0) { - err = -EIO; + err = ret; goto up_out; } @@ -1305,10 +1307,12 @@ static long do_mbind(unsigned long start, unsigned long len, if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT))) err = -EIO; - } else - putback_movable_pages(&pagelist); - + } else { up_out: + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); + } + up_write(&mm->mmap_sem); mpol_out: mpol_put(new); diff --git a/mm/page_io.c b/mm/page_io.c index 24ee600f9131..60a66a58b9bf 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -73,6 +73,7 @@ static void swap_slot_free_notify(struct page *page) { struct swap_info_struct *sis; struct gendisk *disk; + swp_entry_t entry; /* * There is no guarantee that the page is in swap cache - the software @@ -104,11 +105,10 @@ static void swap_slot_free_notify(struct page *page) * we again wish to reclaim it. */ disk = sis->bdev->bd_disk; - if (disk->fops->swap_slot_free_notify) { - swp_entry_t entry; + entry.val = page_private(page); + if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) { unsigned long offset; - entry.val = page_private(page); offset = swp_offset(entry); SetPageDirty(page); diff --git a/mm/slub.c b/mm/slub.c index b25c807a111f..e72e802fc569 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1433,12 +1433,15 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *old_tail = *tail ? 
*tail : *head; int rsize; - if (slab_want_init_on_free(s)) { - void *p = NULL; + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; - do { - object = next; - next = get_freepointer(s, object); + do { + object = next; + next = get_freepointer(s, object); + + if (slab_want_init_on_free(s)) { /* * Clear the object and the metadata, but don't touch * the redzone. @@ -1448,29 +1451,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, : 0; memset((char *)object + s->inuse, 0, s->size - s->inuse - rsize); - set_freepointer(s, object, p); - p = object; - } while (object != old_tail); - } -/* - * Compiler cannot detect this function can be removed if slab_free_hook() - * evaluates to nothing. Thus, catch all relevant config debug options here. - */ -#if defined(CONFIG_LOCKDEP) || \ - defined(CONFIG_DEBUG_KMEMLEAK) || \ - defined(CONFIG_DEBUG_OBJECTS_FREE) || \ - defined(CONFIG_KASAN) - - next = *head; - - /* Head and tail of the reconstructed freelist */ - *head = NULL; - *tail = NULL; - - do { - object = next; - next = get_freepointer(s, object); + } /* If object's reuse doesn't have to be delayed */ if (!slab_free_hook(s, object)) { /* Move object to the new freelist */ @@ -1485,9 +1467,6 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, *tail = NULL; return *head != NULL; -#else - return true; -#endif } static void *setup_object(struct kmem_cache *s, struct page *page, diff --git a/net/can/af_can.c b/net/can/af_can.c index 5518a7d9eed9..128d37a4c2e0 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -86,11 +86,12 @@ static atomic_t skbcounter = ATOMIC_INIT(0); /* af_can socket functions */ -static void can_sock_destruct(struct sock *sk) +void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); } +EXPORT_SYMBOL(can_sock_destruct); static const struct can_proto *can_get_proto(int protocol) { diff --git a/net/can/j1939/main.c 
b/net/can/j1939/main.c index def2f813ffce..137054bff9ec 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -51,6 +51,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) if (!skb) return; + j1939_priv_get(priv); can_skb_set_owner(skb, iskb->sk); /* get a pointer to the header of the skb @@ -104,6 +105,7 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) j1939_simple_recv(priv, skb); j1939_sk_recv(priv, skb); done: + j1939_priv_put(priv); kfree_skb(skb); } @@ -150,6 +152,10 @@ static void __j1939_priv_release(struct kref *kref) netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv); + WARN_ON_ONCE(!list_empty(&priv->active_session_list)); + WARN_ON_ONCE(!list_empty(&priv->ecus)); + WARN_ON_ONCE(!list_empty(&priv->j1939_socks)); + dev_put(ndev); kfree(priv); } @@ -207,6 +213,9 @@ static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) { struct can_ml_priv *can_ml_priv = ndev->ml_priv; + if (!can_ml_priv) + return NULL; + return can_ml_priv->j1939_priv; } diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 4d8ba701e15d..de09b0a65791 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -78,7 +78,6 @@ static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); - jsk->priv = priv; spin_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); @@ -91,7 +90,6 @@ static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) list_del_init(&jsk->list); spin_unlock_bh(&priv->j1939_socks_lock); - jsk->priv = NULL; j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } @@ -349,6 +347,34 @@ void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) spin_unlock_bh(&priv->j1939_socks_lock); } +static void j1939_sk_sock_destruct(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + /* This function will be call by the generic networking code, when then + * the socket is 
ultimately closed (sk->sk_destruct). + * + * The race between + * - processing a received CAN frame + * (can_receive -> j1939_can_recv) + * and accessing j1939_priv + * ... and ... + * - closing a socket + * (j1939_can_rx_unregister -> can_rx_unregister) + * and calling the final j1939_priv_put() + * + * is avoided by calling the final j1939_priv_put() from this + * RCU deferred cleanup call. + */ + if (jsk->priv) { + j1939_priv_put(jsk->priv); + jsk->priv = NULL; + } + + /* call generic CAN sock destruct */ + can_sock_destruct(sk); +} + static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); @@ -371,6 +397,7 @@ static int j1939_sk_init(struct sock *sk) atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); + sk->sk_destruct = j1939_sk_sock_destruct; return 0; } @@ -443,6 +470,12 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) } jsk->ifindex = addr->can_ifindex; + + /* the corresponding j1939_priv_put() is called via + * sk->sk_destruct, which points to j1939_sk_sock_destruct() + */ + j1939_priv_get(priv); + jsk->priv = priv; } /* set default transmit pgn */ @@ -560,8 +593,8 @@ static int j1939_sk_release(struct socket *sock) if (!sk) return 0; - jsk = j1939_sk(sk); lock_sock(sk); + jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; @@ -1059,51 +1092,72 @@ static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); - struct j1939_priv *priv = jsk->priv; + struct j1939_priv *priv; int ifindex; int ret; + lock_sock(sock->sk); /* various socket state tests */ - if (!(jsk->state & J1939_SOCK_BOUND)) - return -EBADFD; + if (!(jsk->state & J1939_SOCK_BOUND)) { + ret = -EBADFD; + goto sendmsg_done; + } + priv = jsk->priv; ifindex = jsk->ifindex; - if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) + if (!jsk->addr.src_name 
&& jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ - return -EBADFD; + ret = -EBADFD; + goto sendmsg_done; + } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; - if (msg->msg_namelen < J1939_MIN_NAMELEN) - return -EINVAL; + if (msg->msg_namelen < J1939_MIN_NAMELEN) { + ret = -EINVAL; + goto sendmsg_done; + } - if (addr->can_family != AF_CAN) - return -EINVAL; + if (addr->can_family != AF_CAN) { + ret = -EINVAL; + goto sendmsg_done; + } - if (addr->can_ifindex && addr->can_ifindex != ifindex) - return -EBADFD; + if (addr->can_ifindex && addr->can_ifindex != ifindex) { + ret = -EBADFD; + goto sendmsg_done; + } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && - !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) - return -EINVAL; + !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { + ret = -EINVAL; + goto sendmsg_done; + } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && - !sock_flag(sk, SOCK_BROADCAST)) + !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ - return -EACCES; + ret = -EACCES; + goto sendmsg_done; + } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && - !sock_flag(sk, SOCK_BROADCAST)) + !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ - return -EACCES; + ret = -EACCES; + goto sendmsg_done; + } } ret = j1939_sk_send_loop(priv, sk, msg, size); +sendmsg_done: + release_sock(sock->sk); + return ret; } diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index e5f1a56994c6..9f99af5b0b11 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -255,6 +255,7 @@ static void __j1939_session_drop(struct j1939_session *session) return; j1939_sock_pending_del(session->sk); + sock_put(session->sk); } static void j1939_session_destroy(struct j1939_session *session) @@ -266,6 +267,9 @@ static void j1939_session_destroy(struct j1939_session *session) 
netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry)); + WARN_ON_ONCE(!list_empty(&session->active_session_list_entry)); + skb_queue_purge(&session->skb_queue); __j1939_session_drop(session); j1939_priv_put(session->priv); @@ -1042,12 +1046,13 @@ j1939_session_deactivate_activate_next(struct j1939_session *session) j1939_sk_queue_activate_next(session); } -static void j1939_session_cancel(struct j1939_session *session, +static void __j1939_session_cancel(struct j1939_session *session, enum j1939_xtp_abort err) { struct j1939_priv *priv = session->priv; WARN_ON_ONCE(!err); + lockdep_assert_held(&session->priv->active_session_list_lock); session->err = j1939_xtp_abort_to_errno(priv, err); /* do not send aborts on incoming broadcasts */ @@ -1062,6 +1067,20 @@ static void j1939_session_cancel(struct j1939_session *session, j1939_sk_send_loop_abort(session->sk, session->err); } +static void j1939_session_cancel(struct j1939_session *session, + enum j1939_xtp_abort err) +{ + j1939_session_list_lock(session->priv); + + if (session->state >= J1939_SESSION_ACTIVE && + session->state < J1939_SESSION_WAITING_ABORT) { + j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); + __j1939_session_cancel(session, err); + } + + j1939_session_list_unlock(session->priv); +} + static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) { struct j1939_session *session = @@ -1108,8 +1127,6 @@ static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) netdev_alert(priv->ndev, "%s: 0x%p: tx aborted with unknown reason: %i\n", __func__, session, ret); if (session->skcb.addr.type != J1939_SIMPLE) { - j1939_tp_set_rxtimeout(session, - J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, J1939_XTP_ABORT_OTHER); } else { session->err = ret; @@ -1169,7 +1186,7 @@ static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) hrtimer_start(&session->rxtimer, 
ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS), HRTIMER_MODE_REL_SOFT); - j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT); + __j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT); } j1939_session_list_unlock(session->priv); } @@ -1375,7 +1392,6 @@ j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb) out_session_cancel: j1939_session_timers_cancel(session); - j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, err); } @@ -1572,7 +1588,6 @@ static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, /* RTS on active session */ j1939_session_timers_cancel(session); - j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); } @@ -1583,7 +1598,6 @@ static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, session->last_cmd); j1939_session_timers_cancel(session); - j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); return -EBUSY; @@ -1785,7 +1799,6 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, out_session_cancel: j1939_session_timers_cancel(session); - j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); j1939_session_cancel(session, J1939_XTP_ABORT_FAULT); j1939_session_put(session); } @@ -1866,6 +1879,7 @@ struct j1939_session *j1939_tp_send(struct j1939_priv *priv, return ERR_PTR(-ENOMEM); /* skb is recounted in j1939_session_new() */ + sock_hold(skb->sk); session->sk = skb->sk; session->transmission = true; session->pkt.total = (size + 6) / 7; @@ -2028,7 +2042,11 @@ int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk) &priv->active_session_list, active_session_list_entry) { if (!sk || sk == session->sk) { - j1939_session_timers_cancel(session); + if (hrtimer_try_to_cancel(&session->txtimer) == 1) + j1939_session_put(session); + if (hrtimer_try_to_cancel(&session->rxtimer) == 1) + j1939_session_put(session); + 
session->err = ESHUTDOWN; j1939_session_deactivate_locked(session); } diff --git a/net/core/devlink.c b/net/core/devlink.c index 1338f5fbc7d2..4c63c9a4c09e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2812,7 +2812,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) struct net *dest_net = NULL; int err; - if (!devlink_reload_supported(devlink)) + if (!devlink_reload_supported(devlink) || !devlink->reload_enabled) return -EOPNOTSUPP; err = devlink_resources_validate(devlink, NULL, info); @@ -4747,6 +4747,7 @@ struct devlink_health_reporter { bool auto_recover; u8 health_state; u64 dump_ts; + u64 dump_real_ts; u64 error_count; u64 recovery_count; u64 last_recovery_ts; @@ -4923,6 +4924,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter, goto dump_err; reporter->dump_ts = jiffies; + reporter->dump_real_ts = ktime_get_real_ns(); return 0; @@ -5072,6 +5074,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, jiffies_to_msecs(reporter->dump_ts), DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, + reporter->dump_real_ts, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; nla_nest_end(msg, reporter_attr); genlmsg_end(msg, hdr); diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 73632d21f1a6..2fb6c26294b5 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -105,7 +105,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) slave = dsa_to_port(ds, port)->slave; err = br_vlan_get_pvid(slave, &pvid); - if (err < 0) + if (!pvid || err < 0) /* There is no pvid on the bridge for this port, which is * perfectly valid. Nothing to restore, bye-bye! 
*/ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 440294bdb752..6e68def66822 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2291,7 +2291,8 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, rcu_read_unlock(); return -ENODEV; } - skb2 = skb_clone(skb, GFP_ATOMIC); + + skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr)); if (!skb2) { read_unlock(&mrt_lock); rcu_read_unlock(); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 9d4f75e0d33a..e70567446f28 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -81,6 +81,11 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) if (!pskb_may_pull(skb, srhoff + len)) return NULL; + /* note that pskb_may_pull may change pointers in header; + * for this reason it is necessary to reload them when needed. + */ + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + if (!seg6_validate_srh(srh, len)) return NULL; @@ -336,6 +341,8 @@ static int input_action_end_dx6(struct sk_buff *skb, if (!ipv6_addr_any(&slwt->nh6)) nhaddr = &slwt->nh6; + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + seg6_lookup_nexthop(skb, nhaddr, 0); return dst_input(skb); @@ -365,6 +372,8 @@ static int input_action_end_dx4(struct sk_buff *skb, skb_dst_drop(skb); + skb_set_transport_header(skb, sizeof(struct iphdr)); + err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); if (err) goto drop; @@ -385,6 +394,8 @@ static int input_action_end_dt6(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + seg6_lookup_nexthop(skb, NULL, slwt->table); return dst_input(skb); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 6b345c858dba..c71f4328d138 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -513,6 +513,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct ib_qp_init_attr attr; struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; + unsigned long max_wrs; int ret, 
fr_queue_space; struct dma_pool *pool; @@ -533,10 +534,15 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); - if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) - rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); - if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) - rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); + max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ? + rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr; + if (ic->i_send_ring.w_nr != max_wrs) + rds_ib_ring_resize(&ic->i_send_ring, max_wrs); + + max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ? + rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr; + if (ic->i_recv_ring.w_nr != max_wrs) + rds_ib_ring_resize(&ic->i_recv_ring, max_wrs); /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; @@ -1176,8 +1182,9 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_flowctl = 0; atomic_set(&ic->i_credits, 0); - rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); - rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + /* Re-init rings, but retain sizes. */ + rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr); + rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr); if (ic->i_ibinc) { rds_inc_put(&ic->i_ibinc->ii_inc); @@ -1224,8 +1231,8 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) * rds_ib_conn_shutdown() waits for these to be emptied so they * must be initialized before it can be called. 
*/ - rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); - rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + rds_ib_ring_init(&ic->i_send_ring, 0); + rds_ib_ring_init(&ic->i_recv_ring, 0); ic->conn = conn; conn->c_transport_data = ic; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cde4dc0ed173..b997072c72e5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -799,6 +799,7 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = EPIPE; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); + sock_put(&smc->sk); /* passive closing */ goto out; } @@ -1736,7 +1737,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ - if (sk->sk_state == SMC_INIT) { + if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { diff --git a/net/tipc/core.c b/net/tipc/core.c index fc01a13d7462..7532a00ac73d 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -34,8 +34,6 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include "core.h" #include "name_table.h" #include "subscr.h" diff --git a/net/tipc/core.h b/net/tipc/core.h index 775848a5f27e..631d83c9705f 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -61,6 +61,12 @@ #include #include +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + struct tipc_node; struct tipc_bearer; struct tipc_bc_base; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 9b599ed66d97..2c86a2fc3915 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -480,6 +480,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); + + if (encap_type == -1) + dev_put(skb->dev); goto drop; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c6f3c4a1bd99..f3423562d933 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -495,6 +495,8 @@ static void ___xfrm_state_destroy(struct xfrm_state *x) x->type->destructor(x); xfrm_put_type(x->type); } + if (x->xfrag.page) + put_page(x->xfrag.page); xfrm_dev_state_free(x); security_xfrm_state_free(x); xfrm_state_free(x); diff --git a/scripts/tools-support-relr.sh b/scripts/tools-support-relr.sh index 97a2c844a95e..45e8aa360b45 100755 --- a/scripts/tools-support-relr.sh +++ b/scripts/tools-support-relr.sh @@ -4,13 +4,13 @@ tmp_file=$(mktemp) trap "rm -f $tmp_file.o $tmp_file $tmp_file.bin" EXIT -cat << "END" | "$CC" -c -x c - -o $tmp_file.o >/dev/null 2>&1 +cat << "END" | $CC -c -x c - -o $tmp_file.o >/dev/null 2>&1 void *p = &p; END -"$LD" $tmp_file.o -shared -Bsymbolic --pack-dyn-relocs=relr -o $tmp_file +$LD $tmp_file.o -shared -Bsymbolic --pack-dyn-relocs=relr -o $tmp_file # Despite printing an error message, GNU nm still exits with exit code 0 if it # sees a relr section. So we need to check that nothing is printed to stderr. 
-test -z "$("$NM" $tmp_file 2>&1 >/dev/null)" +test -z "$($NM $tmp_file 2>&1 >/dev/null)" -"$OBJCOPY" -O binary $tmp_file $tmp_file.bin +$OBJCOPY -O binary $tmp_file $tmp_file.bin diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index d80041ea4e01..2236b5e0c1f2 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -1782,11 +1782,14 @@ void snd_pcm_period_elapsed(struct snd_pcm_substream *substream) struct snd_pcm_runtime *runtime; unsigned long flags; - if (PCM_RUNTIME_CHECK(substream)) + if (snd_BUG_ON(!substream)) return; - runtime = substream->runtime; snd_pcm_stream_lock_irqsave(substream, flags); + if (PCM_RUNTIME_CHECK(substream)) + goto _unlock; + runtime = substream->runtime; + if (!snd_pcm_running(substream) || snd_pcm_update_hw_ptr0(substream, 1) < 0) goto _end; @@ -1797,6 +1800,7 @@ void snd_pcm_period_elapsed(struct snd_pcm_substream *substream) #endif _end: kill_fasync(&runtime->fasync, SIGIO, POLL_IN); + _unlock: snd_pcm_stream_unlock_irqrestore(substream, flags); } EXPORT_SYMBOL(snd_pcm_period_elapsed); diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index cf53fbd872ee..c52419376c74 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2396,6 +2396,9 @@ static const struct pci_device_id azx_ids[] = { /* CometLake-H */ { PCI_DEVICE(0x8086, 0x06C8), .driver_data = AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE}, + /* CometLake-S */ + { PCI_DEVICE(0x8086, 0xa3f0), + .driver_data = AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE}, /* Icelake */ { PCI_DEVICE(0x8086, 0x34c8), .driver_data = AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE}, diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 3c720703ebb8..78bd2e3722c7 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -46,10 +46,12 @@ MODULE_PARM_DESC(static_hdmi_pcm, "Don't restrict PCM parameters per ELD info"); ((codec)->core.vendor_id == 0x80862800)) #define is_cannonlake(codec) ((codec)->core.vendor_id == 0x8086280c) 
#define is_icelake(codec) ((codec)->core.vendor_id == 0x8086280f) +#define is_tigerlake(codec) ((codec)->core.vendor_id == 0x80862812) #define is_haswell_plus(codec) (is_haswell(codec) || is_broadwell(codec) \ || is_skylake(codec) || is_broxton(codec) \ || is_kabylake(codec) || is_geminilake(codec) \ - || is_cannonlake(codec) || is_icelake(codec)) + || is_cannonlake(codec) || is_icelake(codec) \ + || is_tigerlake(codec)) #define is_valleyview(codec) ((codec)->core.vendor_id == 0x80862882) #define is_cherryview(codec) ((codec)->core.vendor_id == 0x80862883) #define is_valleyview_plus(codec) (is_valleyview(codec) || is_cherryview(codec)) diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index a2ab8e8d3a93..4a9a2f6ef5a4 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -388,6 +388,9 @@ static void snd_complete_urb(struct urb *urb) } prepare_outbound_urb(ep, ctx); + /* can be stopped during prepare callback */ + if (unlikely(!test_bit(EP_FLAG_RUNNING, &ep->flags))) + goto exit_clear; } else { retire_inbound_urb(ep, ctx); /* can be stopped during retire callback */ diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 3fd1d1749edf..45eee5cc312e 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -1229,7 +1229,8 @@ static int get_min_max_with_quirks(struct usb_mixer_elem_info *cval, if (cval->min + cval->res < cval->max) { int last_valid_res = cval->res; int saved, test, check; - get_cur_mix_raw(cval, minchn, &saved); + if (get_cur_mix_raw(cval, minchn, &saved) < 0) + goto no_res_check; for (;;) { test = saved; if (test < cval->max) @@ -1249,6 +1250,7 @@ static int get_min_max_with_quirks(struct usb_mixer_elem_info *cval, snd_usb_set_cur_mix_value(cval, minchn, 0, saved); } +no_res_check: cval->initialized = 1; } diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 0bbe1201a6ac..349e1e52996d 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -248,8 +248,8 @@ static int create_yamaha_midi_quirk(struct snd_usb_audio *chip, NULL, 
USB_MS_MIDI_OUT_JACK); if (!injd && !outjd) return -ENODEV; - if (!(injd && snd_usb_validate_midi_desc(injd)) || - !(outjd && snd_usb_validate_midi_desc(outjd))) + if ((injd && !snd_usb_validate_midi_desc(injd)) || + (outjd && !snd_usb_validate_midi_desc(outjd))) return -ENODEV; if (injd && (injd->bLength < 5 || (injd->bJackType != USB_MS_EMBEDDED && diff --git a/sound/usb/validate.c b/sound/usb/validate.c index a5e584b60dcd..389e8657434a 100644 --- a/sound/usb/validate.c +++ b/sound/usb/validate.c @@ -81,9 +81,9 @@ static bool validate_processing_unit(const void *p, switch (v->protocol) { case UAC_VERSION_1: default: - /* bNrChannels, wChannelConfig, iChannelNames, bControlSize */ - len += 1 + 2 + 1 + 1; - if (d->bLength < len) /* bControlSize */ + /* bNrChannels, wChannelConfig, iChannelNames */ + len += 1 + 2 + 1; + if (d->bLength < len + 1) /* bControlSize */ return false; m = hdr[len]; len += 1 + m + 1; /* bControlSize, bmControls, iProcessing */ diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 679a1d75090c..7b6eaf5e0bda 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1625,7 +1625,7 @@ int hists__collapse_resort(struct hists *hists, struct ui_progress *prog) return 0; } -static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b) +static int64_t hist_entry__sort(struct hist_entry *a, struct hist_entry *b) { struct hists *hists = a->hists; struct perf_hpp_fmt *fmt; diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 15961854ba67..741f040648b5 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -539,10 +539,11 @@ static int perl_stop_script(void) static int perl_generate_script(struct tep_handle *pevent, const char *outfile) { + int i, not_first, count, nr_events; + struct tep_event **all_events; struct tep_event *event = NULL; struct tep_format_field *f; char 
fname[PATH_MAX]; - int not_first, count; FILE *ofp; sprintf(fname, "%s.pl", outfile); @@ -603,8 +604,11 @@ sub print_backtrace\n\ }\n\n\ "); + nr_events = tep_get_events_count(pevent); + all_events = tep_list_events(pevent, TEP_EVENT_SORT_ID); - while ((event = trace_find_next_event(pevent, event))) { + for (i = 0; all_events && i < nr_events; i++) { + event = all_events[i]; fprintf(ofp, "sub %s::%s\n{\n", event->system, event->name); fprintf(ofp, "\tmy ("); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 5d341efc3237..93c03b39cd9c 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1687,10 +1687,11 @@ static int python_stop_script(void) static int python_generate_script(struct tep_handle *pevent, const char *outfile) { + int i, not_first, count, nr_events; + struct tep_event **all_events; struct tep_event *event = NULL; struct tep_format_field *f; char fname[PATH_MAX]; - int not_first, count; FILE *ofp; sprintf(fname, "%s.py", outfile); @@ -1735,7 +1736,11 @@ static int python_generate_script(struct tep_handle *pevent, const char *outfile fprintf(ofp, "def trace_end():\n"); fprintf(ofp, "\tprint(\"in trace_end\")\n\n"); - while ((event = trace_find_next_event(pevent, event))) { + nr_events = tep_get_events_count(pevent); + all_events = tep_list_events(pevent, TEP_EVENT_SORT_ID); + + for (i = 0; all_events && i < nr_events; i++) { + event = all_events[i]; fprintf(ofp, "def %s__%s(", event->system, event->name); fprintf(ofp, "event_name, "); fprintf(ofp, "context, "); diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 5d6bfc70b210..9634f0ae57be 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -173,37 +173,6 @@ int parse_event_file(struct tep_handle *pevent, return tep_parse_event(pevent, buf, size, sys); } -struct 
tep_event *trace_find_next_event(struct tep_handle *pevent, - struct tep_event *event) -{ - static int idx; - int events_count; - struct tep_event *all_events; - - all_events = tep_get_first_event(pevent); - events_count = tep_get_events_count(pevent); - if (!pevent || !all_events || events_count < 1) - return NULL; - - if (!event) { - idx = 0; - return all_events; - } - - if (idx < events_count && event == (all_events + idx)) { - idx++; - if (idx == events_count) - return NULL; - return (all_events + idx); - } - - for (idx = 1; idx < events_count; idx++) { - if (event == (all_events + (idx - 1))) - return (all_events + idx); - } - return NULL; -} - struct flag { const char *name; unsigned long long value; diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index 2e158387b3d7..72fdf2a3577c 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -47,8 +47,6 @@ void parse_saved_cmdline(struct tep_handle *pevent, char *file, unsigned int siz ssize_t trace_report(int fd, struct trace_event *tevent, bool repipe); -struct tep_event *trace_find_next_event(struct tep_handle *pevent, - struct tep_event *event); unsigned long long read_size(struct tep_event *event, void *ptr, int size); unsigned long long eval_flag(const char *flag); diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh index ae6146ec5afd..4632f51af7ab 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh @@ -112,14 +112,16 @@ sanitization_single_dev_mcast_group_test() RET=0 ip link add dev br0 type bridge mcast_snooping 0 + ip link add name dummy1 up type dummy ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \ ttl 20 tos inherit local 198.51.100.1 dstport 4789 \ - dev $swp2 group 239.0.0.1 + dev dummy1 group 239.0.0.1 sanitization_single_dev_test_fail ip link del dev vxlan0 + ip link del dev dummy1 ip link del dev 
br0 log_test "vxlan device with a multicast group" @@ -181,13 +183,15 @@ sanitization_single_dev_local_interface_test() RET=0 ip link add dev br0 type bridge mcast_snooping 0 + ip link add name dummy1 up type dummy ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \ - ttl 20 tos inherit local 198.51.100.1 dstport 4789 dev $swp2 + ttl 20 tos inherit local 198.51.100.1 dstport 4789 dev dummy1 sanitization_single_dev_test_fail ip link del dev vxlan0 + ip link del dev dummy1 ip link del dev br0 log_test "vxlan device with local interface" diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index 4911fc77d0f6..d1cf9f6e0e6b 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -55,7 +55,7 @@ static void test_dump_stack(void) #pragma GCC diagnostic pop } -static pid_t gettid(void) +static pid_t _gettid(void) { return syscall(SYS_gettid); } @@ -72,7 +72,7 @@ test_assert(bool exp, const char *exp_str, fprintf(stderr, "==== Test Assertion Failure ====\n" " %s:%u: %s\n" " pid=%d tid=%d - %s\n", - file, line, exp_str, getpid(), gettid(), + file, line, exp_str, getpid(), _gettid(), strerror(errno)); test_dump_stack(); if (fmt) { diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c index bd4a7247b44f..c0dd10257df5 100644 --- a/tools/testing/selftests/ptp/testptp.c +++ b/tools/testing/selftests/ptp/testptp.c @@ -44,6 +44,46 @@ static int clock_adjtime(clockid_t id, struct timex *tx) } #endif +static void show_flag_test(int rq_index, unsigned int flags, int err) +{ + printf("PTP_EXTTS_REQUEST%c flags 0x%08x : (%d) %s\n", + rq_index ? '1' + rq_index : ' ', + flags, err, strerror(errno)); + /* sigh, uClibc ... 
*/ + errno = 0; +} + +static void do_flag_test(int fd, unsigned int index) +{ + struct ptp_extts_request extts_request; + unsigned long request[2] = { + PTP_EXTTS_REQUEST, + PTP_EXTTS_REQUEST2, + }; + unsigned int enable_flags[5] = { + PTP_ENABLE_FEATURE, + PTP_ENABLE_FEATURE | PTP_RISING_EDGE, + PTP_ENABLE_FEATURE | PTP_FALLING_EDGE, + PTP_ENABLE_FEATURE | PTP_RISING_EDGE | PTP_FALLING_EDGE, + PTP_ENABLE_FEATURE | (PTP_EXTTS_VALID_FLAGS + 1), + }; + int err, i, j; + + memset(&extts_request, 0, sizeof(extts_request)); + extts_request.index = index; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 5; j++) { + extts_request.flags = enable_flags[j]; + err = ioctl(fd, request[i], &extts_request); + show_flag_test(i, extts_request.flags, err); + + extts_request.flags = 0; + err = ioctl(fd, request[i], &extts_request); + } + } +} + static clockid_t get_clockid(int fd) { #define CLOCKFD 3 @@ -96,7 +136,8 @@ static void usage(char *progname) " -s set the ptp clock time from the system time\n" " -S set the system time from the ptp clock time\n" " -t val shift the ptp clock time by 'val' seconds\n" - " -T val set the ptp clock time to 'val' seconds\n", + " -T val set the ptp clock time to 'val' seconds\n" + " -z test combinations of rising/falling external time stamp flags\n", progname); } @@ -122,6 +163,7 @@ int main(int argc, char *argv[]) int adjtime = 0; int capabilities = 0; int extts = 0; + int flagtest = 0; int gettime = 0; int index = 0; int list_pins = 0; @@ -138,7 +180,7 @@ int main(int argc, char *argv[]) progname = strrchr(argv[0], '/'); progname = progname ? 
1+progname : argv[0]; - while (EOF != (c = getopt(argc, argv, "cd:e:f:ghi:k:lL:p:P:sSt:T:v"))) { + while (EOF != (c = getopt(argc, argv, "cd:e:f:ghi:k:lL:p:P:sSt:T:z"))) { switch (c) { case 'c': capabilities = 1; @@ -191,6 +233,9 @@ int main(int argc, char *argv[]) settime = 3; seconds = atoi(optarg); break; + case 'z': + flagtest = 1; + break; case 'h': usage(progname); return 0; @@ -322,6 +367,10 @@ int main(int argc, char *argv[]) } } + if (flagtest) { + do_flag_test(fd, index); + } + if (list_pins) { int n_pins = 0; if (ioctl(fd, PTP_CLOCK_GETCAPS, &caps)) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d6f0696d98ef..13efc291b1c7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -121,9 +122,22 @@ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #define KVM_COMPAT(c) .compat_ioctl = (c) #else +/* + * For architectures that don't implement a compat infrastructure, + * adopt a double line of defense: + * - Prevent a compat task from opening /dev/kvm + * - If the open has been done by a 64bit task, and the KVM fd + * passed to a compat task, let the ioctls fail. + */ static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { return -EINVAL; } -#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl + +static int kvm_no_compat_open(struct inode *inode, struct file *file) +{ + return is_compat_task() ? 
-ENODEV : 0; +} +#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ + .open = kvm_no_compat_open #endif static int hardware_enable_all(void); static void hardware_disable_all(void); @@ -149,10 +163,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, return 0; } +bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) +{ + /* + * The metadata used by is_zone_device_page() to determine whether or + * not a page is ZONE_DEVICE is guaranteed to be valid if and only if + * the device has been pinned, e.g. by get_user_pages(). WARN if the + * page_count() is zero to help detect bad usage of this helper. + */ + if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) + return false; + + return is_zone_device_page(pfn_to_page(pfn)); +} + bool kvm_is_reserved_pfn(kvm_pfn_t pfn) { + /* + * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting + * perspective they are "normal" pages, albeit with slightly different + * usage rules. + */ if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)); + return PageReserved(pfn_to_page(pfn)) && + !kvm_is_zone_device_pfn(pfn); return true; } @@ -625,6 +659,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) return 0; } +/* + * Called after the VM is otherwise initialized, but just before adding it to + * the vm_list. + */ +int __weak kvm_arch_post_init_vm(struct kvm *kvm) +{ + return 0; +} + +/* + * Called just after removing the VM from the vm_list, but before doing any + * other destruction. 
+ */ +void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ +} + static struct kvm *kvm_create_vm(unsigned long type) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -645,6 +696,12 @@ static struct kvm *kvm_create_vm(unsigned long type) BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); + if (init_srcu_struct(&kvm->srcu)) + goto out_err_no_srcu; + if (init_srcu_struct(&kvm->irq_srcu)) + goto out_err_no_irq_srcu; + + refcount_set(&kvm->users_count, 1); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { struct kvm_memslots *slots = kvm_alloc_memslots(); @@ -662,7 +719,6 @@ static struct kvm *kvm_create_vm(unsigned long type) goto out_err_no_arch_destroy_vm; } - refcount_set(&kvm->users_count, 1); r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_no_arch_destroy_vm; @@ -675,12 +731,11 @@ static struct kvm *kvm_create_vm(unsigned long type) INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif - if (init_srcu_struct(&kvm->srcu)) - goto out_err_no_srcu; - if (init_srcu_struct(&kvm->irq_srcu)) - goto out_err_no_irq_srcu; - r = kvm_init_mmu_notifier(kvm); + if (r) + goto out_err_no_mmu_notifier; + + r = kvm_arch_post_init_vm(kvm); if (r) goto out_err; @@ -693,19 +748,24 @@ static struct kvm *kvm_create_vm(unsigned long type) return kvm; out_err: - cleanup_srcu_struct(&kvm->irq_srcu); -out_err_no_irq_srcu: - cleanup_srcu_struct(&kvm->srcu); -out_err_no_srcu: +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + if (kvm->mmu_notifier.ops) + mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); +#endif +out_err_no_mmu_notifier: hardware_disable_all(); out_err_no_disable: kvm_arch_destroy_vm(kvm); - WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); out_err_no_arch_destroy_vm: + WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); + cleanup_srcu_struct(&kvm->irq_srcu); +out_err_no_irq_srcu: + 
cleanup_srcu_struct(&kvm->srcu); +out_err_no_srcu: kvm_arch_free_vm(kvm); mmdrop(current->mm); return ERR_PTR(r); @@ -737,6 +797,8 @@ static void kvm_destroy_vm(struct kvm *kvm) mutex_lock(&kvm_lock); list_del(&kvm->vm_list); mutex_unlock(&kvm_lock); + kvm_arch_pre_destroy_vm(kvm); + kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { struct kvm_io_bus *bus = kvm_get_bus(kvm, i); @@ -1857,7 +1919,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); void kvm_set_pfn_dirty(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn)) { + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) { struct page *page = pfn_to_page(pfn); SetPageDirty(page); @@ -1867,7 +1929,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); void kvm_set_pfn_accessed(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn)) + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) mark_page_accessed(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); @@ -4371,3 +4433,86 @@ void kvm_exit(void) kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); + +struct kvm_vm_worker_thread_context { + struct kvm *kvm; + struct task_struct *parent; + struct completion init_done; + kvm_vm_thread_fn_t thread_fn; + uintptr_t data; + int err; +}; + +static int kvm_vm_worker_thread(void *context) +{ + /* + * The init_context is allocated on the stack of the parent thread, so + * we have to locally copy anything that is needed beyond initialization + */ + struct kvm_vm_worker_thread_context *init_context = context; + struct kvm *kvm = init_context->kvm; + kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; + uintptr_t data = init_context->data; + int err; + + err = kthread_park(current); + /* kthread_park(current) is never supposed to return an error */ + WARN_ON(err != 0); + if (err) + goto init_complete; + + err = cgroup_attach_task_all(init_context->parent, current); + if (err) { + kvm_err("%s: cgroup_attach_task_all failed with err %d\n", + __func__, err); + goto init_complete; + } + + 
set_user_nice(current, task_nice(init_context->parent)); + +init_complete: + init_context->err = err; + complete(&init_context->init_done); + init_context = NULL; + + if (err) + return err; + + /* Wait to be woken up by the spawner before proceeding. */ + kthread_parkme(); + + if (!kthread_should_stop()) + err = thread_fn(kvm, data); + + return err; +} + +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, + uintptr_t data, const char *name, + struct task_struct **thread_ptr) +{ + struct kvm_vm_worker_thread_context init_context = {}; + struct task_struct *thread; + + *thread_ptr = NULL; + init_context.kvm = kvm; + init_context.parent = current; + init_context.thread_fn = thread_fn; + init_context.data = data; + init_completion(&init_context.init_done); + + thread = kthread_run(kvm_vm_worker_thread, &init_context, + "%s-%d", name, task_pid_nr(current)); + if (IS_ERR(thread)) + return PTR_ERR(thread); + + /* kthread_run is never supposed to return NULL */ + WARN_ON(thread == NULL); + + wait_for_completion(&init_context.init_done); + + if (!init_context.err) + *thread_ptr = thread; + + return init_context.err; +}