// SPDX-License-Identifier: GPL-2.0

#ifndef __KVM_X86_MMU_TDP_ITER_H
#define __KVM_X86_MMU_TDP_ITER_H

#include <linux/kvm_host.h>

#include "mmu.h"
#include "spte.h"

/*
 * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
 * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
 * batched without having to collect the list of zapped SPs. Flows that can
 * remove SPs must service pending TLB flushes prior to dropping RCU protection.
 */
static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
{
        return READ_ONCE(*rcu_dereference(sptep));
}
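
/*
 * Illustrative sketch, not part of the original header: read a SPTE while
 * honoring the RCU rule documented above. The helper name and the bare
 * rcu_read_lock()/rcu_read_unlock() pairing are assumptions for the example;
 * the real TDP MMU walkers in tdp_mmu.c hold RCU (and mmu_lock) across an
 * entire walk rather than around each individual read.
 */
static inline u64 tdp_iter_example_read_spte(tdp_ptep_t sptep)
{
        u64 spte;

        /*
         * Page table pages may be freed via RCU once zapped, so the read
         * must happen inside an RCU read-side critical section.
         */
        rcu_read_lock();
        spte = kvm_tdp_mmu_read_spte(sptep);
        rcu_read_unlock();

        return spte;
}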
static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
{
        return xchg(rcu_dereference(sptep), new_spte);
}
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
        WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
                                         u64 new_spte, int level)
{
        /*
         * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
         * volatile bits, i.e. has bits that can be set outside of mmu_lock.
         * The Writable bit can be set by KVM's fast page fault handler, and
         * Accessed and Dirty bits can be set by the CPU.
         *
         * Note, non-leaf SPTEs do have Accessed bits and those bits are
         * technically volatile, but KVM doesn't consume the Accessed bit of
         * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
         * logic needs to be reassessed if KVM were to use non-leaf Accessed
         * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
         */
        if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
            spte_has_volatile_bits(old_spte))
                return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);

        __kvm_tdp_mmu_write_spte(sptep, new_spte);
        return old_spte;
}
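
/*
 * Illustrative sketch, not part of the original header: zap a leaf SPTE and
 * key dirty tracking off the value returned by kvm_tdp_mmu_write_spte().
 * The helper name and the use of 0 as the "cleared" value are assumptions for
 * the example; the point is that when the write takes the atomic XCHG path,
 * the returned value, not the caller's stale snapshot, captures any Dirty,
 * Accessed, or Writable bits set by the CPU outside of mmu_lock.
 */
static inline bool tdp_iter_example_zap_was_dirty(tdp_ptep_t sptep,
                                                  u64 old_spte, int level)
{
        /* Trust the returned pre-write value over the old_spte snapshot. */
        old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, 0, level);

        return is_dirty_spte(old_spte);
}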
/*
 * A TDP iterator performs a pre-order walk over a TDP paging structure.
 */
struct tdp_iter {
        /*
         * The iterator will traverse the paging structure towards the mapping
         * for this GFN.
         */
        gfn_t next_last_level_gfn;
        /*
         * The next_last_level_gfn at the time when the thread last
         * yielded. Only yielding when the next_last_level_gfn !=
         * yielded_gfn helps ensure forward progress.
         */
        gfn_t yielded_gfn;
        /* Pointers to the page tables traversed to reach the current SPTE */
        tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
        /* A pointer to the current SPTE */
        tdp_ptep_t sptep;
        /* The lowest GFN mapped by the current SPTE */
        gfn_t gfn;
        /* The level of the root page given to the iterator */
        int root_level;
        /* The lowest level the iterator should traverse to */
        int min_level;
        /* The iterator's current level within the paging structure */
        int level;
        /* The address space ID, i.e. SMM vs. regular. */
        int as_id;
        /* A snapshot of the value at sptep */
        u64 old_spte;
        /*
         * Whether the iterator has a valid state. This will be false if the
         * iterator walks off the end of the paging structure.
         */
        bool valid;
        /*
         * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
         * which case tdp_iter_next() needs to restart the walk at the root
         * level instead of advancing to the next entry.
         */
        bool yielded;
};

/*
 * Iterates over every SPTE mapping the GFN range [start, end) in a
 * preorder traversal.
 */
#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
        for (tdp_iter_start(&iter, root, min_level, start); \
             iter.valid && iter.gfn < end; \
             tdp_iter_next(&iter))

#define for_each_tdp_pte(iter, root, start, end) \
for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)

tdp_ptep_t spte_to_child_pt(u64 pte, int level);

void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
                    int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
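
/*
 * Illustrative sketch, not part of the original header: count the
 * shadow-present leaf SPTEs in the GFN range [start, end) below @root using
 * the iterator macro above. The helper name is an assumption for the
 * example; as with the walkers in tdp_mmu.c, the caller is expected to hold
 * mmu_lock and to be in an RCU read-side critical section.
 */
static inline int tdp_iter_example_count_leafs(struct kvm_mmu_page *root,
                                               gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        int count = 0;

        for_each_tdp_pte(iter, root, start, end) {
                /* Skip non-present entries and non-leaf (table) SPTEs. */
                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                count++;
        }

        return count;
}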
#endif /* __KVM_X86_MMU_TDP_ITER_H */