Merge tag 'kvmarm-fixes-for-5.1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into kvm-master

KVM/ARM fixes for 5.1

- Fix THP handling in the presence of pre-existing PTEs
- Honor request for PTE mappings even when THPs are available
- GICv4 performance improvement
- Take the srcu lock when writing to guest-controlled ITS data structures
- Reset the virtual PMU in preemptible context
- Various cleanups
bonzini committed Mar 28, 2019
2 parents e2788c4 + 8324c3d commit 690edec
Showing 9 changed files with 133 additions and 75 deletions.
11 changes: 11 additions & 0 deletions arch/arm/include/asm/kvm_mmu.h
@@ -381,6 +381,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
return ret;
}

static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
const void *data, unsigned long len)
{
int srcu_idx = srcu_read_lock(&kvm->srcu);
int ret = kvm_write_guest(kvm, gpa, data, len);

srcu_read_unlock(&kvm->srcu, srcu_idx);

return ret;
}

static inline void *kvm_get_hyp_vector(void)
{
switch(read_cpuid_part()) {
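
For context, a minimal sketch of how a caller might use the helper added above; per the commit message this targets writes to guest-controlled ITS data structures, but the wrapper below is hypothetical and not part of this patch:

/*
 * Hypothetical caller of kvm_write_guest_lock(): the helper takes the
 * kvm->srcu read lock around kvm_write_guest(), so the memslot lookup
 * performed by the write cannot race with memslot updates.
 */
static int its_sync_entry(struct kvm *kvm, gpa_t entry_gpa, u64 entry)
{
	return kvm_write_guest_lock(kvm, entry_gpa, &entry, sizeof(entry));
}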
2 changes: 2 additions & 0 deletions arch/arm/include/asm/stage2_pgtable.h
@@ -75,6 +75,8 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm)

#define S2_PMD_MASK PMD_MASK
#define S2_PMD_SIZE PMD_SIZE
#define S2_PUD_MASK PUD_MASK
#define S2_PUD_SIZE PUD_SIZE

static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
{
11 changes: 11 additions & 0 deletions arch/arm64/include/asm/kvm_mmu.h
@@ -445,6 +445,17 @@ static inline int kvm_read_guest_lock(struct kvm *kvm,
return ret;
}

static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
const void *data, unsigned long len)
{
int srcu_idx = srcu_read_lock(&kvm->srcu);
int ret = kvm_write_guest(kvm, gpa, data, len);

srcu_read_unlock(&kvm->srcu, srcu_idx);

return ret;
}

#ifdef CONFIG_KVM_INDIRECT_VECTORS
/*
* EL2 vectors can be mapped and rerouted in a number of ways,
6 changes: 3 additions & 3 deletions arch/arm64/kvm/reset.c
@@ -123,6 +123,9 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
int ret = -EINVAL;
bool loaded;

/* Reset PMU outside of the non-preemptible section */
kvm_pmu_vcpu_reset(vcpu);

preempt_disable();
loaded = (vcpu->cpu != -1);
if (loaded)
@@ -170,9 +173,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.reset_state.reset = false;
}

/* Reset PMU */
kvm_pmu_vcpu_reset(vcpu);

/* Default workaround setup is enabled (if supported) */
if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
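
The hunk above hoists the PMU reset out of the preempt_disable()/preempt_enable() region, as the in-line comment notes. A generic sketch of that ordering, with placeholder structure not taken from the patch:

/*
 * Placeholder sketch: work that needs a preemptible context runs first;
 * only the manipulation of state tied to the current physical CPU runs
 * with preemption disabled.
 */
static void reset_vcpu_sketch(struct kvm_vcpu *vcpu)
{
	kvm_pmu_vcpu_reset(vcpu);	/* now called in preemptible context */

	preempt_disable();
	/* ... reset state that must stay on this physical CPU ... */
	preempt_enable();
}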
4 changes: 2 additions & 2 deletions virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -222,7 +222,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
}
}

if (used_lrs) {
if (used_lrs || cpu_if->its_vpe.its_vm) {
int i;
u32 elrsr;

@@ -247,7 +247,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
int i;

if (used_lrs) {
if (used_lrs || cpu_if->its_vpe.its_vm) {
write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);

for (i = 0; i < used_lrs; i++)
125 changes: 73 additions & 52 deletions virt/kvm/arm/mmu.c
@@ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
* @addr: IPA
* @pmd: pmd pointer for IPA
*
* Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
* pages in the range dirty.
* Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
*/
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
@@ -121,8 +120,7 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
* @addr: IPA
* @pud: pud pointer for IPA
*
* Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
* pages in the range dirty.
* Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
*/
static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
{
@@ -899,9 +897,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
* kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
* @kvm: The KVM struct pointer for the VM.
*
* Allocates only the stage-2 HW PGD level table(s) (can support either full
* 40-bit input addresses or limited to 32-bit input addresses). Clears the
* allocated pages.
* Allocates only the stage-2 HW PGD level table(s) of size defined by
* stage2_pgd_size(kvm).
*
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
@@ -1067,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
{
pmd_t *pmd, old_pmd;

retry:
pmd = stage2_get_pmd(kvm, cache, addr);
VM_BUG_ON(!pmd);

old_pmd = *pmd;
/*
* Multiple vcpus faulting on the same PMD entry, can
* lead to them sequentially updating the PMD with the
* same value. Following the break-before-make
* (pmd_clear() followed by tlb_flush()) process can
* hinder forward progress due to refaults generated
* on missing translations.
*
* Skip updating the page table if the entry is
* unchanged.
*/
if (pmd_val(old_pmd) == pmd_val(*new_pmd))
return 0;

if (pmd_present(old_pmd)) {
/*
* Multiple vcpus faulting on the same PMD entry, can
* lead to them sequentially updating the PMD with the
* same value. Following the break-before-make
* (pmd_clear() followed by tlb_flush()) process can
* hinder forward progress due to refaults generated
* on missing translations.
* If we already have PTE level mapping for this block,
* we must unmap it to avoid inconsistent TLB state and
* leaking the table page. We could end up in this situation
* if the memory slot was marked for dirty logging and was
* reverted, leaving PTE level mappings for the pages accessed
* during the period. So, unmap the PTE level mapping for this
* block and retry, as we could have released the upper level
* table in the process.
*
* Skip updating the page table if the entry is
* unchanged.
* Normal THP split/merge follows mmu_notifier callbacks and do
* get handled accordingly.
*/
if (pmd_val(old_pmd) == pmd_val(*new_pmd))
return 0;

if (!pmd_thp_or_huge(old_pmd)) {
unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
goto retry;
}
/*
* Mapping in huge pages should only happen through a
* fault. If a page is merged into a transparent huge
@@ -1097,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
* should become splitting first, unmapped, merged,
* and mapped back in on-demand.
*/
VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));

WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
} else {
@@ -1114,21 +1128,31 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
{
pud_t *pudp, old_pud;

retry:
pudp = stage2_get_pud(kvm, cache, addr);
VM_BUG_ON(!pudp);

old_pud = *pudp;

/*
* A large number of vcpus faulting on the same stage 2 entry,
* can lead to a refault due to the
* stage2_pud_clear()/tlb_flush(). Skip updating the page
* tables if there is no change.
* can lead to a refault due to the stage2_pud_clear()/tlb_flush().
* Skip updating the page tables if there is no change.
*/
if (pud_val(old_pud) == pud_val(*new_pudp))
return 0;

if (stage2_pud_present(kvm, old_pud)) {
/*
* If we already have table level mapping for this block, unmap
* the range for this block and retry.
*/
if (!stage2_pud_huge(kvm, old_pud)) {
unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
goto retry;
}

WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
stage2_pud_clear(kvm, pudp);
kvm_tlb_flush_vmid_ipa(kvm, addr);
} else {
@@ -1451,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
}

/**
* stage2_wp_puds - write protect PGD range
* @pgd: pointer to pgd entry
* @addr: range start address
* @end: range end address
*
* Process PUD entries, for a huge PUD we cause a panic.
*/
* stage2_wp_puds - write protect PGD range
* @pgd: pointer to pgd entry
* @addr: range start address
* @end: range end address
*/
static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
phys_addr_t addr, phys_addr_t end)
{
@@ -1594,8 +1616,9 @@ static void kvm_send_hwpoison_signal(unsigned long address,
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
unsigned long hva)
static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
unsigned long hva,
unsigned long map_size)
{
gpa_t gpa_start;
hva_t uaddr_start, uaddr_end;
@@ -1610,34 +1633,34 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,

/*
* Pages belonging to memslots that don't have the same alignment
* within a PMD for userspace and IPA cannot be mapped with stage-2
* PMD entries, because we'll end up mapping the wrong pages.
* within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
* PMD/PUD entries, because we'll end up mapping the wrong pages.
*
* Consider a layout like the following:
*
* memslot->userspace_addr:
* +-----+--------------------+--------------------+---+
* |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz|
* |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
* +-----+--------------------+--------------------+---+
*
* memslot->base_gfn << PAGE_SIZE:
* +---+--------------------+--------------------+-----+
* |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz|
* |abc|def Stage-2 block | Stage-2 block |tvxyz|
* +---+--------------------+--------------------+-----+
*
* If we create those stage-2 PMDs, we'll end up with this incorrect
* If we create those stage-2 blocks, we'll end up with this incorrect
* mapping:
* d -> f
* e -> g
* f -> h
*/
if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
return false;

/*
* Next, let's make sure we're not trying to map anything not covered
* by the memslot. This means we have to prohibit PMD size mappings
* for the beginning and end of a non-PMD aligned and non-PMD sized
* by the memslot. This means we have to prohibit block size mappings
* for the beginning and end of a non-block aligned and non-block sized
* memory slot (illustrated by the head and tail parts of the
* userspace view above containing pages 'abcde' and 'xyz',
* respectively).
@@ -1646,8 +1669,8 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
* userspace_addr or the base_gfn, as both are equally aligned (per
* the check above) and equally sized.
*/
return (hva & S2_PMD_MASK) >= uaddr_start &&
(hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
return (hva & ~(map_size - 1)) >= uaddr_start &&
(hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
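
As a standalone worked example of the alignment check above (plain userspace C; the addresses are made up and the helper name is not from the kernel):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Mirrors the first test in fault_supports_stage2_huge_mapping(): a block
 * mapping is only usable when the IPA and the userspace address share the
 * same offset within a block of map_size bytes.
 */
static bool same_block_offset(uint64_t gpa_start, uint64_t uaddr_start,
			      uint64_t map_size)
{
	return (gpa_start & (map_size - 1)) == (uaddr_start & (map_size - 1));
}

int main(void)
{
	uint64_t pmd_block = 2ULL << 20;	/* 2 MiB block (PMD level) */

	/* Same 2 MiB offset on both sides: a block mapping is possible. */
	printf("%d\n", same_block_offset(0x40200000ULL, 0x7f7e00200000ULL, pmd_block)); /* prints 1 */

	/* Offsets differ by one page: a block would map the wrong pages. */
	printf("%d\n", same_block_offset(0x40200000ULL, 0x7f7e00201000ULL, pmd_block)); /* prints 0 */

	return 0;
}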
@@ -1676,12 +1699,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}

if (!fault_supports_stage2_pmd_mappings(memslot, hva))
force_pte = true;

if (logging_active)
force_pte = true;

/* Let's check if we will get back a huge page backed by hugetlbfs */
down_read(&current->mm->mmap_sem);
vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1692,18 +1709,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}

vma_pagesize = vma_kernel_pagesize(vma);
if (logging_active ||
!fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
force_pte = true;
vma_pagesize = PAGE_SIZE;
}

/*
* The stage2 has a minimum of 2 level table (For arm64 see
* kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
* use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
* As for PUD huge maps, we must make sure that we have at least
* 3 levels, i.e, PMD is not folded.
*/
if ((vma_pagesize == PMD_SIZE ||
(vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) &&
!force_pte) {
if (vma_pagesize == PMD_SIZE ||
(vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
}
up_read(&current->mm->mmap_sem);

/* We need minimum second+third level pages */