s390/kvm: support collaborative memory management
author Konstantin Weitz <konstantin.weitz@gmail.com>
Wed, 17 Apr 2013 15:36:29 +0000 (17:36 +0200)
committer Martin Schwidefsky <schwidefsky@de.ibm.com>
Fri, 21 Feb 2014 07:50:19 +0000 (08:50 +0100)
This patch enables Collaborative Memory Management (CMM) for kvm
on s390. CMM allows the guest to inform the host about page usage
(see arch/s390/mm/cmm.c). The host uses this information to avoid
swapping in unused pages in the page fault handler. Further, a
CPU-provided list of unused invalid pages is processed to reclaim the
swap space of unused pages that have not yet been accessed.

[ Martin Schwidefsky: patch reordering and cleanup ]

Signed-off-by: Konstantin Weitz <konstantin.weitz@gmail.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/pgtable.h
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/mm/pgtable.c

index eef3dd3fd9a9f76d105b2c92d266dc5079c7833f..9bf95bb30f1a6cf27d0494396168f87e23cf8a65 100644 (file)
@@ -106,7 +106,9 @@ struct kvm_s390_sie_block {
        __u64   gbea;                   /* 0x0180 */
        __u8    reserved188[24];        /* 0x0188 */
        __u32   fac;                    /* 0x01a0 */
-       __u8    reserved1a4[68];        /* 0x01a4 */
+       __u8    reserved1a4[20];        /* 0x01a4 */
+       __u64   cbrlo;                  /* 0x01b8 */
+       __u8    reserved1c0[40];        /* 0x01c0 */
        __u64   itdba;                  /* 0x01e8 */
        __u8    reserved1f0[16];        /* 0x01f0 */
 } __attribute__((packed));
@@ -155,6 +157,7 @@ struct kvm_vcpu_stat {
        u32 instruction_stsi;
        u32 instruction_stfl;
        u32 instruction_tprot;
+       u32 instruction_essa;
        u32 instruction_sigp_sense;
        u32 instruction_sigp_sense_running;
        u32 instruction_sigp_external_call;
index fc4bb82a07391bca616268acd15aa058a06bcb86..a7dd672c97f86027c813e7d3439333b548c45fd8 100644 (file)
@@ -229,6 +229,7 @@ extern unsigned long MODULES_END;
 #define _PAGE_READ     0x010           /* SW pte read bit */
 #define _PAGE_WRITE    0x020           /* SW pte write bit */
 #define _PAGE_SPECIAL  0x040           /* SW associated with special page */
+#define _PAGE_UNUSED   0x080           /* SW bit for pgste usage state */
 #define __HAVE_ARCH_PTE_SPECIAL
 
 /* Set of bits not changed in pte_modify */
@@ -394,6 +395,12 @@ extern unsigned long MODULES_END;
 
 #endif /* CONFIG_64BIT */
 
+/* Guest Page State used for virtualization */
+#define _PGSTE_GPS_ZERO                0x0000000080000000UL
+#define _PGSTE_GPS_USAGE_MASK  0x0000000003000000UL
+#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL
+#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL
+
 /*
  * A user page table pointer has the space-switch-event bit, the
  * private-space-control bit and the storage-alteration-event-control
@@ -617,6 +624,14 @@ static inline int pte_none(pte_t pte)
        return pte_val(pte) == _PAGE_INVALID;
 }
 
+static inline int pte_swap(pte_t pte)
+{
+       /* INVALID+TYPE set, PROTECT+PRESENT clear: (pte & 0x603) == 0x402 */
+       return (pte_val(pte) & (_PAGE_INVALID | _PAGE_PROTECT |
+                               _PAGE_TYPE | _PAGE_PRESENT))
+               == (_PAGE_INVALID | _PAGE_TYPE);
+}
+
 static inline int pte_file(pte_t pte)
 {
        /* Bit pattern: (pte & 0x601) == 0x600 */
@@ -821,6 +836,7 @@ unsigned long gmap_translate(unsigned long address, struct gmap *);
 unsigned long __gmap_fault(unsigned long address, struct gmap *);
 unsigned long gmap_fault(unsigned long address, struct gmap *);
 void gmap_discard(unsigned long from, unsigned long to, struct gmap *);
+void __gmap_zap(unsigned long address, struct gmap *);
 
 void gmap_register_ipte_notifier(struct gmap_notifier *);
 void gmap_unregister_ipte_notifier(struct gmap_notifier *);
@@ -852,6 +868,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 
        if (mm_has_pgste(mm)) {
                pgste = pgste_get_lock(ptep);
+               pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
                pgste_set_key(ptep, pgste, entry);
                pgste_set_pte(ptep, entry);
                pgste_set_unlock(ptep, pgste);
@@ -881,6 +898,12 @@ static inline int pte_young(pte_t pte)
        return (pte_val(pte) & _PAGE_YOUNG) != 0;
 }
 
+#define __HAVE_ARCH_PTE_UNUSED
+static inline int pte_unused(pte_t pte)
+{
+       return pte_val(pte) & _PAGE_UNUSED;     /* 0 or _PAGE_UNUSED, not 0/1 */
+}
+
 /*
  * pgd/pmd/pte modification functions
  */
@@ -1196,6 +1219,9 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
        pte_val(*ptep) = _PAGE_INVALID;
 
        if (mm_has_pgste(vma->vm_mm)) {
+               if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
+                   _PGSTE_GPS_USAGE_UNUSED)
+                       pte_val(pte) |= _PAGE_UNUSED;
                pgste = pgste_update_all(&pte, pgste);
                pgste_set_unlock(ptep, pgste);
        }
index e0676f390d57d22aeaf991a95c0a7d0e3458d369..10b5db3c9bc4a71d179ed02b994d7fdea6109311 100644 (file)
@@ -68,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "instruction_storage_key", VCPU_STAT(instruction_storage_key) },
        { "instruction_stsch", VCPU_STAT(instruction_stsch) },
        { "instruction_chsc", VCPU_STAT(instruction_chsc) },
+       { "instruction_essa", VCPU_STAT(instruction_essa) },
        { "instruction_stsi", VCPU_STAT(instruction_stsi) },
        { "instruction_stfl", VCPU_STAT(instruction_stfl) },
        { "instruction_tprot", VCPU_STAT(instruction_tprot) },
@@ -283,7 +284,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        if (kvm_is_ucontrol(vcpu->kvm))
                gmap_free(vcpu->arch.gmap);
 
+       if (vcpu->arch.sie_block->cbrlo)
+               __free_page(__pfn_to_page(
+                               vcpu->arch.sie_block->cbrlo >> PAGE_SHIFT));
        free_page((unsigned long)(vcpu->arch.sie_block));
+
        kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
@@ -390,6 +395,8 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
+       struct page *cbrl;
+
        atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
                                                    CPUSTAT_SM |
                                                    CPUSTAT_STOPPED |
@@ -401,6 +408,14 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        vcpu->arch.sie_block->ecb2  = 8;
        vcpu->arch.sie_block->eca   = 0xC1002001U;
        vcpu->arch.sie_block->fac   = (int) (long) vfacilities;
+       if (kvm_enabled_cmma()) {
+               cbrl = alloc_page(GFP_KERNEL | __GFP_ZERO);
+               if (cbrl) {
+                       vcpu->arch.sie_block->ecb2 |= 0x80;
+                       vcpu->arch.sie_block->ecb2 &= ~0x08;
+                       vcpu->arch.sie_block->cbrlo = page_to_phys(cbrl);
+               }
+       }
        hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
        tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
                     (unsigned long) vcpu);
@@ -761,6 +776,16 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
        return rc;
 }
 
+bool kvm_enabled_cmma(void)
+{
+       if (!MACHINE_IS_LPAR)           /* CMMA is enabled on LPAR hosts only */
+               return false;
+       /* only enable for z10 and later; EDAT1 is the check used for that */
+       if (!MACHINE_HAS_EDAT1)
+               return false;
+       return true;
+}
+
 static int __vcpu_run(struct kvm_vcpu *vcpu)
 {
        int rc, exit_reason;
index f9559b0bd620962d095851fc5c884a759b49996c..564514f410f45682272bdc5a3e5064306a9e9960 100644 (file)
@@ -156,6 +156,8 @@ void s390_vcpu_block(struct kvm_vcpu *vcpu);
 void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
 void exit_sie(struct kvm_vcpu *vcpu);
 void exit_sie_sync(struct kvm_vcpu *vcpu);
+/* are we going to support cmma? */
+bool kvm_enabled_cmma(void);
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
 
index 75beea632a10ee7c1bba185d344831e11ec684c5..aacb6b129914bc1c7d207d0587fc4fb2efe66ccf 100644 (file)
@@ -636,8 +636,49 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int handle_essa(struct kvm_vcpu *vcpu)
+{
+       /* entries = in-page offset bits of cbrlo / 8; expected to be 0x1FF */
+       int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
+       unsigned long *cbrlo, cbrle;
+       struct gmap *gmap;
+       int i;
+
+       VCPU_EVENT(vcpu, 5, "cmma release %d pages", entries);
+       gmap = vcpu->arch.gmap;
+       vcpu->stat.instruction_essa++;  /* counted even if rejected below */
+       if (!kvm_enabled_cmma() || !vcpu->arch.sie_block->cbrlo)
+               return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+
+       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+               return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+       if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       /* Rewind the PSW by 4 bytes so that the ESSA instruction is repeated */
+       vcpu->arch.sie_block->gpsw.addr =
+               __rewind_psw(vcpu->arch.sie_block->gpsw, 4);
+       vcpu->arch.sie_block->cbrlo &= PAGE_MASK;       /* reset nceo (offset bits) */
+       cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
+       down_read(&gmap->mm->mmap_sem); /* __gmap_zap() expects mmap_sem held */
+       for (i = 0; i < entries; ++i) {
+               cbrle = cbrlo[i];
+               if (unlikely(cbrle & ~PAGE_MASK || cbrle < 2 * PAGE_SIZE))
+                       /* invalid entry: not page aligned or below two pages */
+                       break;
+               /* try to free backing; __gmap_zap() reports no result */
+               __gmap_zap(cbrle, gmap);
+       }
+       up_read(&gmap->mm->mmap_sem);
+       if (i < entries)                /* loop stopped at an invalid entry */
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       return 0;
+}
+
 static const intercept_handler_t b9_handlers[256] = {
        [0x8d] = handle_epsw,
+       [0xab] = handle_essa,
        [0xaf] = handle_pfmf,
 };
 
index 3584ed9b20a183de8c58cb4521c99517c2532c9a..9e2b4705dea2323300f9b3b725a4fd639e1c9a09 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/quicklist.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
+#include <linux/swapops.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -594,6 +595,82 @@ unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
 }
 EXPORT_SYMBOL_GPL(gmap_fault);
 
+static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
+{
+       if (!non_swap_entry(entry))     /* plain swap entry */
+               dec_mm_counter(mm, MM_SWAPENTS);
+       else if (is_migration_entry(entry)) {   /* account by page type */
+               struct page *page = migration_entry_to_page(entry);
+
+               if (PageAnon(page))
+                       dec_mm_counter(mm, MM_ANONPAGES);
+               else
+                       dec_mm_counter(mm, MM_FILEPAGES);
+       }
+       free_swap_and_cache(entry);     /* called for every kind of entry */
+}
+
+/*
+ * Drop a swapped-out pte that is unused or logically zero; mm->mmap_sem held
+ */
+static void gmap_zap_unused(struct mm_struct *mm, unsigned long address)
+{
+       unsigned long ptev, pgstev;
+       spinlock_t *ptl;
+       pgste_t pgste;
+       pte_t *ptep, pte;
+
+       ptep = get_locked_pte(mm, address, &ptl);
+       if (unlikely(!ptep))
+               return;
+       pte = *ptep;
+       if (!pte_swap(pte))             /* only swap ptes are zapped here */
+               goto out_pte;
+       /* Re-check the guest page state under the pgste lock before zapping */
+       pgste = pgste_get_lock(ptep);
+       pgstev = pgste_val(pgste);
+       ptev = pte_val(pte);
+       if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
+           ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
+               gmap_zap_swap_entry(pte_to_swp_entry(pte), mm);
+               pte_clear(mm, address, ptep);
+       }
+       pgste_set_unlock(ptep, pgste);
+out_pte:
+       pte_unmap_unlock(ptep, ptl);    /* takes the pte pointer, not *ptep */
+}
+
+/*
+ * Zap backing of an unused guest page; to be called with mmap_sem held
+ */
+void __gmap_zap(unsigned long address, struct gmap *gmap)
+{
+       unsigned long *table, *segment_ptr;
+       unsigned long segment, pgstev, ptev;
+       struct gmap_pgtable *mp;
+       struct page *page;
+
+       segment_ptr = gmap_table_walk(address, gmap);
+       if (IS_ERR(segment_ptr))
+               return;
+       segment = *segment_ptr;
+       if (segment & _SEGMENT_ENTRY_INVALID)
+               return;
+       page = pfn_to_page(segment >> PAGE_SHIFT);
+       mp = (struct gmap_pgtable *) page->index;
+       address = mp->vmaddr | (address & ~PMD_MASK);   /* address in gmap->mm */
+       /* Page table is present: index the pte for this address */
+       table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN);
+       table = table + ((address >> 12) & 0xff);
+       pgstev = table[PTRS_PER_PTE];   /* pgste is PTRS_PER_PTE slots past its pte */
+       ptev = table[0];
+       /* lockless quick check, repeated in gmap_zap_unused() under the locks */
+       if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
+           ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID)))
+               gmap_zap_unused(gmap->mm, address);
+}
+EXPORT_SYMBOL_GPL(__gmap_zap);
+
 void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
 {