From: Linus Torvalds Date: Sun, 19 Aug 2018 17:38:36 +0000 (-0700) Subject: Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=e61cf2e3a5b452cfefcb145021f5a8ea88735cc1;p=openwrt%2Fstaging%2Fblogic.git Merge tag 'for-linus' of git://git./virt/kvm/kvm Pull first set of KVM updates from Paolo Bonzini: "PPC: - minor code cleanups x86: - PCID emulation and CR3 caching for shadow page tables - nested VMX live migration - nested VMCS shadowing - optimized IPI hypercall - some optimizations ARM will come next week" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (85 commits) kvm: x86: Set highest physical address bits in non-present/reserved SPTEs KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c KVM: X86: Implement PV IPIs in linux guest KVM: X86: Add kvm hypervisor init time platform setup callback KVM: X86: Implement "send IPI" hypercall KVM/x86: Move X86_CR4_OSXSAVE check into kvm_valid_sregs() KVM: x86: Skip pae_root shadow allocation if tdp enabled KVM/MMU: Combine flushing remote tlb in mmu_set_spte() KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup KVM: vmx: move struct host_state usage to struct loaded_vmcs KVM: vmx: compute need to reload FS/GS/LDT on demand KVM: nVMX: remove a misleading comment regarding vmcs02 fields KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state() KVM: vmx: add dedicated utility to access guest's kernel_gs_base KVM: vmx: track host_state.loaded using a loaded_vmcs pointer KVM: vmx: refactor segmentation code in vmx_save_host_state() kvm: nVMX: Fix fault priority for VMX operations kvm: nVMX: Fix fault vector for VMX operation at CPL > 0 ... --- e61cf2e3a5b452cfefcb145021f5a8ea88735cc1 diff --cc arch/x86/include/asm/kvm_host.h index acebb808c4b5,c18958ef17d2..00ddb0c9e612 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@@ -1418,7 -1457,10 +1462,11 @@@ int kvm_cpu_get_interrupt(struct kvm_vc void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); + int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit); + +u64 kvm_get_arch_capabilities(void); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --cc arch/x86/include/asm/trace/hyperv.h index 9c0d4b588e3f,e1ffe61de8d6..2e6245a023ef --- a/arch/x86/include/asm/trace/hyperv.h +++ b/arch/x86/include/asm/trace/hyperv.h @@@ -28,21 -28,20 +28,35 @@@ TRACE_EVENT(hyperv_mmu_flush_tlb_others __entry->addr, __entry->end) ); + TRACE_EVENT(hyperv_nested_flush_guest_mapping, + TP_PROTO(u64 as, int ret), + TP_ARGS(as, ret), + + TP_STRUCT__entry( + __field(u64, as) + __field(int, ret) + ), + TP_fast_assign(__entry->as = as; + __entry->ret = ret; + ), + TP_printk("address space %llx ret %d", __entry->as, __entry->ret) + ); + +TRACE_EVENT(hyperv_send_ipi_mask, + TP_PROTO(const struct cpumask *cpus, + int vector), + TP_ARGS(cpus, vector), + TP_STRUCT__entry( + __field(unsigned int, ncpus) + __field(int, vector) + ), + TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); + __entry->vector = vector; + ), + TP_printk("ncpus %d vector %x", + __entry->ncpus, __entry->vector) + ); + #endif /* CONFIG_HYPERV */ #undef TRACE_INCLUDE_PATH diff --cc arch/x86/kernel/kvm.c index 09aaabb2bbf1,62cbd089f709..0f471bd93417 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@@ -611,6 -716,19 +703,20 @@@ static uint32_t __init kvm_detect(void return kvm_cpuid_base(); } + static void __init kvm_apic_init(void) + { + #if defined(CONFIG_SMP) + if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI)) + kvm_setup_pv_ipi(); + #endif + } + + static void __init kvm_init_platform(void) + { ++ kvmclock_init(); + x86_platform.apic_post_init = kvm_apic_init; + } + const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, diff --cc arch/x86/kvm/vmx.c index 46b428c0990e,16f9373c01de..1519f030fd73 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@@ -188,150 -189,12 +189,156 @@@ module_param(ple_window_max, uint, 0444 extern const ulong vmx_return; +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); +static DEFINE_MUTEX(vmx_l1d_flush_mutex); + +/* Storage for pre module init parameter parsing */ +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; + +static const struct { + const char *option; + enum vmx_l1d_flush_state cmd; +} vmentry_l1d_param[] = { + {"auto", VMENTER_L1D_FLUSH_AUTO}, + {"never", VMENTER_L1D_FLUSH_NEVER}, + {"cond", VMENTER_L1D_FLUSH_COND}, + {"always", VMENTER_L1D_FLUSH_ALWAYS}, +}; + +#define L1D_CACHE_ORDER 4 +static void *vmx_l1d_flush_pages; + +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +{ + struct page *page; + unsigned int i; + + if (!enable_ept) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; + return 0; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + u64 msr; + + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + } + + /* If set to auto use the default l1tf mitigation method */ + if (l1tf == VMENTER_L1D_FLUSH_AUTO) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + l1tf = VMENTER_L1D_FLUSH_NEVER; + break; + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + l1tf = VMENTER_L1D_FLUSH_COND; + break; + case L1TF_MITIGATION_FULL: + case L1TF_MITIGATION_FULL_FORCE: + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + break; + } + } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + } + + if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && + !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); + if (!page) + return -ENOMEM; + vmx_l1d_flush_pages = page_address(page); + + /* + * Initialize each page with a different pattern in + * order to protect against KSM in the nested + * virtualization case. + */ + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { + memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, + PAGE_SIZE); + } + } + + l1tf_vmx_mitigation = l1tf; + + if (l1tf != VMENTER_L1D_FLUSH_NEVER) + static_branch_enable(&vmx_l1d_should_flush); + else + static_branch_disable(&vmx_l1d_should_flush); + + if (l1tf == VMENTER_L1D_FLUSH_COND) + static_branch_enable(&vmx_l1d_flush_cond); + else + static_branch_disable(&vmx_l1d_flush_cond); + return 0; +} + +static int vmentry_l1d_flush_parse(const char *s) +{ + unsigned int i; + + if (s) { + for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { + if (sysfs_streq(s, vmentry_l1d_param[i].option)) + return vmentry_l1d_param[i].cmd; + } + } + return -EINVAL; +} + +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + int l1tf, ret; + + if (!boot_cpu_has(X86_BUG_L1TF)) + return 0; + + l1tf = vmentry_l1d_flush_parse(s); + if (l1tf < 0) + return l1tf; + + /* + * Has vmx_init() run already? If not then this is the pre init + * parameter parsing. In that case just store the value and let + * vmx_init() do the proper setup after enable_ept has been + * established. + */ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { + vmentry_l1d_flush_param = l1tf; + return 0; + } + + mutex_lock(&vmx_l1d_flush_mutex); + ret = vmx_setup_l1d_flush(l1tf); + mutex_unlock(&vmx_l1d_flush_mutex); + return ret; +} + +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); +} + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + + enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 + }; + struct kvm_vmx { struct kvm kvm; @@@ -937,21 -828,14 +977,13 @@@ struct vcpu_vmx */ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { - unsigned nr; - struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; - struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + struct vmx_msrs guest; + struct vmx_msrs host; } msr_autoload; - struct { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; - #ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; - #endif - int gs_ldt_reload_needed; - int fs_reload_needed; - u64 msr_host_bndcfgs; - } host_state; + struct { int vm86_active; ulong save_rflags; @@@ -10647,37 -10779,12 +11021,39 @@@ free_vcpu return ERR_PTR(err); } +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" + static int vmx_vm_init(struct kvm *kvm) { + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + if (!ple_gap) kvm->arch.pause_in_guest = true; + + if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + /* 'I explicitly don't care' is set */ + break; + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + /* + * Warn upon starting the first VM in a potentially + * insecure environment. + */ + if (cpu_smt_control == CPU_SMT_ENABLED) + pr_warn_once(L1TF_MSG_SMT); + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) + pr_warn_once(L1TF_MSG_L1D); + break; + case L1TF_MITIGATION_FULL_FORCE: + /* Flush is enforced */ + break; + } + } return 0; } @@@ -12164,15 -12375,25 +12644,28 @@@ static int nested_vmx_run(struct kvm_vc */ vmx->nested.nested_run_pending = 1; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, &exit_qual); if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual); vmx->nested.nested_run_pending = 0; - return ret; + return 1; } + /* Hide L1D cache contents from the nested guest. */ + vmx->vcpu.arch.l1tf_flush_l1d = true; + + /* + * Must happen outside of enter_vmx_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). + * + * In this flow, it is assumed that vmcs12 cache was + * trasferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken * by event injection, halt vcpu.