Blob Blame History Raw
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/vmx: Revert "VMX: use a single, global APIC access page"

The claim "No accesses would ever go to this page." is false.  A consequence
of how Intel's APIC Acceleration works, and Xen's choice to have per-domain
P2Ms (rather than per-vCPU P2Ms) means that the APIC page is fully read-write
to any vCPU which is not in xAPIC mode.

This reverts commit 58850b9074d3e7affdf3bc94c84e417ecfa4d165.

This is XSA-412 / CVE-2022-42327.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index d429d76c18c9..3f4276531322 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -66,7 +66,8 @@ boolean_param("force-ept", opt_force_ept);
 static void vmx_ctxt_switch_from(struct vcpu *v);
 static void vmx_ctxt_switch_to(struct vcpu *v);
 
-static int alloc_vlapic_mapping(void);
+static int  vmx_alloc_vlapic_mapping(struct domain *d);
+static void vmx_free_vlapic_mapping(struct domain *d);
 static void vmx_install_vlapic_mapping(struct vcpu *v);
 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
                                 unsigned int flags);
@@ -77,8 +78,6 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content);
 static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content);
 static void vmx_invlpg(struct vcpu *v, unsigned long linear);
 
-static mfn_t __read_mostly apic_access_mfn = INVALID_MFN_INITIALIZER;
-
 /* Values for domain's ->arch.hvm_domain.pi_ops.flags. */
 #define PI_CSW_FROM (1u << 0)
 #define PI_CSW_TO   (1u << 1)
@@ -402,6 +401,7 @@ static int vmx_domain_initialise(struct domain *d)
         .to   = vmx_ctxt_switch_to,
         .tail = vmx_do_resume,
     };
+    int rc;
 
     d->arch.ctxt_switch = &csw;
 
@@ -411,15 +411,24 @@ static int vmx_domain_initialise(struct domain *d)
      */
     d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp;
 
+    if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
+        return rc;
+
     return 0;
 }
 
+static void vmx_domain_relinquish_resources(struct domain *d)
+{
+    vmx_free_vlapic_mapping(d);
+}
+
 static void domain_creation_finished(struct domain *d)
 {
     gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE);
+    mfn_t apic_access_mfn = d->arch.hvm.vmx.apic_access_mfn;
     bool ipat;
 
-    if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, INVALID_MFN) )
+    if ( mfn_eq(apic_access_mfn, _mfn(0)) )
         return;
 
     ASSERT(epte_get_entry_emt(d, gfn, apic_access_mfn, 0, &ipat,
@@ -2481,6 +2490,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .cpu_up_prepare       = vmx_cpu_up_prepare,
     .cpu_dead             = vmx_cpu_dead,
     .domain_initialise    = vmx_domain_initialise,
+    .domain_relinquish_resources = vmx_domain_relinquish_resources,
     .domain_creation_finished = domain_creation_finished,
     .vcpu_initialise      = vmx_vcpu_initialise,
     .vcpu_destroy         = vmx_vcpu_destroy,
@@ -2731,7 +2741,7 @@ const struct hvm_function_table * __init start_vmx(void)
 {
     set_in_cr4(X86_CR4_VMXE);
 
-    if ( vmx_vmcs_init() || alloc_vlapic_mapping() )
+    if ( vmx_vmcs_init() )
     {
         printk("VMX: failed to initialise.\n");
         return NULL;
@@ -3305,36 +3315,55 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
     return X86EMUL_EXCEPTION;
 }
 
-static int __init alloc_vlapic_mapping(void)
+static int vmx_alloc_vlapic_mapping(struct domain *d)
 {
     struct page_info *pg;
     mfn_t mfn;
 
-    if ( !cpu_has_vmx_virtualize_apic_accesses )
+    if ( !has_vlapic(d) || !cpu_has_vmx_virtualize_apic_accesses )
         return 0;
 
-    pg = alloc_domheap_page(NULL, 0);
+    pg = alloc_domheap_page(d, MEMF_no_refcount);
     if ( !pg )
         return -ENOMEM;
 
-    /*
-     * Signal to shadow code that this page cannot be refcounted. This also
-     * makes epte_get_entry_emt() recognize this page as "special".
-     */
-    page_suppress_refcounting(pg);
+    if ( !get_page_and_type(pg, d, PGT_writable_page) )
+    {
+        /*
+         * The domain can't possibly know about this page yet, so failure
+         * here is a clear indication of something fishy going on.
+         */
+        domain_crash(d);
+        return -ENODATA;
+    }
 
     mfn = page_to_mfn(pg);
     clear_domain_page(mfn);
-    apic_access_mfn = mfn;
+    d->arch.hvm.vmx.apic_access_mfn = mfn;
 
     return 0;
 }
 
+static void vmx_free_vlapic_mapping(struct domain *d)
+{
+    mfn_t mfn = d->arch.hvm.vmx.apic_access_mfn;
+
+    d->arch.hvm.vmx.apic_access_mfn = _mfn(0);
+    if ( !mfn_eq(mfn, _mfn(0)) )
+    {
+        struct page_info *pg = mfn_to_page(mfn);
+
+        put_page_alloc_ref(pg);
+        put_page_and_type(pg);
+    }
+}
+
 static void vmx_install_vlapic_mapping(struct vcpu *v)
 {
+    mfn_t apic_access_mfn = v->domain->arch.hvm.vmx.apic_access_mfn;
     paddr_t virt_page_ma, apic_page_ma;
 
-    if ( !has_vlapic(v->domain) || mfn_eq(apic_access_mfn, INVALID_MFN) )
+    if ( mfn_eq(apic_access_mfn, _mfn(0)) )
         return;
 
     ASSERT(cpu_has_vmx_virtualize_apic_accesses);
diff --git a/xen/arch/x86/mm/shadow/set.c b/xen/arch/x86/mm/shadow/set.c
index 87e9c6eeb219..bd6c68b547c9 100644
--- a/xen/arch/x86/mm/shadow/set.c
+++ b/xen/arch/x86/mm/shadow/set.c
@@ -101,14 +101,6 @@ shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d, p2m_type_t type)
         owner = page_get_owner(pg);
     }
 
-    /*
-     * Check whether refcounting is suppressed on this page. For example,
-     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
-     * get accessed, and hence there's no need to refcount it.
-     */
-    if ( pg && page_refcounting_suppressed(pg) )
-        return 0;
-
     if ( owner == dom_io )
         owner = NULL;
 
diff --git a/xen/arch/x86/mm/shadow/types.h b/xen/arch/x86/mm/shadow/types.h
index 6970e7d6ea4a..814a4018535a 100644
--- a/xen/arch/x86/mm/shadow/types.h
+++ b/xen/arch/x86/mm/shadow/types.h
@@ -276,16 +276,9 @@ int shadow_set_l4e(struct domain *d, shadow_l4e_t *sl4e,
 static void inline
 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
 {
-    mfn_t mfn = shadow_l1e_get_mfn(sl1e);
-
     if ( !shadow_mode_refcounts(d) )
         return;
 
-    if ( mfn_valid(mfn) &&
-         /* See the respective comment in shadow_get_page_from_l1e(). */
-         page_refcounting_suppressed(mfn_to_page(mfn)) )
-        return;
-
     put_page_from_l1e(sl1e, d);
 }
 
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 03c9ccf627ab..8073af323b96 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -58,6 +58,7 @@ struct ept_data {
 #define _VMX_DOMAIN_PML_ENABLED    0
 #define VMX_DOMAIN_PML_ENABLED     (1ul << _VMX_DOMAIN_PML_ENABLED)
 struct vmx_domain {
+    mfn_t apic_access_mfn;
     /* VMX_DOMAIN_* */
     unsigned int status;
 
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 7bdf9c2290d8..e1bcea57a8f5 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -83,7 +83,7 @@
 #define PGC_state_offlined  PG_mask(2, 6)
 #define PGC_state_free      PG_mask(3, 6)
 #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
-/* Page is not reference counted (see below for caveats) */
+/* Page is not reference counted */
 #define _PGC_extra        PG_shift(7)
 #define PGC_extra         PG_mask(1, 7)
 
@@ -375,24 +375,6 @@ void zap_ro_mpt(mfn_t mfn);
 
 bool is_iomem_page(mfn_t mfn);
 
-/*
- * Pages with no owner which may get passed to functions wanting to
- * refcount them can be marked PGC_extra to bypass this refcounting (which
- * would fail due to the lack of an owner).
- *
- * (For pages with owner PGC_extra has different meaning.)
- */
-static inline void page_suppress_refcounting(struct page_info *pg)
-{
-   ASSERT(!page_get_owner(pg));
-   pg->count_info |= PGC_extra;
-}
-
-static inline bool page_refcounting_suppressed(const struct page_info *pg)
-{
-    return !page_get_owner(pg) && (pg->count_info & PGC_extra);
-}
-
 struct platform_bad_page {
     unsigned long mfn;
     unsigned int order;