Blob Blame History Raw
From 39664d9ee041f96e9c7ee131ed8ef72a4d19c9f8 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:30 +0000
Subject: [PATCH 4/4] xen/arm: Allocate and free P2M pages from the P2M pool

This commit sets up and tears down the P2M page pool for non-privileged Arm
guests by calling `p2m_set_allocation` and `p2m_teardown_allocation`.

- For dom0, P2M pages should come from the heap directly instead of the p2m
pool, so that the kernel may take advantage of the extended regions.

- For xl guests, the p2m pool is set up in `XEN_DOMCTL_shadow_op` and
destroyed in `domain_relinquish_resources`. Note that
domctl->u.shadow_op.mb is updated with the new size when setting the
p2m pool.

- For dom0less domUs, the p2m pool is set up before allocating memory
during domain creation. Users can specify the p2m pool size via the
`xen,domain-p2m-mem-mb` dts property.

To actually allocate/free pages from the p2m pool, this commit adds
two helper functions namely `p2m_alloc_page` and `p2m_free_page` to
`struct p2m_domain`. By replacing the `alloc_domheap_page` and
`free_domheap_page` with these two helper functions, p2m pages can
be added/removed from the list of p2m pool rather than from the heap.

Since the page returned by `p2m_alloc_page` is cleaned, take the opportunity
to remove the redundant `clean_page` in `p2m_create_table`.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 docs/misc/arm/device-tree/booting.txt |  8 ++++
 xen/arch/arm/domain.c                 |  6 +++
 xen/arch/arm/domain_build.c           | 29 ++++++++++++++
 xen/arch/arm/domctl.c                 | 23 ++++++++++-
 xen/arch/arm/p2m.c                    | 57 +++++++++++++++++++++++++--
 5 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
index 71895663a4de..d92ccc56ffe0 100644
--- a/docs/misc/arm/device-tree/booting.txt
+++ b/docs/misc/arm/device-tree/booting.txt
@@ -182,6 +182,14 @@ with the following properties:
     Both #address-cells and #size-cells need to be specified because
     both sub-nodes (described shortly) have reg properties.
 
+- xen,domain-p2m-mem-mb
+
+    Optional. A 32-bit integer specifying the number of megabytes of RAM
+    used for the domain P2M pool. This is in-sync with the shadow_memory
+    option in xl.cfg. Leaving this property out of the device tree results
+    in the default size of the domain P2M pool, i.e. 1MB per guest vCPU plus
+    4KB per MB of guest RAM plus 512KB for guest extended regions.
+
 Under the "xen,domain" compatible node, one or more sub-nodes are present
 for the DomU kernel and ramdisk.
 
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 2694c39127c5..a818f33a1afa 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -997,6 +997,7 @@ enum {
     PROG_page,
     PROG_mapping,
     PROG_p2m,
+    PROG_p2m_pool,
     PROG_done,
 };
 
@@ -1062,6 +1063,11 @@ int domain_relinquish_resources(struct domain *d)
         if ( ret )
             return ret;
 
+    PROGRESS(p2m_pool):
+        ret = p2m_teardown_allocation(d);
+        if( ret )
+            return ret;
+
     PROGRESS(done):
         break;
 
diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index d02bacbcd1ed..8aec3755ca5d 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -2833,6 +2833,21 @@ static void __init find_gnttab_region(struct domain *d,
            kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size);
 }
 
+static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb,
+                                             unsigned int smp_cpus)
+{
+    /*
+     * Keep in sync with libxl__get_required_paging_memory(): 256 pages (1MB)
+     * per vcpu, plus 1 page per MiB of RAM for the P2M map, plus 128 pages to
+     * cover extended regions. The "4 *" converts that page count to KiB.
+     */
+    unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128);
+
+    BUILD_BUG_ON(PAGE_SIZE != SZ_4K); /* the 4KiB-per-page factor above */
+
+    return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT); /* MiB -> pages */
+}
+
 static int __init construct_domain(struct domain *d, struct kernel_info *kinfo)
 {
     unsigned int i;
@@ -2924,6 +2939,8 @@ static int __init construct_domU(struct domain *d,
     struct kernel_info kinfo = {};
     int rc;
     u64 mem;
+    u32 p2m_mem_mb;
+    unsigned long p2m_pages;
 
     rc = dt_property_read_u64(node, "memory", &mem);
     if ( !rc )
@@ -2933,6 +2950,18 @@ static int __init construct_domU(struct domain *d,
     }
     kinfo.unassigned_mem = (paddr_t)mem * SZ_1K;
 
+    rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb);
+    /* If xen,domain-p2m-mem-mb is not specified, use the default value. */
+    p2m_pages = rc ?
+                p2m_mem_mb << (20 - PAGE_SHIFT) :
+                domain_p2m_pages(mem, d->max_vcpus);
+
+    spin_lock(&d->arch.paging.lock);
+    rc = p2m_set_allocation(d, p2m_pages, NULL);
+    spin_unlock(&d->arch.paging.lock);
+    if ( rc != 0 )
+        return rc;
+
     printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem);
 
     kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 9bf72e693019..c8fdeb124084 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -50,6 +50,9 @@ static int handle_vuart_init(struct domain *d,
 static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
                        XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
+    long rc;
+    bool preempted = false;
+
     if ( unlikely(d == current->domain) )
     {
         printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
@@ -66,9 +69,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
     switch ( sc->op )
     {
     case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
-        return 0;
+    {
+        /* Allow and handle preemption */
+        spin_lock(&d->arch.paging.lock);
+        rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
+        spin_unlock(&d->arch.paging.lock);
+
+        if ( preempted )
+            /* Not finished. Set up to re-run the call. */
+            rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+                                               u_domctl);
+        else
+            /* Finished. Return the new allocation. */
+            sc->mb = p2m_get_allocation(d);
+
+        return rc;
+    }
     case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+    {
+        sc->mb = p2m_get_allocation(d);
         return 0;
+    }
     default:
     {
         printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 79f3d37f5230..1bf9cbeb53cf 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -50,6 +50,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+static struct page_info *p2m_alloc_page(struct domain *d)
+{
+    struct page_info *pg; /* page handed to the caller, NULL on failure */
+
+    spin_lock(&d->arch.paging.lock);
+    /*
+     * For the hardware domain there should be no limit on the number of pages
+     * that can be allocated, so that the kernel may take advantage of the
+     * extended regions. Hence, allocate its p2m pages straight from the heap.
+     */
+    if ( is_hardware_domain(d) )
+    {
+        pg = alloc_domheap_page(NULL, 0);
+        if ( pg == NULL )
+        {
+            printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n");
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+    }
+    else /* every other domain draws from its own P2M pool free list */
+    {
+        pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+        if ( unlikely(!pg) ) /* free list is empty */
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+        d->arch.paging.p2m_total_pages--; /* page has left the pool */
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return pg;
+}
+
+static void p2m_free_page(struct domain *d, struct page_info *pg)
+{
+    spin_lock(&d->arch.paging.lock); /* held across list/counter update */
+    if ( is_hardware_domain(d) )
+        free_domheap_page(pg); /* hwdom pages came from the heap */
+    else
+    {
+        d->arch.paging.p2m_total_pages++; /* page goes back to the pool */
+        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+    }
+    spin_unlock(&d->arch.paging.lock);
+}
+
 /* Return the size of the pool, rounded up to the nearest MB */
 unsigned int p2m_get_allocation(struct domain *d)
 {
@@ -751,7 +799,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry)
 
     ASSERT(!p2m_is_valid(*entry));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( page == NULL )
         return -ENOMEM;
 
@@ -878,7 +926,7 @@ static void p2m_free_entry(struct p2m_domain *p2m,
     pg = mfn_to_page(mfn);
 
     page_list_del(pg, &p2m->pages);
-    free_domheap_page(pg);
+    p2m_free_page(p2m->domain, pg);
 }
 
 static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
@@ -902,7 +950,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
     ASSERT(level < target);
     ASSERT(p2m_is_superpage(*entry, level));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( !page )
         return false;
 
@@ -1641,7 +1689,7 @@ int p2m_teardown(struct domain *d)
 
     while ( (pg = page_list_remove_head(&p2m->pages)) )
     {
-        free_domheap_page(pg);
+        p2m_free_page(p2m->domain, pg);
         count++;
         /* Arbitrarily preempt every 512 iterations */
         if ( !(count % 512) && hypercall_preempt_check() )
@@ -1665,6 +1713,7 @@ void p2m_final_teardown(struct domain *d)
         return;
 
     ASSERT(page_list_empty(&p2m->pages));
+    ASSERT(page_list_empty(&d->arch.paging.p2m_freelist));
 
     if ( p2m->root )
         free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
-- 
2.37.1