diff --git a/xen.spec b/xen.spec index 29c69a4..fde47fb 100644 --- a/xen.spec +++ b/xen.spec @@ -55,7 +55,7 @@ Summary: Xen is a virtual machine monitor Name: xen Version: 4.17.2 -Release: 3%{?dist} +Release: 4%{?dist} License: GPLv2+ and LGPLv2+ and BSD URL: http://xen.org/ Source0: https://downloads.xenproject.org/release/xen/%{version}/xen-%{version}.tar.gz @@ -124,6 +124,21 @@ Patch59: xsa439-0007-x86-entry-Track-the-IST-ness-of-an-entry-for-the-exi.patch Patch60: xsa439-0008-x86-spec-ctrl-Issue-VERW-during-IST-exit-to-Xen.patch Patch61: xsa439-0009-x86-amd-Introduce-is_zen-1-2-_uarch-predicates.patch Patch62: xsa439-0010-x86-spec-ctrl-Mitigate-the-Zen1-DIV-leakage.patch +Patch63: xsa440-4.17.patch +Patch64: xsa442-4.17.patch +Patch65: xsa443-4.17-01.patch +Patch66: xsa443-4.17-02.patch +Patch67: xsa443-4.17-03.patch +Patch68: xsa443-4.17-04.patch +Patch69: xsa443-4.17-05.patch +Patch70: xsa443-4.17-06.patch +Patch71: xsa443-4.17-07.patch +Patch72: xsa443-4.17-08.patch +Patch73: xsa443-4.17-09.patch +Patch74: xsa443-4.17-10.patch +Patch75: xsa443-4.17-11.patch +Patch76: xsa444-4.17-1.patch +Patch77: xsa444-4.17-2.patch %if %build_qemutrad @@ -352,6 +367,21 @@ manage Xen virtual machines. %patch 60 -p1 %patch 61 -p1 %patch 62 -p1 +%patch 63 -p1 +%patch 64 -p1 +%patch 65 -p1 +%patch 66 -p1 +%patch 67 -p1 +%patch 68 -p1 +%patch 69 -p1 +%patch 70 -p1 +%patch 71 -p1 +%patch 72 -p1 +%patch 73 -p1 +%patch 74 -p1 +%patch 75 -p1 +%patch 76 -p1 +%patch 77 -p1 # qemu-xen-traditional patches pushd tools/qemu-xen-traditional @@ -959,6 +989,15 @@ fi %endif %changelog +* Tue Oct 10 2023 Michael Young - 4.17.2-4 +- xenstored: A transaction conflict can crash C Xenstored [XSA-440, + CVE-2023-34323] +- x86/AMD: missing IOMMU TLB flushing [XSA-442, CVE-2023-34326] +- Multiple vulnerabilities in libfsimage disk handling [XSA-443, + CVE-2023-34325] +- x86/AMD: Debug Mask handling [XSA-444, CVE-2023-34327, + CVE-2023-34328] + * Sun Oct 08 2023 Michael Young - 4.17.2-3 - rebuild (f40) for OCaml 5.1 diff --git a/xsa440-4.17.patch b/xsa440-4.17.patch new file mode 100644 index 0000000..4941afc --- /dev/null +++ b/xsa440-4.17.patch @@ -0,0 +1,58 @@ +From 5d8b3d1ec98e56155d9650d7f4a70cd8ba9dc27d Mon Sep 17 00:00:00 2001 +From: Julien Grall +Date: Fri, 22 Sep 2023 11:32:16 +0100 +Subject: tools/xenstored: domain_entry_fix(): Handle conflicting transaction + +The function domain_entry_fix() will be initially called to check if the +quota is correct before attempt to commit any nodes. So it would be +possible that accounting is temporarily negative. This is the case +in the following sequence: + + 1) Create 50 nodes + 2) Start two transactions + 3) Delete all the nodes in each transaction + 4) Commit the two transactions + +Because the first transaction will have succeed and updated the +accounting, there is no guarantee that 'd->nbentry + num' will still +be above 0. So the assert() would be triggered. +The assert() was introduced in dbef1f748289 ("tools/xenstore: simplify +and fix per domain node accounting") with the assumption that the +value can't be negative. As this is not true revert to the original +check but restricted to the path where we don't update. Take the +opportunity to explain the rationale behind the check. + +This CVE-2023-34323 / XSA-440. + +Reported-by: Stanislav Uschakow +Fixes: dbef1f748289 ("tools/xenstore: simplify and fix per domain node accounting") +Signed-off-by: Julien Grall +Reviewed-by: Juergen Gross + +diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c +index aa86892fed9e..6074df210c6e 100644 +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -1094,10 +1094,20 @@ int domain_entry_fix(unsigned int domid, int num, bool update) + } + + cnt = d->nbentry + num; +- assert(cnt >= 0); + +- if (update) ++ if (update) { ++ assert(cnt >= 0); + d->nbentry = cnt; ++ } else if (cnt < 0) { ++ /* ++ * In a transaction when a node is being added/removed AND ++ * the same node has been added/removed outside the ++ * transaction in parallel, the result value may be negative. ++ * This is no problem, as the transaction will fail due to ++ * the resulting conflict. So override 'cnt'. ++ */ ++ cnt = 0; ++ } + + return domid_is_unprivileged(domid) ? cnt : 0; + } diff --git a/xsa442-4.17.patch b/xsa442-4.17.patch new file mode 100644 index 0000000..a78bfdd --- /dev/null +++ b/xsa442-4.17.patch @@ -0,0 +1,185 @@ +From 5b2ccb60ff22fbff44dd66214c2956a434ee6271 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Tue, 13 Jun 2023 15:01:05 +0200 +Subject: [PATCH] iommu/amd-vi: flush IOMMU TLB when flushing the DTE +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The caching invalidation guidelines from the AMD-Vi specification (48882—Rev +3.07-PUB—Oct 2022) seem to be misleading on some hardware, as devices will +malfunction (see stale DMA mappings) if some fields of the DTE are updated but +the IOMMU TLB is not flushed. This has been observed in practice on AMD +systems. Due to the lack of guidance from the currently published +specification this patch aims to increase the flushing done in order to prevent +device malfunction. + +In order to fix, issue an INVALIDATE_IOMMU_PAGES command from +amd_iommu_flush_device(), flushing all the address space. Note this requires +callers to be adjusted in order to pass the DomID on the DTE previous to the +modification. + +Some call sites don't provide a valid DomID to amd_iommu_flush_device() in +order to avoid the flush. That's because the device had address translations +disabled and hence the previous DomID on the DTE is not valid. Note the +current logic relies on the entity disabling address translations to also flush +the TLB of the in use DomID. + +Device I/O TLB flushing when ATS are enabled is not covered by the current +change, as ATS usage is not security supported. + +This is XSA-442 / CVE-2023-34326 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +--- + xen/drivers/passthrough/amd/iommu.h | 3 ++- + xen/drivers/passthrough/amd/iommu_cmd.c | 10 +++++++++- + xen/drivers/passthrough/amd/iommu_guest.c | 5 +++-- + xen/drivers/passthrough/amd/iommu_init.c | 6 +++++- + xen/drivers/passthrough/amd/pci_amd_iommu.c | 14 ++++++++++---- + 5 files changed, 29 insertions(+), 9 deletions(-) + +diff --git a/xen/drivers/passthrough/amd/iommu.h b/xen/drivers/passthrough/amd/iommu.h +index 5429ada58ef5..a58be28bf96d 100644 +--- a/xen/drivers/passthrough/amd/iommu.h ++++ b/xen/drivers/passthrough/amd/iommu.h +@@ -283,7 +283,8 @@ void amd_iommu_flush_pages(struct domain *d, unsigned long dfn, + unsigned int order); + void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev, + uint64_t gaddr, unsigned int order); +-void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf); ++void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf, ++ domid_t domid); + void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf); + void amd_iommu_flush_all_caches(struct amd_iommu *iommu); + +diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c b/xen/drivers/passthrough/amd/iommu_cmd.c +index 40ddf366bb4d..cb28b36abc38 100644 +--- a/xen/drivers/passthrough/amd/iommu_cmd.c ++++ b/xen/drivers/passthrough/amd/iommu_cmd.c +@@ -363,10 +363,18 @@ void amd_iommu_flush_pages(struct domain *d, + _amd_iommu_flush_pages(d, __dfn_to_daddr(dfn), order); + } + +-void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf) ++void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf, ++ domid_t domid) + { + invalidate_dev_table_entry(iommu, bdf); + flush_command_buffer(iommu, 0); ++ ++ /* Also invalidate IOMMU TLB entries when flushing the DTE. */ ++ if ( domid != DOMID_INVALID ) ++ { ++ invalidate_iommu_pages(iommu, INV_IOMMU_ALL_PAGES_ADDRESS, domid, 0); ++ flush_command_buffer(iommu, 0); ++ } + } + + void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf) +diff --git a/xen/drivers/passthrough/amd/iommu_guest.c b/xen/drivers/passthrough/amd/iommu_guest.c +index 80a331f546ed..be86bce6fb03 100644 +--- a/xen/drivers/passthrough/amd/iommu_guest.c ++++ b/xen/drivers/passthrough/amd/iommu_guest.c +@@ -385,7 +385,7 @@ static int do_completion_wait(struct domain *d, cmd_entry_t *cmd) + + static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd) + { +- uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id; ++ uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id, prev_domid; + struct amd_iommu_dte *gdte, *mdte, *dte_base; + struct amd_iommu *iommu = NULL; + struct guest_iommu *g_iommu; +@@ -445,13 +445,14 @@ static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd) + req_id = get_dma_requestor_id(iommu->seg, mbdf); + dte_base = iommu->dev_table.buffer; + mdte = &dte_base[req_id]; ++ prev_domid = mdte->domain_id; + + spin_lock_irqsave(&iommu->lock, flags); + dte_set_gcr3_table(mdte, hdom_id, gcr3_mfn << PAGE_SHIFT, gv, glx); + + spin_unlock_irqrestore(&iommu->lock, flags); + +- amd_iommu_flush_device(iommu, req_id); ++ amd_iommu_flush_device(iommu, req_id, prev_domid); + + return 0; + } +diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c +index 166570648d26..101a60ce1794 100644 +--- a/xen/drivers/passthrough/amd/iommu_init.c ++++ b/xen/drivers/passthrough/amd/iommu_init.c +@@ -1547,7 +1547,11 @@ static int cf_check _invalidate_all_devices( + req_id = ivrs_mappings[bdf].dte_requestor_id; + if ( iommu ) + { +- amd_iommu_flush_device(iommu, req_id); ++ /* ++ * IOMMU TLB flush performed separately (see ++ * invalidate_all_domain_pages()). ++ */ ++ amd_iommu_flush_device(iommu, req_id, DOMID_INVALID); + amd_iommu_flush_intremap(iommu, req_id); + } + } +diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c +index 94e37755064b..8641b84712a0 100644 +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -192,10 +192,13 @@ static int __must_check amd_iommu_setup_domain_device( + + spin_unlock_irqrestore(&iommu->lock, flags); + +- amd_iommu_flush_device(iommu, req_id); ++ /* DTE didn't have DMA translations enabled, do not flush the TLB. */ ++ amd_iommu_flush_device(iommu, req_id, DOMID_INVALID); + } + else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) ) + { ++ domid_t prev_domid = dte->domain_id; ++ + /* + * Strictly speaking if the device is the only one with this requestor + * ID, it could be allowed to be re-assigned regardless of unity map +@@ -252,7 +255,7 @@ static int __must_check amd_iommu_setup_domain_device( + + spin_unlock_irqrestore(&iommu->lock, flags); + +- amd_iommu_flush_device(iommu, req_id); ++ amd_iommu_flush_device(iommu, req_id, prev_domid); + } + else + spin_unlock_irqrestore(&iommu->lock, flags); +@@ -421,6 +424,8 @@ static void amd_iommu_disable_domain_device(const struct domain *domain, + spin_lock_irqsave(&iommu->lock, flags); + if ( dte->tv || dte->v ) + { ++ domid_t prev_domid = dte->domain_id; ++ + /* See the comment in amd_iommu_setup_device_table(). */ + dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_ABORTED; + smp_wmb(); +@@ -439,7 +444,7 @@ static void amd_iommu_disable_domain_device(const struct domain *domain, + + spin_unlock_irqrestore(&iommu->lock, flags); + +- amd_iommu_flush_device(iommu, req_id); ++ amd_iommu_flush_device(iommu, req_id, prev_domid); + + AMD_IOMMU_DEBUG("Disable: device id = %#x, " + "domain = %d, paging mode = %d\n", +@@ -610,7 +615,8 @@ static int cf_check amd_iommu_add_device(u8 devfn, struct pci_dev *pdev) + + spin_unlock_irqrestore(&iommu->lock, flags); + +- amd_iommu_flush_device(iommu, bdf); ++ /* DTE didn't have DMA translations enabled, do not flush the TLB. */ ++ amd_iommu_flush_device(iommu, bdf, DOMID_INVALID); + } + + if ( amd_iommu_reserve_domain_unity_map( +-- +2.42.0 + diff --git a/xsa443-4.17-01.patch b/xsa443-4.17-01.patch new file mode 100644 index 0000000..d9ca3f8 --- /dev/null +++ b/xsa443-4.17-01.patch @@ -0,0 +1,70 @@ +From 7e48562bf34e90f907491a0595782d2daa1ff3ad Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Thu, 14 Sep 2023 13:22:50 +0100 +Subject: [PATCH 01/11] libfsimage/xfs: Remove dead code + +xfs_info.agnolog (and related code) and XFS_INO_AGBNO_BITS are dead code +that serve no purpose. + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Alejandro Vallejo +Reviewed-by: Jan Beulich +--- + tools/libfsimage/xfs/fsys_xfs.c | 18 ------------------ + 1 file changed, 18 deletions(-) + +diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c +index d735a88e55f3..2800699f5985 100644 +--- a/tools/libfsimage/xfs/fsys_xfs.c ++++ b/tools/libfsimage/xfs/fsys_xfs.c +@@ -37,7 +37,6 @@ struct xfs_info { + int blklog; + int inopblog; + int agblklog; +- int agnolog; + unsigned int nextents; + xfs_daddr_t next; + xfs_daddr_t daddr; +@@ -65,9 +64,7 @@ static struct xfs_info xfs; + + #define XFS_INO_MASK(k) ((xfs_uint32_t)((1ULL << (k)) - 1)) + #define XFS_INO_OFFSET_BITS xfs.inopblog +-#define XFS_INO_AGBNO_BITS xfs.agblklog + #define XFS_INO_AGINO_BITS (xfs.agblklog + xfs.inopblog) +-#define XFS_INO_AGNO_BITS xfs.agnolog + + static inline xfs_agblock_t + agino2agbno (xfs_agino_t agino) +@@ -149,20 +146,6 @@ xt_len (xfs_bmbt_rec_32_t *r) + return le32(r->l3) & mask32lo(21); + } + +-static inline int +-xfs_highbit32(xfs_uint32_t v) +-{ +- int i; +- +- if (--v) { +- for (i = 0; i < 31; i++, v >>= 1) { +- if (v == 0) +- return i; +- } +- } +- return 0; +-} +- + static int + isinxt (xfs_fileoff_t key, xfs_fileoff_t offset, xfs_filblks_t len) + { +@@ -472,7 +455,6 @@ xfs_mount (fsi_file_t *ffi, const char *options) + + xfs.inopblog = super.sb_inopblog; + xfs.agblklog = super.sb_agblklog; +- xfs.agnolog = xfs_highbit32 (le32(super.sb_agcount)); + + xfs.btnode_ptr0_off = + ((xfs.bsize - sizeof(xfs_btree_block_t)) / +-- +2.42.0 + diff --git a/xsa443-4.17-02.patch b/xsa443-4.17-02.patch new file mode 100644 index 0000000..0f2edaf --- /dev/null +++ b/xsa443-4.17-02.patch @@ -0,0 +1,32 @@ +From c26327795b78c93f6fa6d5d46e34f59dc4046601 Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Thu, 14 Sep 2023 13:22:51 +0100 +Subject: [PATCH 02/11] libfsimage/xfs: Amend mask32lo() to allow the value 32 + +agblklog could plausibly be 32, but that would overflow this shift. +Perform the shift as ULL and cast to u32 at the end instead. + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Alejandro Vallejo +Acked-by: Jan Beulich +--- + tools/libfsimage/xfs/fsys_xfs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c +index 2800699f5985..4720bb4505c8 100644 +--- a/tools/libfsimage/xfs/fsys_xfs.c ++++ b/tools/libfsimage/xfs/fsys_xfs.c +@@ -60,7 +60,7 @@ static struct xfs_info xfs; + #define inode ((xfs_dinode_t *)((char *)FSYS_BUF + 8192)) + #define icore (inode->di_core) + +-#define mask32lo(n) (((xfs_uint32_t)1 << (n)) - 1) ++#define mask32lo(n) ((xfs_uint32_t)((1ull << (n)) - 1)) + + #define XFS_INO_MASK(k) ((xfs_uint32_t)((1ULL << (k)) - 1)) + #define XFS_INO_OFFSET_BITS xfs.inopblog +-- +2.42.0 + diff --git a/xsa443-4.17-03.patch b/xsa443-4.17-03.patch new file mode 100644 index 0000000..b89721a --- /dev/null +++ b/xsa443-4.17-03.patch @@ -0,0 +1,137 @@ +From 199f0538bbec052028679a55ea512437170854c9 Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Thu, 14 Sep 2023 13:22:52 +0100 +Subject: [PATCH 03/11] libfsimage/xfs: Sanity-check the superblock during + mounts + +Sanity-check the XFS superblock for wellformedness at the mount handler. +This forces pygrub to abort parsing a potentially malformed filesystem and +ensures the invariants assumed throughout the rest of the code hold. + +Also, derive parameters from previously sanitized parameters where possible +(rather than reading them off the superblock) + +The code doesn't try to avoid overflowing the end of the disk, because +that's an unlikely and benign error. Parameters used in calculations of +xfs_daddr_t (like the root inode index) aren't in critical need of being +sanitized. + +The sanitization of agblklog is basically checking that no obvious +overflows happen on agblklog, and then ensuring agblocks is contained in +the range (2^(sb_agblklog-1), 2^sb_agblklog]. + +This is part of XSA-443 / CVE-2023-34325 + +Reported-by: Ferdinand Nölscher +Signed-off-by: Alejandro Vallejo +Reviewed-by: Jan Beulich +--- + tools/libfsimage/xfs/fsys_xfs.c | 48 ++++++++++++++++++++++++++------- + tools/libfsimage/xfs/xfs.h | 12 +++++++++ + 2 files changed, 50 insertions(+), 10 deletions(-) + +diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c +index 4720bb4505c8..e4eb7e1ee26f 100644 +--- a/tools/libfsimage/xfs/fsys_xfs.c ++++ b/tools/libfsimage/xfs/fsys_xfs.c +@@ -17,6 +17,7 @@ + * along with this program; If not, see . + */ + ++#include + #include + #include "xfs.h" + +@@ -433,29 +434,56 @@ first_dentry (fsi_file_t *ffi, xfs_ino_t *ino) + return next_dentry (ffi, ino); + } + ++static bool ++xfs_sb_is_invalid (const xfs_sb_t *super) ++{ ++ return (le32(super->sb_magicnum) != XFS_SB_MAGIC) ++ || ((le16(super->sb_versionnum) & XFS_SB_VERSION_NUMBITS) != ++ XFS_SB_VERSION_4) ++ || (super->sb_inodelog < XFS_SB_INODELOG_MIN) ++ || (super->sb_inodelog > XFS_SB_INODELOG_MAX) ++ || (super->sb_blocklog < XFS_SB_BLOCKLOG_MIN) ++ || (super->sb_blocklog > XFS_SB_BLOCKLOG_MAX) ++ || (super->sb_blocklog < super->sb_inodelog) ++ || (super->sb_agblklog > XFS_SB_AGBLKLOG_MAX) ++ || ((1ull << super->sb_agblklog) < le32(super->sb_agblocks)) ++ || (((1ull << super->sb_agblklog) >> 1) >= ++ le32(super->sb_agblocks)) ++ || ((super->sb_blocklog + super->sb_dirblklog) >= ++ XFS_SB_DIRBLK_NUMBITS); ++} ++ + static int + xfs_mount (fsi_file_t *ffi, const char *options) + { + xfs_sb_t super; + + if (!devread (ffi, 0, 0, sizeof(super), (char *)&super) +- || (le32(super.sb_magicnum) != XFS_SB_MAGIC) +- || ((le16(super.sb_versionnum) +- & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4) ) { ++ || xfs_sb_is_invalid(&super)) { + return 0; + } + +- xfs.bsize = le32 (super.sb_blocksize); +- xfs.blklog = super.sb_blocklog; +- xfs.bdlog = xfs.blklog - SECTOR_BITS; ++ /* ++ * Not sanitized. It's exclusively used to generate disk addresses, ++ * so it's not important from a security standpoint. ++ */ + xfs.rootino = le64 (super.sb_rootino); +- xfs.isize = le16 (super.sb_inodesize); +- xfs.agblocks = le32 (super.sb_agblocks); +- xfs.dirbsize = xfs.bsize << super.sb_dirblklog; + +- xfs.inopblog = super.sb_inopblog; ++ /* ++ * Sanitized to be consistent with each other, only used to ++ * generate disk addresses, so it's safe ++ */ ++ xfs.agblocks = le32 (super.sb_agblocks); + xfs.agblklog = super.sb_agblklog; + ++ /* Derived from sanitized parameters */ ++ xfs.bsize = 1 << super.sb_blocklog; ++ xfs.blklog = super.sb_blocklog; ++ xfs.bdlog = super.sb_blocklog - SECTOR_BITS; ++ xfs.isize = 1 << super.sb_inodelog; ++ xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog); ++ xfs.inopblog = super.sb_blocklog - super.sb_inodelog; ++ + xfs.btnode_ptr0_off = + ((xfs.bsize - sizeof(xfs_btree_block_t)) / + (sizeof (xfs_bmbt_key_t) + sizeof (xfs_bmbt_ptr_t))) +diff --git a/tools/libfsimage/xfs/xfs.h b/tools/libfsimage/xfs/xfs.h +index 40699281e44d..b87e37d3d7e9 100644 +--- a/tools/libfsimage/xfs/xfs.h ++++ b/tools/libfsimage/xfs/xfs.h +@@ -134,6 +134,18 @@ typedef struct xfs_sb + xfs_uint8_t sb_dummy[7]; /* padding */ + } xfs_sb_t; + ++/* Bound taken from xfs.c in GRUB2. It doesn't exist in the spec */ ++#define XFS_SB_DIRBLK_NUMBITS 27 ++/* Implied by the XFS specification. The minimum block size is 512 octets */ ++#define XFS_SB_BLOCKLOG_MIN 9 ++/* Implied by the XFS specification. The maximum block size is 65536 octets */ ++#define XFS_SB_BLOCKLOG_MAX 16 ++/* Implied by the XFS specification. The minimum inode size is 256 octets */ ++#define XFS_SB_INODELOG_MIN 8 ++/* Implied by the XFS specification. The maximum inode size is 2048 octets */ ++#define XFS_SB_INODELOG_MAX 11 ++/* High bound for sb_agblklog */ ++#define XFS_SB_AGBLKLOG_MAX 32 + + /* those are from xfs_btree.h */ + +-- +2.42.0 + diff --git a/xsa443-4.17-04.patch b/xsa443-4.17-04.patch new file mode 100644 index 0000000..dde095e --- /dev/null +++ b/xsa443-4.17-04.patch @@ -0,0 +1,61 @@ +From c66fd01277939634c624c8340838682d9d4fd839 Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Thu, 14 Sep 2023 13:22:53 +0100 +Subject: [PATCH 04/11] libfsimage/xfs: Add compile-time check to libfsimage + +Adds the common tools include folder to the -I compile flags +of libfsimage. This allows us to use: + xen-tools/common-macros.h:BUILD_BUG_ON() + +With it, statically assert a sanitized "blocklog - SECTOR_BITS" cannot +underflow. + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Alejandro Vallejo +Reviewed-by: Jan Beulich +--- + tools/libfsimage/common.mk | 2 +- + tools/libfsimage/xfs/fsys_xfs.c | 4 +++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/tools/libfsimage/common.mk b/tools/libfsimage/common.mk +index 4fc8c6679599..e4336837d045 100644 +--- a/tools/libfsimage/common.mk ++++ b/tools/libfsimage/common.mk +@@ -1,7 +1,7 @@ + include $(XEN_ROOT)/tools/Rules.mk + + FSDIR := $(libdir)/xenfsimage +-CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ -DFSIMAGE_FSDIR=\"$(FSDIR)\" ++CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ $(CFLAGS_xeninclude) -DFSIMAGE_FSDIR=\"$(FSDIR)\" + CFLAGS += -D_GNU_SOURCE + LDFLAGS += -L../common/ + +diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c +index e4eb7e1ee26f..4a8dd6f2397b 100644 +--- a/tools/libfsimage/xfs/fsys_xfs.c ++++ b/tools/libfsimage/xfs/fsys_xfs.c +@@ -19,6 +19,7 @@ + + #include + #include ++#include + #include "xfs.h" + + #define MAX_LINK_COUNT 8 +@@ -477,9 +478,10 @@ xfs_mount (fsi_file_t *ffi, const char *options) + xfs.agblklog = super.sb_agblklog; + + /* Derived from sanitized parameters */ ++ BUILD_BUG_ON(XFS_SB_BLOCKLOG_MIN < SECTOR_BITS); ++ xfs.bdlog = super.sb_blocklog - SECTOR_BITS; + xfs.bsize = 1 << super.sb_blocklog; + xfs.blklog = super.sb_blocklog; +- xfs.bdlog = super.sb_blocklog - SECTOR_BITS; + xfs.isize = 1 << super.sb_inodelog; + xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog); + xfs.inopblog = super.sb_blocklog - super.sb_inodelog; +-- +2.42.0 + diff --git a/xsa443-4.17-05.patch b/xsa443-4.17-05.patch new file mode 100644 index 0000000..b2f5daa --- /dev/null +++ b/xsa443-4.17-05.patch @@ -0,0 +1,59 @@ +From ad5d0db5e68e5d4e79255fa85d9cb0069bb1c5d5 Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Mon, 25 Sep 2023 18:32:21 +0100 +Subject: [PATCH 05/11] tools/pygrub: Remove unnecessary hypercall + +There's a hypercall being issued in order to determine whether PV64 is +supported, but since Xen 4.3 that's strictly true so it's not required. + +Plus, this way we can avoid mapping the privcmd interface altogether in the +depriv pygrub. + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Alejandro Vallejo +Reviewed-by: Andrew Cooper +--- + tools/pygrub/src/pygrub | 12 +----------- + 1 file changed, 1 insertion(+), 11 deletions(-) + +diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub +index ce7ab0eb8cf3..ce4e07d3e823 100755 +--- a/tools/pygrub/src/pygrub ++++ b/tools/pygrub/src/pygrub +@@ -18,7 +18,6 @@ import os, sys, string, struct, tempfile, re, traceback, stat, errno + import copy + import logging + import platform +-import xen.lowlevel.xc + + import curses, _curses, curses.textpad, curses.ascii + import getopt +@@ -668,14 +667,6 @@ def run_grub(file, entry, fs, cfg_args): + + return grubcfg + +-def supports64bitPVguest(): +- xc = xen.lowlevel.xc.xc() +- caps = xc.xeninfo()['xen_caps'].split(" ") +- for cap in caps: +- if cap == "xen-3.0-x86_64": +- return True +- return False +- + # If nothing has been specified, look for a Solaris domU. If found, perform the + # necessary tweaks. + def sniff_solaris(fs, cfg): +@@ -684,8 +675,7 @@ def sniff_solaris(fs, cfg): + return cfg + + if not cfg["kernel"]: +- if supports64bitPVguest() and \ +- fs.file_exists("/platform/i86xpv/kernel/amd64/unix"): ++ if fs.file_exists("/platform/i86xpv/kernel/amd64/unix"): + cfg["kernel"] = "/platform/i86xpv/kernel/amd64/unix" + cfg["ramdisk"] = "/platform/i86pc/amd64/boot_archive" + elif fs.file_exists("/platform/i86xpv/kernel/unix"): +-- +2.42.0 + diff --git a/xsa443-4.17-06.patch b/xsa443-4.17-06.patch new file mode 100644 index 0000000..22af109 --- /dev/null +++ b/xsa443-4.17-06.patch @@ -0,0 +1,65 @@ +From d3ceb0b314005a656dd2ca4b2821575a36f8426d Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Mon, 25 Sep 2023 18:32:22 +0100 +Subject: [PATCH 06/11] tools/pygrub: Small refactors + +Small tidy up to ensure output_directory always has a trailing '/' to ease +concatenating paths and that `output` can only be a filename or None. + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Alejandro Vallejo +Acked-by: Andrew Cooper +--- + tools/pygrub/src/pygrub | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub +index ce4e07d3e823..1042c05b8676 100755 +--- a/tools/pygrub/src/pygrub ++++ b/tools/pygrub/src/pygrub +@@ -793,7 +793,7 @@ if __name__ == "__main__": + debug = False + not_really = False + output_format = "sxp" +- output_directory = "/var/run/xen/pygrub" ++ output_directory = "/var/run/xen/pygrub/" + + # what was passed in + incfg = { "kernel": None, "ramdisk": None, "args": "" } +@@ -815,7 +815,8 @@ if __name__ == "__main__": + usage() + sys.exit() + elif o in ("--output",): +- output = a ++ if a != "-": ++ output = a + elif o in ("--kernel",): + incfg["kernel"] = a + elif o in ("--ramdisk",): +@@ -847,12 +848,11 @@ if __name__ == "__main__": + if not os.path.isdir(a): + print("%s is not an existing directory" % a) + sys.exit(1) +- output_directory = a ++ output_directory = a + '/' + + if debug: + logging.basicConfig(level=logging.DEBUG) + +- + try: + os.makedirs(output_directory, 0o700) + except OSError as e: +@@ -861,7 +861,7 @@ if __name__ == "__main__": + else: + raise + +- if output is None or output == "-": ++ if output is None: + fd = sys.stdout.fileno() + else: + fd = os.open(output, os.O_WRONLY) +-- +2.42.0 + diff --git a/xsa443-4.17-07.patch b/xsa443-4.17-07.patch new file mode 100644 index 0000000..94da883 --- /dev/null +++ b/xsa443-4.17-07.patch @@ -0,0 +1,105 @@ +From 9e80cfecde338cea0db136c2fb5ed78d6081e05f Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Mon, 25 Sep 2023 18:32:23 +0100 +Subject: [PATCH 07/11] tools/pygrub: Open the output files earlier + +This patch allows pygrub to get ahold of every RW file descriptor it needs +early on. A later patch will clamp the filesystem it can access so it can't +obtain any others. + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Alejandro Vallejo +Acked-by: Andrew Cooper +--- + tools/pygrub/src/pygrub | 37 ++++++++++++++++++++++--------------- + 1 file changed, 22 insertions(+), 15 deletions(-) + +diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub +index 1042c05b8676..91e2ec2ab105 100755 +--- a/tools/pygrub/src/pygrub ++++ b/tools/pygrub/src/pygrub +@@ -738,8 +738,7 @@ if __name__ == "__main__": + def usage(): + print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--offset=] " %(sys.argv[0],), file=sys.stderr) + +- def copy_from_image(fs, file_to_read, file_type, output_directory, +- not_really): ++ def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really): + if not_really: + if fs.file_exists(file_to_read): + return "<%s:%s>" % (file_type, file_to_read) +@@ -750,21 +749,18 @@ if __name__ == "__main__": + except Exception as e: + print(e, file=sys.stderr) + sys.exit("Error opening %s in guest" % file_to_read) +- (tfd, ret) = tempfile.mkstemp(prefix="boot_"+file_type+".", +- dir=output_directory) + dataoff = 0 + while True: + data = datafile.read(FS_READ_MAX, dataoff) + if len(data) == 0: +- os.close(tfd) ++ os.close(fd_dst) + del datafile +- return ret ++ return + try: +- os.write(tfd, data) ++ os.write(fd_dst, data) + except Exception as e: + print(e, file=sys.stderr) +- os.close(tfd) +- os.unlink(ret) ++ os.unlink(path_dst) + del datafile + sys.exit("Error writing temporary copy of "+file_type) + dataoff += len(data) +@@ -861,6 +857,14 @@ if __name__ == "__main__": + else: + raise + ++ if not_really: ++ fd_kernel = path_kernel = fd_ramdisk = path_ramdisk = None ++ else: ++ (fd_kernel, path_kernel) = tempfile.mkstemp(prefix="boot_kernel.", ++ dir=output_directory) ++ (fd_ramdisk, path_ramdisk) = tempfile.mkstemp(prefix="boot_ramdisk.", ++ dir=output_directory) ++ + if output is None: + fd = sys.stdout.fileno() + else: +@@ -920,20 +924,23 @@ if __name__ == "__main__": + if fs is None: + raise RuntimeError("Unable to find partition containing kernel") + +- bootcfg["kernel"] = copy_from_image(fs, chosencfg["kernel"], "kernel", +- output_directory, not_really) ++ copy_from_image(fs, chosencfg["kernel"], "kernel", ++ fd_kernel, path_kernel, not_really) ++ bootcfg["kernel"] = path_kernel + + if chosencfg["ramdisk"]: + try: +- bootcfg["ramdisk"] = copy_from_image(fs, chosencfg["ramdisk"], +- "ramdisk", output_directory, +- not_really) ++ copy_from_image(fs, chosencfg["ramdisk"], "ramdisk", ++ fd_ramdisk, path_ramdisk, not_really) + except: + if not not_really: +- os.unlink(bootcfg["kernel"]) ++ os.unlink(path_kernel) + raise ++ bootcfg["ramdisk"] = path_ramdisk + else: + initrd = None ++ if not not_really: ++ os.unlink(path_ramdisk) + + args = None + if chosencfg["args"]: +-- +2.42.0 + diff --git a/xsa443-4.17-08.patch b/xsa443-4.17-08.patch new file mode 100644 index 0000000..bd7de1d --- /dev/null +++ b/xsa443-4.17-08.patch @@ -0,0 +1,126 @@ +From 2fb4cdcedd8720f78c4bd44739a5d30dd1a7d9a5 Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Mon, 25 Sep 2023 18:32:24 +0100 +Subject: [PATCH 08/11] tools/libfsimage: Export a new function to preload all + plugins + +This is work required in order to let pygrub operate in highly deprivileged +chroot mode. This patch adds a function that preloads every plugin, hence +ensuring that a on function exit, every shared library is loaded in memory. + +The new "init" function is supposed to be used before depriv, but that's +fine because it's not acting on untrusted data. + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Alejandro Vallejo +Acked-by: Andrew Cooper +--- + tools/libfsimage/common/fsimage_plugin.c | 4 ++-- + tools/libfsimage/common/mapfile-GNU | 1 + + tools/libfsimage/common/mapfile-SunOS | 1 + + tools/libfsimage/common/xenfsimage.h | 8 ++++++++ + tools/pygrub/src/fsimage/fsimage.c | 15 +++++++++++++++ + 5 files changed, 27 insertions(+), 2 deletions(-) + +diff --git a/tools/libfsimage/common/fsimage_plugin.c b/tools/libfsimage/common/fsimage_plugin.c +index de1412b4233a..d0cb9e96a654 100644 +--- a/tools/libfsimage/common/fsimage_plugin.c ++++ b/tools/libfsimage/common/fsimage_plugin.c +@@ -119,7 +119,7 @@ fail: + return (-1); + } + +-static int load_plugins(void) ++int fsi_init(void) + { + const char *fsdir = getenv("XEN_FSIMAGE_FSDIR"); + struct dirent *dp = NULL; +@@ -180,7 +180,7 @@ int find_plugin(fsi_t *fsi, const char *path, const char *options) + fsi_plugin_t *fp; + int ret = 0; + +- if (plugins == NULL && (ret = load_plugins()) != 0) ++ if (plugins == NULL && (ret = fsi_init()) != 0) + goto out; + + for (fp = plugins; fp != NULL; fp = fp->fp_next) { +diff --git a/tools/libfsimage/common/mapfile-GNU b/tools/libfsimage/common/mapfile-GNU +index 26d4d7a69ec7..2d54d527d7f5 100644 +--- a/tools/libfsimage/common/mapfile-GNU ++++ b/tools/libfsimage/common/mapfile-GNU +@@ -1,6 +1,7 @@ + VERSION { + libfsimage.so.1.0 { + global: ++ fsi_init; + fsi_open_fsimage; + fsi_close_fsimage; + fsi_file_exists; +diff --git a/tools/libfsimage/common/mapfile-SunOS b/tools/libfsimage/common/mapfile-SunOS +index e99b90b65077..48deedb4252f 100644 +--- a/tools/libfsimage/common/mapfile-SunOS ++++ b/tools/libfsimage/common/mapfile-SunOS +@@ -1,5 +1,6 @@ + libfsimage.so.1.0 { + global: ++ fsi_init; + fsi_open_fsimage; + fsi_close_fsimage; + fsi_file_exists; +diff --git a/tools/libfsimage/common/xenfsimage.h b/tools/libfsimage/common/xenfsimage.h +index 201abd54f23a..341883b2d71a 100644 +--- a/tools/libfsimage/common/xenfsimage.h ++++ b/tools/libfsimage/common/xenfsimage.h +@@ -35,6 +35,14 @@ extern C { + typedef struct fsi fsi_t; + typedef struct fsi_file fsi_file_t; + ++/* ++ * Optional initialization function. If invoked it loads the associated ++ * dynamic libraries for the backends ahead of time. This is required if ++ * the library is to run as part of a highly deprivileged executable, as ++ * the libraries may not be reachable after depriv. ++ */ ++int fsi_init(void); ++ + fsi_t *fsi_open_fsimage(const char *, uint64_t, const char *); + void fsi_close_fsimage(fsi_t *); + +diff --git a/tools/pygrub/src/fsimage/fsimage.c b/tools/pygrub/src/fsimage/fsimage.c +index 2ebbbe35df92..92fbf2851f01 100644 +--- a/tools/pygrub/src/fsimage/fsimage.c ++++ b/tools/pygrub/src/fsimage/fsimage.c +@@ -286,6 +286,15 @@ fsimage_getbootstring(PyObject *o, PyObject *args) + return Py_BuildValue("s", bootstring); + } + ++static PyObject * ++fsimage_init(PyObject *o, PyObject *args) ++{ ++ if (!PyArg_ParseTuple(args, "")) ++ return (NULL); ++ ++ return Py_BuildValue("i", fsi_init()); ++} ++ + PyDoc_STRVAR(fsimage_open__doc__, + "open(name, [offset=off]) - Open the given file as a filesystem image.\n" + "\n" +@@ -297,7 +306,13 @@ PyDoc_STRVAR(fsimage_getbootstring__doc__, + "getbootstring(fs) - Return the boot string needed for this file system " + "or NULL if none is needed.\n"); + ++PyDoc_STRVAR(fsimage_init__doc__, ++ "init() - Loads every dynamic library contained in xenfsimage " ++ "into memory so that it can be used in chrooted environments.\n"); ++ + static struct PyMethodDef fsimage_module_methods[] = { ++ { "init", (PyCFunction)fsimage_init, ++ METH_VARARGS, fsimage_init__doc__ }, + { "open", (PyCFunction)fsimage_open, + METH_VARARGS|METH_KEYWORDS, fsimage_open__doc__ }, + { "getbootstring", (PyCFunction)fsimage_getbootstring, +-- +2.42.0 + diff --git a/xsa443-4.17-09.patch b/xsa443-4.17-09.patch new file mode 100644 index 0000000..2e3ebd8 --- /dev/null +++ b/xsa443-4.17-09.patch @@ -0,0 +1,307 @@ +From 150771ce86a07e469e34941a63c56e2cf242223b Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Mon, 25 Sep 2023 18:32:25 +0100 +Subject: [PATCH 09/11] tools/pygrub: Deprivilege pygrub + +Introduce a --runas= flag to deprivilege pygrub on Linux and *BSDs. It +also implicitly creates a chroot env where it drops a deprivileged forked +process. The chroot itself is cleaned up at the end. + +If the --runas arg is present, then pygrub forks, leaving the child to +deprivilege itself, and waiting for it to complete. When the child exists, +the parent performs cleanup and exits with the same error code. + +This is roughly what the child does: + 1. Initialize libfsimage (this loads every .so in memory so the chroot + can avoid bind-mounting /{,usr}/lib* + 2. Create a temporary empty chroot directory + 3. Mount tmpfs in it + 4. Bind mount the disk inside, because libfsimage expects a path, not a + file descriptor. + 5. Remount the root tmpfs to be stricter (ro,nosuid,nodev) + 6. Set RLIMIT_FSIZE to a sensibly high amount (128 MiB) + 7. Depriv gid, groups and uid + +With this scheme in place, the "output" files are writable (up to +RLIMIT_FSIZE octets) and the exposed filesystem is immutable and contains +the single only file we can't easily get rid of (the disk). + +If running on Linux, the child process also unshares mount, IPC, and +network namespaces before dropping its privileges. + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Alejandro Vallejo +Acked-by: Andrew Cooper +--- + tools/pygrub/setup.py | 2 +- + tools/pygrub/src/pygrub | 162 +++++++++++++++++++++++++++++++++++++--- + 2 files changed, 154 insertions(+), 10 deletions(-) + +diff --git a/tools/pygrub/setup.py b/tools/pygrub/setup.py +index 0e4e3d02d372..06b96733d020 100644 +--- a/tools/pygrub/setup.py ++++ b/tools/pygrub/setup.py +@@ -17,7 +17,7 @@ xenfsimage = Extension("xenfsimage", + pkgs = [ 'grub' ] + + setup(name='pygrub', +- version='0.6', ++ version='0.7', + description='Boot loader that looks a lot like grub for Xen', + author='Jeremy Katz', + author_email='katzj@redhat.com', +diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub +index 91e2ec2ab105..7cea496ade08 100755 +--- a/tools/pygrub/src/pygrub ++++ b/tools/pygrub/src/pygrub +@@ -16,8 +16,11 @@ from __future__ import print_function + + import os, sys, string, struct, tempfile, re, traceback, stat, errno + import copy ++import ctypes, ctypes.util + import logging + import platform ++import resource ++import subprocess + + import curses, _curses, curses.textpad, curses.ascii + import getopt +@@ -27,10 +30,135 @@ import grub.GrubConf + import grub.LiloConf + import grub.ExtLinuxConf + +-PYGRUB_VER = 0.6 ++PYGRUB_VER = 0.7 + FS_READ_MAX = 1024 * 1024 + SECTOR_SIZE = 512 + ++# Unless provided through the env variable PYGRUB_MAX_FILE_SIZE_MB, then ++# this is the maximum filesize allowed for files written by the depriv ++# pygrub ++LIMIT_FSIZE = 128 << 20 ++ ++CLONE_NEWNS = 0x00020000 # mount namespace ++CLONE_NEWNET = 0x40000000 # network namespace ++CLONE_NEWIPC = 0x08000000 # IPC namespace ++ ++def unshare(flags): ++ if not sys.platform.startswith("linux"): ++ print("skip_unshare reason=not_linux platform=%s", sys.platform, file=sys.stderr) ++ return ++ ++ libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True) ++ unshare_prototype = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, use_errno=True) ++ unshare = unshare_prototype(('unshare', libc)) ++ ++ if unshare(flags) < 0: ++ raise OSError(ctypes.get_errno(), os.strerror(ctypes.get_errno())) ++ ++def bind_mount(src, dst, options): ++ open(dst, "a").close() # touch ++ ++ rc = subprocess.call(["mount", "--bind", "-o", options, src, dst]) ++ if rc != 0: ++ raise RuntimeError("bad_mount: src=%s dst=%s opts=%s" % ++ (src, dst, options)) ++ ++def downgrade_rlimits(): ++ # Wipe the authority to use unrequired resources ++ resource.setrlimit(resource.RLIMIT_NPROC, (0, 0)) ++ resource.setrlimit(resource.RLIMIT_CORE, (0, 0)) ++ resource.setrlimit(resource.RLIMIT_MEMLOCK, (0, 0)) ++ ++ # py2's resource module doesn't know about resource.RLIMIT_MSGQUEUE ++ # ++ # TODO: Use resource.RLIMIT_MSGQUEUE after python2 is deprecated ++ if sys.platform.startswith('linux'): ++ RLIMIT_MSGQUEUE = 12 ++ resource.setrlimit(RLIMIT_MSGQUEUE, (0, 0)) ++ ++ # The final look of the filesystem for this process is fully RO, but ++ # note we have some file descriptor already open (notably, kernel and ++ # ramdisk). In order to avoid a compromised pygrub from filling up the ++ # filesystem we set RLIMIT_FSIZE to a high bound, so that the file ++ # write permissions are bound. ++ fsize = LIMIT_FSIZE ++ if "PYGRUB_MAX_FILE_SIZE_MB" in os.environ.keys(): ++ fsize = os.environ["PYGRUB_MAX_FILE_SIZE_MB"] << 20 ++ ++ resource.setrlimit(resource.RLIMIT_FSIZE, (fsize, fsize)) ++ ++def depriv(output_directory, output, device, uid, path_kernel, path_ramdisk): ++ # The only point of this call is to force the loading of libfsimage. ++ # That way, we don't need to bind-mount it into the chroot ++ rc = xenfsimage.init() ++ if rc != 0: ++ os.unlink(path_ramdisk) ++ os.unlink(path_kernel) ++ raise RuntimeError("bad_xenfsimage: rc=%d" % rc) ++ ++ # Create a temporary directory for the chroot ++ chroot = tempfile.mkdtemp(prefix=str(uid)+'-', dir=output_directory) + '/' ++ device_path = '/device' ++ ++ pid = os.fork() ++ if pid: ++ # parent ++ _, rc = os.waitpid(pid, 0) ++ ++ for path in [path_kernel, path_ramdisk]: ++ # If the child didn't write anything, just get rid of it, ++ # otherwise we end up consuming a 0-size file when parsing ++ # systems without a ramdisk that the ultimate caller of pygrub ++ # may just be unaware of ++ if rc != 0 or os.path.getsize(path) == 0: ++ os.unlink(path) ++ ++ # Normally, unshare(CLONE_NEWNS) will ensure this is not required. ++ # However, this syscall doesn't exist in *BSD systems and doesn't ++ # auto-unmount everything on older Linux kernels (At least as of ++ # Linux 4.19, but it seems fixed in 5.15). Either way, ++ # recursively unmount everything if needed. Quietly. ++ with open('/dev/null', 'w') as devnull: ++ subprocess.call(["umount", "-f", chroot + device_path], ++ stdout=devnull, stderr=devnull) ++ subprocess.call(["umount", "-f", chroot], ++ stdout=devnull, stderr=devnull) ++ os.rmdir(chroot) ++ ++ sys.exit(rc) ++ ++ # By unsharing the namespace we're making sure it's all bulk-released ++ # at the end, when the namespaces disappear. This means the kernel does ++ # (almost) all the cleanup for us and the parent just has to remove the ++ # temporary directory. ++ unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWNET) ++ ++ # Set sensible limits using the setrlimit interface ++ downgrade_rlimits() ++ ++ # We'll mount tmpfs on the chroot to ensure the deprivileged child ++ # cannot affect the persistent state. It's RW now in order to ++ # bind-mount the device, but note it's remounted RO after that. ++ rc = subprocess.call(["mount", "-t", "tmpfs", "none", chroot]) ++ if rc != 0: ++ raise RuntimeError("mount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot)) ++ ++ # Bind the untrusted device RO ++ bind_mount(device, chroot + device_path, "ro,nosuid,noexec") ++ ++ rc = subprocess.call(["mount", "-t", "tmpfs", "-o", "remount,ro,nosuid,noexec,nodev", "none", chroot]) ++ if rc != 0: ++ raise RuntimeError("remount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot)) ++ ++ # Drop superpowers! ++ os.chroot(chroot) ++ os.chdir('/') ++ os.setgid(uid) ++ os.setgroups([uid]) ++ os.setuid(uid) ++ ++ return device_path ++ + def read_size_roundup(fd, size): + if platform.system() != 'FreeBSD': + return size +@@ -736,7 +864,7 @@ if __name__ == "__main__": + sel = None + + def usage(): +- print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--offset=] " %(sys.argv[0],), file=sys.stderr) ++ print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--runas=] [--offset=] " %(sys.argv[0],), file=sys.stderr) + + def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really): + if not_really: +@@ -760,7 +888,8 @@ if __name__ == "__main__": + os.write(fd_dst, data) + except Exception as e: + print(e, file=sys.stderr) +- os.unlink(path_dst) ++ if path_dst: ++ os.unlink(path_dst) + del datafile + sys.exit("Error writing temporary copy of "+file_type) + dataoff += len(data) +@@ -769,7 +898,7 @@ if __name__ == "__main__": + opts, args = getopt.gnu_getopt(sys.argv[1:], 'qilnh::', + ["quiet", "interactive", "list-entries", "not-really", "help", + "output=", "output-format=", "output-directory=", "offset=", +- "entry=", "kernel=", ++ "runas=", "entry=", "kernel=", + "ramdisk=", "args=", "isconfig", "debug"]) + except getopt.GetoptError: + usage() +@@ -790,6 +919,7 @@ if __name__ == "__main__": + not_really = False + output_format = "sxp" + output_directory = "/var/run/xen/pygrub/" ++ uid = None + + # what was passed in + incfg = { "kernel": None, "ramdisk": None, "args": "" } +@@ -813,6 +943,13 @@ if __name__ == "__main__": + elif o in ("--output",): + if a != "-": + output = a ++ elif o in ("--runas",): ++ try: ++ uid = int(a) ++ except ValueError: ++ print("runas value must be an integer user id") ++ usage() ++ sys.exit(1) + elif o in ("--kernel",): + incfg["kernel"] = a + elif o in ("--ramdisk",): +@@ -849,6 +986,10 @@ if __name__ == "__main__": + if debug: + logging.basicConfig(level=logging.DEBUG) + ++ if interactive and uid: ++ print("In order to use --runas, you must also set --entry or -q", file=sys.stderr) ++ sys.exit(1) ++ + try: + os.makedirs(output_directory, 0o700) + except OSError as e: +@@ -870,6 +1011,9 @@ if __name__ == "__main__": + else: + fd = os.open(output, os.O_WRONLY) + ++ if uid: ++ file = depriv(output_directory, output, file, uid, path_kernel, path_ramdisk) ++ + # debug + if isconfig: + chosencfg = run_grub(file, entry, fs, incfg["args"]) +@@ -925,21 +1069,21 @@ if __name__ == "__main__": + raise RuntimeError("Unable to find partition containing kernel") + + copy_from_image(fs, chosencfg["kernel"], "kernel", +- fd_kernel, path_kernel, not_really) ++ fd_kernel, None if uid else path_kernel, not_really) + bootcfg["kernel"] = path_kernel + + if chosencfg["ramdisk"]: + try: + copy_from_image(fs, chosencfg["ramdisk"], "ramdisk", +- fd_ramdisk, path_ramdisk, not_really) ++ fd_ramdisk, None if uid else path_ramdisk, not_really) + except: +- if not not_really: +- os.unlink(path_kernel) ++ if not uid and not not_really: ++ os.unlink(path_kernel) + raise + bootcfg["ramdisk"] = path_ramdisk + else: + initrd = None +- if not not_really: ++ if not uid and not not_really: + os.unlink(path_ramdisk) + + args = None +-- +2.42.0 + diff --git a/xsa443-4.17-10.patch b/xsa443-4.17-10.patch new file mode 100644 index 0000000..7c91f32 --- /dev/null +++ b/xsa443-4.17-10.patch @@ -0,0 +1,250 @@ +From 698b451473a6d868ca0f60a124fc4f31d81cd7b1 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Mon, 25 Sep 2023 14:30:20 +0200 +Subject: [PATCH 10/11] libxl: add support for running bootloader in restricted + mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Much like the device model depriv mode, add the same kind of support for the +bootloader. Such feature allows passing a UID as a parameter for the +bootloader to run as, together with the bootloader itself taking the necessary +actions to isolate. + +Note that the user to run the bootloader as must have the right permissions to +access the guest disk image (in read mode only), and that the bootloader will +be run in non-interactive mode when restricted. + +If enabled bootloader restrict mode will attempt to re-use the user(s) from the +QEMU depriv implementation if no user is provided on the configuration file or +the environment. See docs/features/qemu-deprivilege.pandoc for more +information about how to setup those users. + +Bootloader restrict mode is not enabled by default as it requires certain +setup to be done first (setup of the user(s) to use in restrict mode). + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Anthony PERARD +--- + docs/man/xl.1.pod.in | 33 +++++++++++ + tools/libs/light/libxl_bootloader.c | 89 ++++++++++++++++++++++++++++- + tools/libs/light/libxl_dm.c | 8 +-- + tools/libs/light/libxl_internal.h | 8 +++ + 4 files changed, 131 insertions(+), 7 deletions(-) + +diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in +index 101e14241d1c..4831e122427d 100644 +--- a/docs/man/xl.1.pod.in ++++ b/docs/man/xl.1.pod.in +@@ -1957,6 +1957,39 @@ ignored: + + =back + ++=head1 ENVIRONMENT VARIABLES ++ ++The following environment variables shall affect the execution of xl: ++ ++=over 4 ++ ++=item LIBXL_BOOTLOADER_RESTRICT ++ ++Attempt to restrict the bootloader after startup, to limit the ++consequences of security vulnerabilities due to parsing guest ++owned image files. ++ ++See docs/features/qemu-deprivilege.pandoc for more information ++on how to setup the unprivileged users. ++ ++Note that running the bootloader in restricted mode also implies using ++non-interactive mode, and the disk image must be readable by the ++restricted user. ++ ++Having this variable set is equivalent to enabling the option, even if the ++value is 0. ++ ++=item LIBXL_BOOTLOADER_USER ++ ++When using bootloader_restrict, run the bootloader as this user. If ++not set the default QEMU restrict users will be used. ++ ++NOTE: Each domain MUST have a SEPARATE username. ++ ++See docs/features/qemu-deprivilege.pandoc for more information. ++ ++=back ++ + =head1 SEE ALSO + + The following man pages: +diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c +index 108329b4a5bb..23c0ef3e8935 100644 +--- a/tools/libs/light/libxl_bootloader.c ++++ b/tools/libs/light/libxl_bootloader.c +@@ -14,6 +14,7 @@ + + #include "libxl_osdeps.h" /* must come before any other headers */ + ++#include + #include + #ifdef HAVE_UTMP_H + #include +@@ -42,8 +43,71 @@ static void bootloader_arg(libxl__bootloader_state *bl, const char *arg) + bl->args[bl->nargs++] = arg; + } + +-static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, +- const char *bootloader_path) ++static int bootloader_uid(libxl__gc *gc, domid_t guest_domid, ++ const char *user, uid_t *intended_uid) ++{ ++ struct passwd *user_base, user_pwbuf; ++ int rc; ++ ++ if (user) { ++ rc = userlookup_helper_getpwnam(gc, user, &user_pwbuf, &user_base); ++ if (rc) return rc; ++ ++ if (!user_base) { ++ LOGD(ERROR, guest_domid, "Couldn't find user %s", user); ++ return ERROR_INVAL; ++ } ++ ++ *intended_uid = user_base->pw_uid; ++ return 0; ++ } ++ ++ /* Re-use QEMU user range for the bootloader. */ ++ rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_RANGE_BASE, ++ &user_pwbuf, &user_base); ++ if (rc) return rc; ++ ++ if (user_base) { ++ struct passwd *user_clash, user_clash_pwbuf; ++ uid_t temp_uid = user_base->pw_uid + guest_domid; ++ ++ rc = userlookup_helper_getpwuid(gc, temp_uid, &user_clash_pwbuf, ++ &user_clash); ++ if (rc) return rc; ++ ++ if (user_clash) { ++ LOGD(ERROR, guest_domid, ++ "wanted to use uid %ld (%s + %d) but that is user %s !", ++ (long)temp_uid, LIBXL_QEMU_USER_RANGE_BASE, ++ guest_domid, user_clash->pw_name); ++ return ERROR_INVAL; ++ } ++ ++ *intended_uid = temp_uid; ++ return 0; ++ } ++ ++ rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_SHARED, &user_pwbuf, ++ &user_base); ++ if (rc) return rc; ++ ++ if (user_base) { ++ LOGD(WARN, guest_domid, "Could not find user %s, falling back to %s", ++ LIBXL_QEMU_USER_RANGE_BASE, LIBXL_QEMU_USER_SHARED); ++ *intended_uid = user_base->pw_uid; ++ ++ return 0; ++ } ++ ++ LOGD(ERROR, guest_domid, ++ "Could not find user %s or range base pseudo-user %s, cannot restrict", ++ LIBXL_QEMU_USER_SHARED, LIBXL_QEMU_USER_RANGE_BASE); ++ ++ return ERROR_INVAL; ++} ++ ++static int make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, ++ const char *bootloader_path) + { + const libxl_domain_build_info *info = bl->info; + +@@ -61,6 +125,23 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, + ARG(GCSPRINTF("--ramdisk=%s", info->ramdisk)); + if (info->cmdline && *info->cmdline != '\0') + ARG(GCSPRINTF("--args=%s", info->cmdline)); ++ if (getenv("LIBXL_BOOTLOADER_RESTRICT") || ++ getenv("LIBXL_BOOTLOADER_USER")) { ++ uid_t uid = -1; ++ int rc = bootloader_uid(gc, bl->domid, getenv("LIBXL_BOOTLOADER_USER"), ++ &uid); ++ ++ if (rc) return rc; ++ ++ assert(uid != -1); ++ if (!uid) { ++ LOGD(ERROR, bl->domid, "bootloader restrict UID is 0 (root)!"); ++ return ERROR_INVAL; ++ } ++ LOGD(DEBUG, bl->domid, "using uid %ld", (long)uid); ++ ARG(GCSPRINTF("--runas=%ld", (long)uid)); ++ ARG("--quiet"); ++ } + + ARG(GCSPRINTF("--output=%s", bl->outputpath)); + ARG("--output-format=simple0"); +@@ -79,6 +160,7 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, + /* Sentinel for execv */ + ARG(NULL); + ++ return 0; + #undef ARG + } + +@@ -443,7 +525,8 @@ static void bootloader_disk_attached_cb(libxl__egc *egc, + bootloader = bltmp; + } + +- make_bootloader_args(gc, bl, bootloader); ++ rc = make_bootloader_args(gc, bl, bootloader); ++ if (rc) goto out; + + bl->openpty.ao = ao; + bl->openpty.callback = bootloader_gotptys; +diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c +index fc264a3a13a6..14b593110f7c 100644 +--- a/tools/libs/light/libxl_dm.c ++++ b/tools/libs/light/libxl_dm.c +@@ -80,10 +80,10 @@ static int libxl__create_qemu_logfile(libxl__gc *gc, char *name) + * On error, return a libxl-style error code. + */ + #define DEFINE_USERLOOKUP_HELPER(NAME,SPEC_TYPE,STRUCTNAME,SYSCONF) \ +- static int userlookup_helper_##NAME(libxl__gc *gc, \ +- SPEC_TYPE spec, \ +- struct STRUCTNAME *resultbuf, \ +- struct STRUCTNAME **out) \ ++ int userlookup_helper_##NAME(libxl__gc *gc, \ ++ SPEC_TYPE spec, \ ++ struct STRUCTNAME *resultbuf, \ ++ struct STRUCTNAME **out) \ + { \ + struct STRUCTNAME *resultp = NULL; \ + char *buf = NULL; \ +diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h +index 7ad38de30e0b..f1e3a9a15b13 100644 +--- a/tools/libs/light/libxl_internal.h ++++ b/tools/libs/light/libxl_internal.h +@@ -4873,6 +4873,14 @@ struct libxl__cpu_policy { + struct xc_msr *msr; + }; + ++struct passwd; ++_hidden int userlookup_helper_getpwnam(libxl__gc*, const char *user, ++ struct passwd *res, ++ struct passwd **out); ++_hidden int userlookup_helper_getpwuid(libxl__gc*, uid_t uid, ++ struct passwd *res, ++ struct passwd **out); ++ + #endif + + /* +-- +2.42.0 + diff --git a/xsa443-4.17-11.patch b/xsa443-4.17-11.patch new file mode 100644 index 0000000..27e6f78 --- /dev/null +++ b/xsa443-4.17-11.patch @@ -0,0 +1,157 @@ +From 9d480426bfa2c68843ac8395b512e06fbdbcf53e Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 28 Sep 2023 12:22:35 +0200 +Subject: [PATCH 11/11] libxl: limit bootloader execution in restricted mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce a timeout for bootloader execution when running in restricted mode. + +Allow overwriting the default time out with an environment provided value. + +This is part of XSA-443 / CVE-2023-34325 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Anthony PERARD +--- + docs/man/xl.1.pod.in | 8 ++++++ + tools/libs/light/libxl_bootloader.c | 40 +++++++++++++++++++++++++++++ + tools/libs/light/libxl_internal.h | 2 ++ + 3 files changed, 50 insertions(+) + +diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in +index 4831e122427d..c3eb6570ab8b 100644 +--- a/docs/man/xl.1.pod.in ++++ b/docs/man/xl.1.pod.in +@@ -1988,6 +1988,14 @@ NOTE: Each domain MUST have a SEPARATE username. + + See docs/features/qemu-deprivilege.pandoc for more information. + ++=item LIBXL_BOOTLOADER_TIMEOUT ++ ++Timeout in seconds for bootloader execution when running in restricted mode. ++Otherwise the build time default in LIBXL_BOOTLOADER_TIMEOUT will be used. ++ ++If defined the value must be an unsigned integer between 0 and INT_MAX, ++otherwise behavior is undefined. Setting to 0 disables the timeout. ++ + =back + + =head1 SEE ALSO +diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c +index 23c0ef3e8935..ee26d08f3765 100644 +--- a/tools/libs/light/libxl_bootloader.c ++++ b/tools/libs/light/libxl_bootloader.c +@@ -30,6 +30,8 @@ static void bootloader_keystrokes_copyfail(libxl__egc *egc, + libxl__datacopier_state *dc, int rc, int onwrite, int errnoval); + static void bootloader_display_copyfail(libxl__egc *egc, + libxl__datacopier_state *dc, int rc, int onwrite, int errnoval); ++static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev, ++ const struct timeval *requested_abs, int rc); + static void bootloader_domaindeath(libxl__egc*, libxl__domaindeathcheck *dc, + int rc); + static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child, +@@ -297,6 +299,7 @@ void libxl__bootloader_init(libxl__bootloader_state *bl) + bl->ptys[0].master = bl->ptys[0].slave = 0; + bl->ptys[1].master = bl->ptys[1].slave = 0; + libxl__ev_child_init(&bl->child); ++ libxl__ev_time_init(&bl->time); + libxl__domaindeathcheck_init(&bl->deathcheck); + bl->keystrokes.ao = bl->ao; libxl__datacopier_init(&bl->keystrokes); + bl->display.ao = bl->ao; libxl__datacopier_init(&bl->display); +@@ -314,6 +317,7 @@ static void bootloader_cleanup(libxl__egc *egc, libxl__bootloader_state *bl) + libxl__domaindeathcheck_stop(gc,&bl->deathcheck); + libxl__datacopier_kill(&bl->keystrokes); + libxl__datacopier_kill(&bl->display); ++ libxl__ev_time_deregister(gc, &bl->time); + for (i=0; i<2; i++) { + libxl__carefd_close(bl->ptys[i].master); + libxl__carefd_close(bl->ptys[i].slave); +@@ -375,6 +379,7 @@ static void bootloader_stop(libxl__egc *egc, + + libxl__datacopier_kill(&bl->keystrokes); + libxl__datacopier_kill(&bl->display); ++ libxl__ev_time_deregister(gc, &bl->time); + if (libxl__ev_child_inuse(&bl->child)) { + r = kill(bl->child.pid, SIGTERM); + if (r) LOGED(WARN, bl->domid, "%sfailed to kill bootloader [%lu]", +@@ -637,6 +642,25 @@ static void bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op) + + struct termios termattr; + ++ if (getenv("LIBXL_BOOTLOADER_RESTRICT") || ++ getenv("LIBXL_BOOTLOADER_USER")) { ++ const char *timeout_env = getenv("LIBXL_BOOTLOADER_TIMEOUT"); ++ int timeout = timeout_env ? atoi(timeout_env) ++ : LIBXL_BOOTLOADER_TIMEOUT; ++ ++ if (timeout) { ++ /* Set execution timeout */ ++ rc = libxl__ev_time_register_rel(ao, &bl->time, ++ bootloader_timeout, ++ timeout * 1000); ++ if (rc) { ++ LOGED(ERROR, bl->domid, ++ "unable to register timeout for bootloader execution"); ++ goto out; ++ } ++ } ++ } ++ + pid_t pid = libxl__ev_child_fork(gc, &bl->child, bootloader_finished); + if (pid == -1) { + rc = ERROR_FAIL; +@@ -702,6 +726,21 @@ static void bootloader_display_copyfail(libxl__egc *egc, + libxl__bootloader_state *bl = CONTAINER_OF(dc, *bl, display); + bootloader_copyfail(egc, "bootloader output", bl, 1, rc,onwrite,errnoval); + } ++static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev, ++ const struct timeval *requested_abs, int rc) ++{ ++ libxl__bootloader_state *bl = CONTAINER_OF(ev, *bl, time); ++ STATE_AO_GC(bl->ao); ++ ++ libxl__ev_time_deregister(gc, &bl->time); ++ ++ assert(libxl__ev_child_inuse(&bl->child)); ++ LOGD(ERROR, bl->domid, "killing bootloader because of timeout"); ++ ++ libxl__ev_child_kill_deregister(ao, &bl->child, SIGKILL); ++ ++ bootloader_callback(egc, bl, rc); ++} + + static void bootloader_domaindeath(libxl__egc *egc, + libxl__domaindeathcheck *dc, +@@ -718,6 +757,7 @@ static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child, + STATE_AO_GC(bl->ao); + int rc; + ++ libxl__ev_time_deregister(gc, &bl->time); + libxl__datacopier_kill(&bl->keystrokes); + libxl__datacopier_kill(&bl->display); + +diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h +index f1e3a9a15b13..d05783617ff5 100644 +--- a/tools/libs/light/libxl_internal.h ++++ b/tools/libs/light/libxl_internal.h +@@ -102,6 +102,7 @@ + #define LIBXL_QMP_CMD_TIMEOUT 10 + #define LIBXL_STUBDOM_START_TIMEOUT 30 + #define LIBXL_QEMU_BODGE_TIMEOUT 2 ++#define LIBXL_BOOTLOADER_TIMEOUT 120 + #define LIBXL_XENCONSOLE_LIMIT 1048576 + #define LIBXL_XENCONSOLE_PROTOCOL "vt100" + #define LIBXL_MAXMEM_CONSTANT 1024 +@@ -3744,6 +3745,7 @@ struct libxl__bootloader_state { + libxl__openpty_state openpty; + libxl__openpty_result ptys[2]; /* [0] is for bootloader */ + libxl__ev_child child; ++ libxl__ev_time time; + libxl__domaindeathcheck deathcheck; + int nargs, argsspace; + const char **args; +-- +2.42.0 + diff --git a/xsa444-4.17-1.patch b/xsa444-4.17-1.patch new file mode 100644 index 0000000..5a4b2e5 --- /dev/null +++ b/xsa444-4.17-1.patch @@ -0,0 +1,93 @@ +From: Andrew Cooper +Subject: x86/svm: Fix asymmetry with AMD DR MASK context switching + +The handling of MSR_DR{0..3}_MASK is asymmetric between PV and HVM guests. + +HVM guests context switch in based on the guest view of DBEXT, whereas PV +guest switch in base on the host capability. Both guest types leave the +context dirty for the next vCPU. + +This leads to the following issue: + + * PV or HVM guest has debugging active (%dr7 + mask) + * Switch-out deactivates %dr7 but leaves other state stale in hardware + * Another HVM guest with masks unavailable has debugging active + * Switch in loads %dr7 but leaves the mask MSRs alone + +Now, the second guest's vCPU is operating in the context of the prior vCPU's +mask MSR, while the environment the vCPU can see says there are no mask MSRs. + +As a stopgap, adjust the HVM path to switch in the masks based on host +capabilities rather than guest visibility (i.e. like the PV path). Adjustment +of the intercepts still needs to be dependent on the guest visibility of +DBEXT. + +This is part of XSA-444 / CVE-2023-34327 + +Fixes: c097f54912d3 ("x86/SVM: support data breakpoint extension registers") +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich + +diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c +index e8f50e7c5ec7..fd32600ae364 100644 +--- a/xen/arch/x86/hvm/svm/svm.c ++++ b/xen/arch/x86/hvm/svm/svm.c +@@ -339,6 +339,10 @@ static void svm_save_dr(struct vcpu *v) + v->arch.hvm.flag_dr_dirty = 0; + vmcb_set_dr_intercepts(vmcb, ~0u); + ++ /* ++ * The guest can only have changed the mask MSRs if we previous dropped ++ * intercepts. Re-read them from hardware. ++ */ + if ( v->domain->arch.cpuid->extd.dbext ) + { + svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_RW); +@@ -370,17 +374,25 @@ static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v) + + ASSERT(v == current); + +- if ( v->domain->arch.cpuid->extd.dbext ) ++ /* ++ * Both the PV and HVM paths leave stale DR_MASK values in hardware on ++ * context-switch-out. If we're activating %dr7 for the guest, we must ++ * sync the DR_MASKs too, whether or not the guest can see them. ++ */ ++ if ( boot_cpu_has(X86_FEATURE_DBEXT) ) + { +- svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE); +- svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE); +- svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE); +- svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE); +- + wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.msrs->dr_mask[0]); + wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.msrs->dr_mask[1]); + wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.msrs->dr_mask[2]); + wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.msrs->dr_mask[3]); ++ ++ if ( v->domain->arch.cpuid->extd.dbext ) ++ { ++ svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE); ++ svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE); ++ svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE); ++ svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE); ++ } + } + + write_debugreg(0, v->arch.dr[0]); +diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c +index e65cc6004148..06c4f3868b7a 100644 +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -2281,6 +2281,11 @@ void activate_debugregs(const struct vcpu *curr) + if ( curr->arch.dr7 & DR7_ACTIVE_MASK ) + write_debugreg(7, curr->arch.dr7); + ++ /* ++ * Both the PV and HVM paths leave stale DR_MASK values in hardware on ++ * context-switch-out. If we're activating %dr7 for the guest, we must ++ * sync the DR_MASKs too, whether or not the guest can see them. ++ */ + if ( boot_cpu_has(X86_FEATURE_DBEXT) ) + { + wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.msrs->dr_mask[0]); diff --git a/xsa444-4.17-2.patch b/xsa444-4.17-2.patch new file mode 100644 index 0000000..2687bd1 --- /dev/null +++ b/xsa444-4.17-2.patch @@ -0,0 +1,72 @@ +From: Andrew Cooper +Subject: x86/pv: Correct the auditing of guest breakpoint addresses + +The use of access_ok() is buggy, because it permits access to the compat +translation area. 64bit PV guests don't use the XLAT area, but on AMD +hardware, the DBEXT feature allows a breakpoint to match up to a 4G aligned +region, allowing the breakpoint to reach outside of the XLAT area. + +Prior to c/s cda16c1bb223 ("x86: mirror compat argument translation area for +32-bit PV"), the live GDT was within 4G of the XLAT area. + +All together, this allowed a malicious 64bit PV guest on AMD hardware to place +a breakpoint over the live GDT, and trigger a #DB livelock (CVE-2015-8104). + +Introduce breakpoint_addr_ok() and explain why __addr_ok() happens to be an +appropriate check in this case. + +For Xen 4.14 and later, this is a latent bug because the XLAT area has moved +to be on its own with nothing interesting adjacent. For Xen 4.13 and older on +AMD hardware, this fixes a PV-trigger-able DoS. + +This is part of XSA-444 / CVE-2023-34328. + +Fixes: 65e355490817 ("x86/PV: support data breakpoint extension registers") +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +Reviewed-by: Jan Beulich + +diff --git a/xen/arch/x86/include/asm/debugreg.h b/xen/arch/x86/include/asm/debugreg.h +index c57914efc6e8..cc298265244b 100644 +--- a/xen/arch/x86/include/asm/debugreg.h ++++ b/xen/arch/x86/include/asm/debugreg.h +@@ -77,6 +77,26 @@ + asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) ); \ + __val; \ + }) ++ ++/* ++ * Architecturally, %dr{0..3} can have any arbitrary value. However, Xen ++ * can't allow the guest to breakpoint the Xen address range, so we limit the ++ * guest to the lower canonical half, or above the Xen range in the higher ++ * canonical half. ++ * ++ * Breakpoint lengths are specified to mask the low order address bits, ++ * meaning all breakpoints are naturally aligned. With %dr7, the widest ++ * breakpoint is 8 bytes. With DBEXT, the widest breakpoint is 4G. Both of ++ * the Xen boundaries have >4G alignment. ++ * ++ * In principle we should account for HYPERVISOR_COMPAT_VIRT_START(d), but ++ * 64bit Xen has never enforced this for compat guests, and there's no problem ++ * (to Xen) if the guest breakpoints it's alias of the M2P. Skipping this ++ * aspect simplifies the logic, and causes us not to reject a migrating guest ++ * which operated fine on prior versions of Xen. ++ */ ++#define breakpoint_addr_ok(a) __addr_ok(a) ++ + long set_debugreg(struct vcpu *, unsigned int reg, unsigned long value); + void activate_debugregs(const struct vcpu *); + +diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c +index aaaf70eb6330..f8636de907ae 100644 +--- a/xen/arch/x86/pv/misc-hypercalls.c ++++ b/xen/arch/x86/pv/misc-hypercalls.c +@@ -72,7 +72,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) + switch ( reg ) + { + case 0 ... 3: +- if ( !access_ok(value, sizeof(long)) ) ++ if ( !breakpoint_addr_ok(value) ) + return -EPERM; + + v->arch.dr[reg] = value;