From 94f984ff563d1777652b822d7a282cacc1e481c2 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 27 Apr 2016 12:04:46 -0500
Subject: [PATCH] mm: thp: kvm: fix memory corruption in KVM with THP enabled

After the THP refcounting change, obtaining a compound page from
get_user_pages() no longer allows us to assume the entire compound
page is immediately mappable from a secondary MMU.

A secondary MMU doesn't want to call get_user_pages() more than once
for each compound page just to find out whether it can map the whole
compound page. So a secondary MMU needs to know from a single
get_user_pages() invocation whether it can immediately map the entire
compound page, to avoid a flood of unnecessary secondary MMU faults
and spurious atomic_inc()/atomic_dec() (pages don't have to be pinned
by MMU notifier users).

Ideally, instead of the page->_mapcount < 1 check, get_user_pages()
should return the granularity of the "page" mapping in the "mm" passed
to get_user_pages(). However, it's a non-trivial change to pass the
"pmd" status belonging to the "mm" walked by get_user_pages up the
stack (up to the caller of get_user_pages). So the fix just checks
whether the page returned by get_user_pages has no pte mapping at all,
in which case the caller can assume that the whole compound page is
mapped in the current "mm" (through a pmd_trans_huge() pmd). In that
case the entire compound page is safe to map into the secondary MMU
without additional get_user_pages() calls on the surrounding tail/head
pages. Besides being faster, not having to run more get_user_pages()
calls also reduces the memory footprint of the secondary MMU fault in
case the pmd split happened as a result of memory pressure.
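
For illustration only (not part of this patch), the intended usage
pattern on the secondary MMU side looks roughly like the sketch below.
secondary_mmu_map_huge() and secondary_mmu_map_4k() are hypothetical
hooks standing in for however a secondary MMU driver installs its
mappings, and the caller is assumed to hold the MMU notifier
protection that PageTransCompoundMap() requires:

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/page-flags.h>

/* Hypothetical hooks: however the secondary MMU installs mappings. */
extern void secondary_mmu_map_huge(struct page *head, unsigned long gpa);
extern void secondary_mmu_map_4k(struct page *page, unsigned long gpa);

/*
 * Hypothetical secondary MMU fault handler: "page" comes from a
 * single get_user_pages() call and the MMU notifier protection is
 * held, so split_huge_pmd() cannot run from under us.
 */
static void secondary_mmu_fault(struct page *page, unsigned long gpa)
{
	if (PageTransCompoundMap(page)) {
		/*
		 * The primary MMU maps the whole compound page with a
		 * pmd_trans_huge pmd, so one huge secondary MMU mapping
		 * is safe: no further get_user_pages() calls on the
		 * surrounding tail/head pages are needed.
		 */
		secondary_mmu_map_huge(compound_head(page),
				       gpa & ~(HPAGE_PMD_SIZE - 1));
	} else {
		/*
		 * Not THP, or the pmd was split: only this 4k subpage
		 * is known to be mapped in the primary MMU.
		 */
		secondary_mmu_map_4k(page, gpa);
	}
}

If the pmd is split later, split_huge_pmd() triggers the MMU notifier
invalidation, so the huge secondary mapping is torn down and
re-faulted at the smaller granularity.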

Without this fix, after a MADV_DONTNEED (as invoked by QEMU during
postcopy live migration or ballooning) or after generic swapping (with
a failure in split_huge_page() that would only result in a pmd split
and not a physical page split), KVM would map the whole compound page
into the shadow pagetables, even though regular faults or userfaults
(like UFFDIO_COPY) may map regular pages into the primary MMU as a
result of the pte faults, leading to guest mode and userland mode
going out of sync and not working on the same memory at all times.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/arm/kvm/mmu.c         |  2 +-
 arch/x86/kvm/mmu.c         |  4 ++--
 include/linux/page-flags.h | 22 ++++++++++++++++++++++
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index aba61fd..8dafe97 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -997,7 +997,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
 	kvm_pfn_t pfn = *pfnp;
 	gfn_t gfn = *ipap >> PAGE_SHIFT;

-	if (PageTransCompound(pfn_to_page(pfn))) {
+	if (PageTransCompoundMap(pfn_to_page(pfn))) {
 		unsigned long mask;
 		/*
 		 * The address we faulted on is backed by a transparent huge
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1e7a49b..3a371f7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2767,7 +2767,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 */
 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
-	    PageTransCompound(pfn_to_page(pfn)) &&
+	    PageTransCompoundMap(pfn_to_page(pfn)) &&
 	    !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
 		unsigned long mask;
 		/*
@@ -4621,7 +4621,7 @@ restart:
 		 */
 		if (sp->role.direct &&
			!kvm_is_reserved_pfn(pfn) &&
-			PageTransCompound(pfn_to_page(pfn))) {
+			PageTransCompoundMap(pfn_to_page(pfn))) {
 			drop_spte(kvm, sptep);
 			need_tlb_flush = 1;
 			goto restart;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 19724e6..522bd6d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -517,6 +517,27 @@ static inline int PageTransCompound(struct page *page)
 }

 /*
+ * PageTransCompoundMap is the same as PageTransCompound, but it also
+ * guarantees the primary MMU has the entire compound page mapped
+ * through pmd_trans_huge, which in turn guarantees the secondary MMUs
+ * can also map the entire compound page. This allows the secondary
+ * MMUs to call get_user_pages() only once for each compound page and
+ * to immediately map the entire compound page with a single secondary
+ * MMU fault. If there will be a pmd split later, the secondary MMUs
+ * will get an update through the MMU notifier invalidation through
+ * split_huge_pmd().
+ *
+ * Unlike PageTransCompound, this is safe to be called only while
+ * split_huge_pmd() cannot run from under us, like if protected by the
+ * MMU notifier, otherwise it may result in page->_mapcount < 0 false
+ * positives.
+ */
+static inline int PageTransCompoundMap(struct page *page)
+{
+	return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0;
+}
+
+/*
  * PageTransTail returns true for both transparent huge pages
  * and hugetlbfs pages, so it should only be called when it's known
  * that hugetlbfs pages aren't involved.
@@ -559,6 +580,7 @@ static inline int TestClearPageDoubleMap(struct page *page)
 #else
 TESTPAGEFLAG_FALSE(TransHuge)
 TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransCompoundMap)
 TESTPAGEFLAG_FALSE(TransTail)
 TESTPAGEFLAG_FALSE(DoubleMap)
 	TESTSETFLAG_FALSE(DoubleMap)
-- 
2.7.4