Skip to content

Commit b07398e

Browse files
davidhildenbrand authored and gregkh committed
x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()
[ Upstream commit dc84bc2aba85a1508f04a936f9f9a15f64ebfb31 ] If track_pfn_copy() fails, we already added the dst VMA to the maple tree. As fork() fails, we'll cleanup the maple tree, and stumble over the dst VMA for which we neither performed any reservation nor copied any page tables. Consequently untrack_pfn() will see VM_PAT and try obtaining the PAT information from the page table -- which fails because the page table was not copied. The easiest fix would be to simply clear the VM_PAT flag of the dst VMA if track_pfn_copy() fails. However, the whole thing is about "simply" clearing the VM_PAT flag is shaky as well: if we passed track_pfn_copy() and performed a reservation, but copying the page tables fails, we'll simply clear the VM_PAT flag, not properly undoing the reservation ... which is also wrong. So let's fix it properly: set the VM_PAT flag only if the reservation succeeded (leaving it clear initially), and undo the reservation if anything goes wrong while copying the page tables: clearing the VM_PAT flag after undoing the reservation. Note that any copied page table entries will get zapped when the VMA will get removed later, after copy_page_range() succeeded; as VM_PAT is not set then, we won't try cleaning VM_PAT up once more and untrack_pfn() will be happy. Note that leaving these page tables in place without a reservation is not a problem, as we are aborting fork(); this process will never run. A reproducer can trigger this usually at the first try: https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110 Modules linked in: ... CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014 RIP: 0010:get_pat_info+0xf6/0x110 ... Call Trace: <TASK> ... 
untrack_pfn+0x52/0x110 unmap_single_vma+0xa6/0xe0 unmap_vmas+0x105/0x1f0 exit_mmap+0xf6/0x460 __mmput+0x4b/0x120 copy_process+0x1bf6/0x2aa0 kernel_clone+0xab/0x440 __do_sys_clone+0x66/0x90 do_syscall_64+0x95/0x180 Likely this case was missed in: d155df5 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed") ... and instead of undoing the reservation we simply cleared the VM_PAT flag. Keep the documentation of these functions in include/linux/pgtable.h, one place is more than sufficient -- we should clean that up for the other functions like track_pfn_remap/untrack_pfn separately. Fixes: d155df5 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed") Fixes: 2ab6403 ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3") Reported-by: xingwei lee <[email protected]> Reported-by: yuxin wang <[email protected]> Reported-by: Marius Fleischer <[email protected]> Signed-off-by: David Hildenbrand <[email protected]> Signed-off-by: Ingo Molnar <[email protected]> Cc: Andy Lutomirski <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Rik van Riel <[email protected]> Cc: "H. Peter Anvin" <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Andrew Morton <[email protected]> Cc: [email protected] Link: https://lore.kernel.org/r/[email protected] Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/ Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/ Signed-off-by: Sasha Levin <[email protected]>
1 parent 22280de commit b07398e

File tree

4 files changed

+58
-37
lines changed

4 files changed

+58
-37
lines changed

arch/x86/mm/pat/memtype.c

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -982,29 +982,42 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
982982
return -EINVAL;
983983
}
984984

985-
/*
986-
* track_pfn_copy is called when vma that is covering the pfnmap gets
987-
* copied through copy_page_range().
988-
*
989-
* If the vma has a linear pfn mapping for the entire range, we get the prot
990-
* from pte and reserve the entire vma range with single reserve_pfn_range call.
991-
*/
992-
int track_pfn_copy(struct vm_area_struct *vma)
985+
int track_pfn_copy(struct vm_area_struct *dst_vma,
986+
struct vm_area_struct *src_vma, unsigned long *pfn)
993987
{
988+
const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
994989
resource_size_t paddr;
995-
unsigned long vma_size = vma->vm_end - vma->vm_start;
996990
pgprot_t pgprot;
991+
int rc;
997992

998-
if (vma->vm_flags & VM_PAT) {
999-
if (get_pat_info(vma, &paddr, &pgprot))
1000-
return -EINVAL;
1001-
/* reserve the whole chunk covered by vma. */
1002-
return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
1003-
}
993+
if (!(src_vma->vm_flags & VM_PAT))
994+
return 0;
995+
996+
/*
997+
* Duplicate the PAT information for the dst VMA based on the src
998+
* VMA.
999+
*/
1000+
if (get_pat_info(src_vma, &paddr, &pgprot))
1001+
return -EINVAL;
1002+
rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
1003+
if (rc)
1004+
return rc;
10041005

1006+
/* Reservation for the destination VMA succeeded. */
1007+
vm_flags_set(dst_vma, VM_PAT);
1008+
*pfn = PHYS_PFN(paddr);
10051009
return 0;
10061010
}
10071011

1012+
void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
1013+
{
1014+
untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
1015+
/*
1016+
* Reservation was freed, any copied page tables will get cleaned
1017+
* up later, but without getting PAT involved again.
1018+
*/
1019+
}
1020+
10081021
/*
10091022
* prot is passed in as a parameter for the new mapping. If the vma has
10101023
* a linear pfn mapping for the entire range, or no vma is provided,
@@ -1093,15 +1106,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
10931106
}
10941107
}
10951108

1096-
/*
1097-
* untrack_pfn_clear is called if the following situation fits:
1098-
*
1099-
* 1) while mremapping a pfnmap for a new region, with the old vma after
1100-
* its pfnmap page table has been removed. The new vma has a new pfnmap
1101-
* to the same pfn & cache type with VM_PAT set.
1102-
* 2) while duplicating vm area, the new vma fails to copy the pgtable from
1103-
* old vma.
1104-
*/
11051109
void untrack_pfn_clear(struct vm_area_struct *vma)
11061110
{
11071111
vm_flags_clear(vma, VM_PAT);

include/linux/pgtable.h

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,14 +1286,25 @@ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
12861286
}
12871287

12881288
/*
1289-
* track_pfn_copy is called when vma that is covering the pfnmap gets
1290-
* copied through copy_page_range().
1289+
* track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
1290+
* tables copied during copy_page_range(). On success, stores the pfn to be
1291+
* passed to untrack_pfn_copy().
12911292
*/
1292-
static inline int track_pfn_copy(struct vm_area_struct *vma)
1293+
static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
1294+
struct vm_area_struct *src_vma, unsigned long *pfn)
12931295
{
12941296
return 0;
12951297
}
12961298

1299+
/*
1300+
* untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
1301+
* copy_page_range(), but after track_pfn_copy() was already called.
1302+
*/
1303+
static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
1304+
unsigned long pfn)
1305+
{
1306+
}
1307+
12971308
/*
12981309
* untrack_pfn is called while unmapping a pfnmap for a region.
12991310
* untrack can be called for a specific region indicated by pfn and size or
@@ -1306,8 +1317,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma,
13061317
}
13071318

13081319
/*
1309-
* untrack_pfn_clear is called while mremapping a pfnmap for a new region
1310-
* or fails to copy pgtable during duplicate vm area.
1320+
* untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
1321+
*
1322+
* 1) During mremap() on the src VMA after the page tables were moved.
1323+
* 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
13111324
*/
13121325
static inline void untrack_pfn_clear(struct vm_area_struct *vma)
13131326
{
@@ -1318,7 +1331,10 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
13181331
unsigned long size);
13191332
extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
13201333
pfn_t pfn);
1321-
extern int track_pfn_copy(struct vm_area_struct *vma);
1334+
extern int track_pfn_copy(struct vm_area_struct *dst_vma,
1335+
struct vm_area_struct *src_vma, unsigned long *pfn);
1336+
extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
1337+
unsigned long pfn);
13221338
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
13231339
unsigned long size, bool mm_wr_locked);
13241340
extern void untrack_pfn_clear(struct vm_area_struct *vma);

kernel/fork.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
518518
vma_numab_state_init(new);
519519
dup_anon_vma_name(orig, new);
520520

521+
/* track_pfn_copy() will later take care of copying internal state. */
522+
if (unlikely(new->vm_flags & VM_PFNMAP))
523+
untrack_pfn_clear(new);
524+
521525
return new;
522526
}
523527

mm/memory.c

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1268,12 +1268,12 @@ int
12681268
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
12691269
{
12701270
pgd_t *src_pgd, *dst_pgd;
1271-
unsigned long next;
12721271
unsigned long addr = src_vma->vm_start;
12731272
unsigned long end = src_vma->vm_end;
12741273
struct mm_struct *dst_mm = dst_vma->vm_mm;
12751274
struct mm_struct *src_mm = src_vma->vm_mm;
12761275
struct mmu_notifier_range range;
1276+
unsigned long next, pfn;
12771277
bool is_cow;
12781278
int ret;
12791279

@@ -1284,11 +1284,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
12841284
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
12851285

12861286
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
1287-
/*
1288-
* We do not free on error cases below as remove_vma
1289-
* gets called on error from higher level routine
1290-
*/
1291-
ret = track_pfn_copy(src_vma);
1287+
ret = track_pfn_copy(dst_vma, src_vma, &pfn);
12921288
if (ret)
12931289
return ret;
12941290
}
@@ -1325,7 +1321,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
13251321
continue;
13261322
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
13271323
addr, next))) {
1328-
untrack_pfn_clear(dst_vma);
13291324
ret = -ENOMEM;
13301325
break;
13311326
}
@@ -1335,6 +1330,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
13351330
raw_write_seqcount_end(&src_mm->write_protect_seq);
13361331
mmu_notifier_invalidate_range_end(&range);
13371332
}
1333+
if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
1334+
untrack_pfn_copy(dst_vma, pfn);
13381335
return ret;
13391336
}
13401337

0 commit comments

Comments (0)