Skip to content

Commit a53190a

Browse files
Yang Shitorvalds
Yang Shi
authored and committed
mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind
When running syzkaller internally, we ran into the below bug on 4.9.x kernel: kernel BUG at mm/huge_memory.c:2124! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 1518 Comm: syz-executor107 Not tainted 4.9.168+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011 task: ffff880067b34900 task.stack: ffff880068998000 RIP: split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 Call Trace: split_huge_page include/linux/huge_mm.h:100 [inline] queue_pages_pte_range+0x7e1/0x1480 mm/mempolicy.c:538 walk_pmd_range mm/pagewalk.c:50 [inline] walk_pud_range mm/pagewalk.c:90 [inline] walk_pgd_range mm/pagewalk.c:116 [inline] __walk_page_range+0x44a/0xdb0 mm/pagewalk.c:208 walk_page_range+0x154/0x370 mm/pagewalk.c:285 queue_pages_range+0x115/0x150 mm/mempolicy.c:694 do_mbind mm/mempolicy.c:1241 [inline] SYSC_mbind+0x3c3/0x1030 mm/mempolicy.c:1370 SyS_mbind+0x46/0x60 mm/mempolicy.c:1352 do_syscall_64+0x1d2/0x600 arch/x86/entry/common.c:282 entry_SYSCALL_64_after_swapgs+0x5d/0xdb Code: c7 80 1c 02 00 e8 26 0a 76 01 <0f> 0b 48 c7 c7 40 46 45 84 e8 4c RIP [<ffffffff81895d6b>] split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 RSP <ffff88006899f980> with the below test: uint64_t r[1] = {0xffffffffffffffff}; int main(void) { syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0); intptr_t res = 0; res = syscall(__NR_socket, 0x11, 3, 0x300); if (res != -1) r[0] = res; *(uint32_t*)0x20000040 = 0x10000; *(uint32_t*)0x20000044 = 1; *(uint32_t*)0x20000048 = 0xc520; *(uint32_t*)0x2000004c = 1; syscall(__NR_setsockopt, r[0], 0x107, 0xd, 0x20000040, 0x10); syscall(__NR_mmap, 0x20fed000, 0x10000, 0, 0x8811, r[0], 0); *(uint64_t*)0x20000340 = 2; syscall(__NR_mbind, 0x20ff9000, 0x4000, 0x4002, 0x20000340, 0x45d4, 3); return 0; } Actually the test does: mmap(0x20000000, 16777216, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x20000000 socket(AF_PACKET, SOCK_RAW, 768) = 3 setsockopt(3, SOL_PACKET, PACKET_TX_RING, {block_size=65536, 
block_nr=1, frame_size=50464, frame_nr=1}, 16) = 0 mmap(0x20fed000, 65536, PROT_NONE, MAP_SHARED|MAP_FIXED|MAP_POPULATE|MAP_DENYWRITE, 3, 0) = 0x20fed000 mbind(..., MPOL_MF_STRICT|MPOL_MF_MOVE) = 0 The setsockopt() would allocate compound pages (16 pages in this test) for packet tx ring, then the mmap() would call packet_mmap() to map the pages into the user address space specified by the mmap() call. When calling mbind(), it would scan the vma to queue the pages for migration to the new node. It would split any huge page since 4.9 doesn't support THP migration; however, the packet tx ring compound pages are not THP and are not even movable. So, the above bug is triggered. However, later kernels are not hit by this issue due to commit d44d363 ("mm: don't assume anonymous pages have SwapBacked flag"), which just removes the PageSwapBacked check for a different reason. But, there is a deeper issue. According to the semantics of mbind(), it should return -EIO if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL was specified and MPOL_MF_STRICT was also specified, but the kernel was unable to move all existing pages in the range. The tx ring of the packet socket is definitely not movable; however, mbind() returns success for this case. Although most socket files are associated with non-movable pages, XDP may have movable pages from gup. So, it is not sufficient to just check the underlying file type of vma in vma_migratable(). Change migrate_page_add() to check if the page is movable or not; if it is unmovable, just return -EIO. But do not abort pte walk immediately, since there may be pages off LRU temporarily. We should migrate other pages if MPOL_MF_MOVE* is specified. Set the has_unmovable flag if some pages could not be moved, then return -EIO for mbind() eventually. With this change the above test would return -EIO as expected. 
[[email protected]: fix review comments from Vlastimil] Link: http://lkml.kernel.org/r/[email protected] Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Yang Shi <[email protected]> Reviewed-by: Vlastimil Babka <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Mel Gorman <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent d883544 commit a53190a

File tree

1 file changed

+25
-7
lines changed

1 file changed

+25
-7
lines changed

mm/mempolicy.c

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
403403
},
404404
};
405405

406-
static void migrate_page_add(struct page *page, struct list_head *pagelist,
406+
static int migrate_page_add(struct page *page, struct list_head *pagelist,
407407
unsigned long flags);
408408

409409
struct queue_pages {
@@ -463,12 +463,11 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
463463
flags = qp->flags;
464464
/* go to thp migration */
465465
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
466-
if (!vma_migratable(walk->vma)) {
466+
if (!vma_migratable(walk->vma) ||
467+
migrate_page_add(page, qp->pagelist, flags)) {
467468
ret = 1;
468469
goto unlock;
469470
}
470-
471-
migrate_page_add(page, qp->pagelist, flags);
472471
} else
473472
ret = -EIO;
474473
unlock:
@@ -532,7 +531,14 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
532531
has_unmovable = true;
533532
break;
534533
}
535-
migrate_page_add(page, qp->pagelist, flags);
534+
535+
/*
536+
* Do not abort immediately since there may be
537+
* temporary off LRU pages in the range. Still
538+
* need to migrate other LRU pages.
539+
*/
540+
if (migrate_page_add(page, qp->pagelist, flags))
541+
has_unmovable = true;
536542
} else
537543
break;
538544
}
@@ -961,7 +967,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
961967
/*
962968
* page migration, thp tail pages can be passed.
963969
*/
964-
static void migrate_page_add(struct page *page, struct list_head *pagelist,
970+
static int migrate_page_add(struct page *page, struct list_head *pagelist,
965971
unsigned long flags)
966972
{
967973
struct page *head = compound_head(page);
@@ -974,8 +980,19 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
974980
mod_node_page_state(page_pgdat(head),
975981
NR_ISOLATED_ANON + page_is_file_cache(head),
976982
hpage_nr_pages(head));
983+
} else if (flags & MPOL_MF_STRICT) {
984+
/*
985+
* Non-movable page may reach here. And, there may be
986+
* temporary off LRU pages or non-LRU movable pages.
987+
* Treat them as unmovable pages since they can't be
988+
* isolated, so they can't be moved at the moment. It
989+
* should return -EIO for this case too.
990+
*/
991+
return -EIO;
977992
}
978993
}
994+
995+
return 0;
979996
}
980997

981998
/* page allocation callback for NUMA node migration */
@@ -1178,9 +1195,10 @@ static struct page *new_page(struct page *page, unsigned long start)
11781195
}
11791196
#else
11801197

1181-
static void migrate_page_add(struct page *page, struct list_head *pagelist,
1198+
static int migrate_page_add(struct page *page, struct list_head *pagelist,
11821199
unsigned long flags)
11831200
{
1201+
return -EIO;
11841202
}
11851203

11861204
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,

0 commit comments

Comments
 (0)