Skip to content

Commit 63b2d41

Browse files
aagit authored and torvalds committed
userfaultfd: wp: add the writeprotect API to userfaultfd ioctl
Introduce the new uffd-wp APIs for userspace. Firstly, allow UFFDIO_REGISTER to enable write-protection tracking via the new UFFDIO_REGISTER_MODE_WP flag. Note that this flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in which case the userspace program can not only resolve missing page faults but also track page data changes along the way. Secondly, introduce the new UFFDIO_WRITEPROTECT API to do page-level write-protection tracking. Note that the memory region must be registered with UFFDIO_REGISTER_MODE_WP before that. [[email protected]: write up the commit message] [[email protected]: remove useless block, write commit message, check against VM_MAYWRITE rather than VM_WRITE when registering] Signed-off-by: Andrea Arcangeli <[email protected]> Signed-off-by: Peter Xu <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Reviewed-by: Jerome Glisse <[email protected]> Cc: Bobby Powers <[email protected]> Cc: Brian Geffon <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Denis Plotnikov <[email protected]> Cc: "Dr . David Alan Gilbert" <[email protected]> Cc: Hugh Dickins <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: "Kirill A . Shutemov" <[email protected]> Cc: Martin Cracauer <[email protected]> Cc: Marty McFadden <[email protected]> Cc: Maya Gokhale <[email protected]> Cc: Mel Gorman <[email protected]> Cc: Mike Kravetz <[email protected]> Cc: Mike Rapoport <[email protected]> Cc: Pavel Emelyanov <[email protected]> Cc: Rik van Riel <[email protected]> Cc: Shaohua Li <[email protected]> Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Linus Torvalds <[email protected]>
1 parent ffd0579 commit 63b2d41

File tree

2 files changed

+89
-16
lines changed

2 files changed

+89
-16
lines changed

fs/userfaultfd.c

Lines changed: 66 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
314314
if (!pmd_present(_pmd))
315315
goto out;
316316

317-
if (pmd_trans_huge(_pmd))
317+
if (pmd_trans_huge(_pmd)) {
318+
if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
319+
ret = true;
318320
goto out;
321+
}
319322

320323
/*
321324
* the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
@@ -328,6 +331,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
328331
*/
329332
if (pte_none(*pte))
330333
ret = true;
334+
if (!pte_write(*pte) && (reason & VM_UFFD_WP))
335+
ret = true;
331336
pte_unmap(pte);
332337

333338
out:
@@ -1287,10 +1292,13 @@ static __always_inline int validate_range(struct mm_struct *mm,
12871292
return 0;
12881293
}
12891294

1290-
static inline bool vma_can_userfault(struct vm_area_struct *vma)
1295+
static inline bool vma_can_userfault(struct vm_area_struct *vma,
1296+
unsigned long vm_flags)
12911297
{
1292-
return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
1293-
vma_is_shmem(vma);
1298+
/* FIXME: add WP support to hugetlbfs and shmem */
1299+
return vma_is_anonymous(vma) ||
1300+
((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
1301+
!(vm_flags & VM_UFFD_WP));
12941302
}
12951303

12961304
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1322,15 +1330,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
13221330
vm_flags = 0;
13231331
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
13241332
vm_flags |= VM_UFFD_MISSING;
1325-
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1333+
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
13261334
vm_flags |= VM_UFFD_WP;
1327-
/*
1328-
* FIXME: remove the below error constraint by
1329-
* implementing the wprotect tracking mode.
1330-
*/
1331-
ret = -EINVAL;
1332-
goto out;
1333-
}
13341335

13351336
ret = validate_range(mm, &uffdio_register.range.start,
13361337
uffdio_register.range.len);
@@ -1380,7 +1381,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
13801381

13811382
/* check not compatible vmas */
13821383
ret = -EINVAL;
1383-
if (!vma_can_userfault(cur))
1384+
if (!vma_can_userfault(cur, vm_flags))
13841385
goto out_unlock;
13851386

13861387
/*
@@ -1408,6 +1409,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
14081409
if (end & (vma_hpagesize - 1))
14091410
goto out_unlock;
14101411
}
1412+
if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1413+
goto out_unlock;
14111414

14121415
/*
14131416
* Check that this vma isn't already owned by a
@@ -1437,7 +1440,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
14371440
do {
14381441
cond_resched();
14391442

1440-
BUG_ON(!vma_can_userfault(vma));
1443+
BUG_ON(!vma_can_userfault(vma, vm_flags));
14411444
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
14421445
vma->vm_userfaultfd_ctx.ctx != ctx);
14431446
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1575,7 +1578,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
15751578
* provides for more strict behavior to notice
15761579
* unregistration errors.
15771580
*/
1578-
if (!vma_can_userfault(cur))
1581+
if (!vma_can_userfault(cur, cur->vm_flags))
15791582
goto out_unlock;
15801583

15811584
found = true;
@@ -1589,7 +1592,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
15891592
do {
15901593
cond_resched();
15911594

1592-
BUG_ON(!vma_can_userfault(vma));
1595+
BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
15931596

15941597
/*
15951598
* Nothing to do: this vma is already registered into this
@@ -1802,6 +1805,50 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
18021805
return ret;
18031806
}
18041807

1808+
static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1809+
unsigned long arg)
1810+
{
1811+
int ret;
1812+
struct uffdio_writeprotect uffdio_wp;
1813+
struct uffdio_writeprotect __user *user_uffdio_wp;
1814+
struct userfaultfd_wake_range range;
1815+
1816+
if (READ_ONCE(ctx->mmap_changing))
1817+
return -EAGAIN;
1818+
1819+
user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1820+
1821+
if (copy_from_user(&uffdio_wp, user_uffdio_wp,
1822+
sizeof(struct uffdio_writeprotect)))
1823+
return -EFAULT;
1824+
1825+
ret = validate_range(ctx->mm, &uffdio_wp.range.start,
1826+
uffdio_wp.range.len);
1827+
if (ret)
1828+
return ret;
1829+
1830+
if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1831+
UFFDIO_WRITEPROTECT_MODE_WP))
1832+
return -EINVAL;
1833+
if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
1834+
(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
1835+
return -EINVAL;
1836+
1837+
ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
1838+
uffdio_wp.range.len, uffdio_wp.mode &
1839+
UFFDIO_WRITEPROTECT_MODE_WP,
1840+
&ctx->mmap_changing);
1841+
if (ret)
1842+
return ret;
1843+
1844+
if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
1845+
range.start = uffdio_wp.range.start;
1846+
range.len = uffdio_wp.range.len;
1847+
wake_userfault(ctx, &range);
1848+
}
1849+
return ret;
1850+
}
1851+
18051852
static inline unsigned int uffd_ctx_features(__u64 user_features)
18061853
{
18071854
/*
@@ -1883,6 +1930,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
18831930
case UFFDIO_ZEROPAGE:
18841931
ret = userfaultfd_zeropage(ctx, arg);
18851932
break;
1933+
case UFFDIO_WRITEPROTECT:
1934+
ret = userfaultfd_writeprotect(ctx, arg);
1935+
break;
18861936
}
18871937
return ret;
18881938
}

include/uapi/linux/userfaultfd.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
#define _UFFDIO_WAKE (0x02)
5353
#define _UFFDIO_COPY (0x03)
5454
#define _UFFDIO_ZEROPAGE (0x04)
55+
#define _UFFDIO_WRITEPROTECT (0x06)
5556
#define _UFFDIO_API (0x3F)
5657

5758
/* userfaultfd ioctl ids */
@@ -68,6 +69,8 @@
6869
struct uffdio_copy)
6970
#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
7071
struct uffdio_zeropage)
72+
#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
73+
struct uffdio_writeprotect)
7174

7275
/* read() structure */
7376
struct uffd_msg {
@@ -232,4 +235,24 @@ struct uffdio_zeropage {
232235
__s64 zeropage;
233236
};
234237

238+
/* Argument structure passed to the UFFDIO_WRITEPROTECT ioctl. */
struct uffdio_writeprotect {
239+
struct uffdio_range range;
240+
/*
241+
* UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range,
242+
* unset the flag to undo protection of a range which was previously
243+
* write protected.
244+
*
245+
* UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
246+
* any waiting thread after the operation succeeds.
247+
*
248+
* NOTE: Write protecting a region (WP=1) is unrelated to page faults,
249+
* therefore DONTWAKE flag is meaningless with WP=1. Removing write
250+
* protection (WP=0) in response to a page fault wakes the faulting
251+
* task unless DONTWAKE is set.
252+
*/
253+
#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0)
254+
#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1)
255+
__u64 mode;
256+
};
257+
235258
#endif /* _LINUX_USERFAULTFD_H */

0 commit comments

Comments
 (0)