Commit dee4107

/dev/dax, core: file operations and dax-mmap

The "Device DAX" core enables dax mappings of performance / feature
differentiated memory. An open mapping or file handle keeps the backing
struct device live, but new mappings are only possible while the device
is enabled. Faults are handled under rcu_read_lock to synchronize with
the enabled state of the device.

Similar to the filesystem-dax case the backing memory may optionally
have struct page entries. However, unlike fs-dax there is no support
for private mappings, or for mappings that are not backed by media (see
the use of the zero page in fs-dax).

Mappings are always guaranteed to match the alignment of the
dax_region. If the dax_region is configured with a 2MB alignment, all
mappings are guaranteed to be backed by a pmd entry. Contrast this
determinism with the fs-dax case, where pmd mappings are opportunistic.
If userspace attempts to force a misaligned mapping, the driver will
fail the mmap attempt. See check_vma() for the other scenarios that are
rejected, such as MAP_PRIVATE mappings.

Cc: Hannes Reinecke <[email protected]>
Cc: Jeff Moyer <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Ross Zwisler <[email protected]>
Acked-by: "Paul E. McKenney" <[email protected]>
Reviewed-by: Johannes Thumshirn <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
1 parent ab68f26 commit dee4107

4 files changed, 325 insertions(+), 0 deletions(-)

4 files changed

+325
-0
lines changed

drivers/dax/Kconfig

Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 menuconfig DEV_DAX
         tristate "DAX: direct access to differentiated memory"
         default m if NVDIMM_DAX
+        depends on TRANSPARENT_HUGEPAGE
         help
           Support raw access to differentiated (persistence, bandwidth,
           latency...) memory via an mmap(2) capable character

drivers/dax/dax.c

Lines changed: 322 additions & 0 deletions

@@ -49,6 +49,7 @@ struct dax_region {
  * @region - parent region
  * @dev - device backing the character device
  * @kref - enable this data to be tracked in filp->private_data
+ * @alive - !alive + rcu grace period == no new mappings can be established
  * @id - child id in the region
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
@@ -57,6 +58,7 @@ struct dax_dev {
         struct dax_region *region;
         struct device *dev;
         struct kref kref;
+        bool alive;
         int id;
         int num_resources;
         struct resource res[0];
@@ -150,6 +152,16 @@ static void unregister_dax_dev(void *_dev)
 
         dev_dbg(dev, "%s\n", __func__);
 
+        /*
+         * Note, rcu is not protecting the liveness of dax_dev, rcu is
+         * ensuring that any fault handlers that might have seen
+         * dax_dev->alive == true, have completed. Any fault handlers
+         * that start after synchronize_rcu() has started will abort
+         * upon seeing dax_dev->alive == false.
+         */
+        dax_dev->alive = false;
+        synchronize_rcu();
+
         get_device(dev);
         device_unregister(dev);
         ida_simple_remove(&dax_region->ida, dax_dev->id);
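
The comment in the unregister_dax_dev() hunk above is the classic RCU
unpublish handshake: clear the flag, wait one grace period, and no
fault handler can still be acting on a stale alive == true
observation. A rough userspace analogue, written against the liburcu
library purely for illustration (liburcu is not involved in this
patch), looks like this:

        /* Userspace sketch of the alive-flag / grace-period handshake. */
        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>
        #include <urcu.h>               /* liburcu; link with -lurcu */

        static bool alive = true;

        static void *fault_path(void *arg)
        {
                rcu_register_thread();

                rcu_read_lock();        /* mirrors dax_dev_fault() */
                if (alive)
                        printf("device alive: fault would be serviced\n");
                else
                        printf("device dead: fault would get SIGBUS\n");
                rcu_read_unlock();

                rcu_unregister_thread();
                return NULL;
        }

        int main(void)
        {
                pthread_t t;

                pthread_create(&t, NULL, fault_path, NULL);
                pthread_join(t, NULL);

                /* teardown, mirroring unregister_dax_dev() */
                alive = false;
                synchronize_rcu();
                /* no reader past this point can have seen alive == true */

                return 0;
        }
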
@@ -173,6 +185,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
         memcpy(dax_dev->res, res, sizeof(*res) * count);
         dax_dev->num_resources = count;
         kref_init(&dax_dev->kref);
+        dax_dev->alive = true;
         dax_dev->region = dax_region;
         kref_get(&dax_region->kref);
 
@@ -217,9 +230,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 }
 EXPORT_SYMBOL_GPL(devm_create_dax_dev);
 
+/* return an unmapped area aligned to the dax region specified alignment */
+static unsigned long dax_dev_get_unmapped_area(struct file *filp,
+                unsigned long addr, unsigned long len, unsigned long pgoff,
+                unsigned long flags)
+{
+        unsigned long off, off_end, off_align, len_align, addr_align, align;
+        struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
+        struct dax_region *dax_region;
+
+        if (!dax_dev || addr)
+                goto out;
+
+        dax_region = dax_dev->region;
+        align = dax_region->align;
+        off = pgoff << PAGE_SHIFT;
+        off_end = off + len;
+        off_align = round_up(off, align);
+
+        if ((off_end <= off_align) || ((off_end - off_align) < align))
+                goto out;
+
+        len_align = len + align;
+        if ((off + len_align) < off)
+                goto out;
+
+        addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
+                        pgoff, flags);
+        if (!IS_ERR_VALUE(addr_align)) {
+                addr_align += (off - addr_align) & (align - 1);
+                return addr_align;
+        }
+out:
+        return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+static int __match_devt(struct device *dev, const void *data)
+{
+        const dev_t *devt = data;
+
+        return dev->devt == *devt;
+}
+
+static struct device *dax_dev_find(dev_t dev_t)
+{
+        return class_find_device(dax_class, NULL, &dev_t, __match_devt);
+}
+
+static int dax_dev_open(struct inode *inode, struct file *filp)
+{
+        struct dax_dev *dax_dev = NULL;
+        struct device *dev;
+
+        dev = dax_dev_find(inode->i_rdev);
+        if (!dev)
+                return -ENXIO;
+
+        device_lock(dev);
+        dax_dev = dev_get_drvdata(dev);
+        if (dax_dev) {
+                dev_dbg(dev, "%s\n", __func__);
+                filp->private_data = dax_dev;
+                kref_get(&dax_dev->kref);
+                inode->i_flags = S_DAX;
+        }
+        device_unlock(dev);
+
+        if (!dax_dev) {
+                put_device(dev);
+                return -ENXIO;
+        }
+        return 0;
+}
+
+static int dax_dev_release(struct inode *inode, struct file *filp)
+{
+        struct dax_dev *dax_dev = filp->private_data;
+        struct device *dev = dax_dev->dev;
+
+        dev_dbg(dax_dev->dev, "%s\n", __func__);
+        dax_dev_put(dax_dev);
+        put_device(dev);
+
+        return 0;
+}
+
+static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+                const char *func)
+{
+        struct dax_region *dax_region = dax_dev->region;
+        struct device *dev = dax_dev->dev;
+        unsigned long mask;
+
+        if (!dax_dev->alive)
+                return -ENXIO;
+
+        /* prevent private / writable mappings from being established */
+        if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
+                dev_info(dev, "%s: %s: fail, attempted private mapping\n",
+                                current->comm, func);
+                return -EINVAL;
+        }
+
+        mask = dax_region->align - 1;
+        if (vma->vm_start & mask || vma->vm_end & mask) {
+                dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
+                                current->comm, func, vma->vm_start, vma->vm_end,
+                                mask);
+                return -EINVAL;
+        }
+
+        if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
+                        && (vma->vm_flags & VM_DONTCOPY) == 0) {
+                dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
+                                current->comm, func);
+                return -EINVAL;
+        }
+
+        if (!vma_is_dax(vma)) {
+                dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
+                                current->comm, func);
+                return -EINVAL;
+        }
+
+        return 0;
+}
+
+static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+                unsigned long size)
+{
+        struct resource *res;
+        phys_addr_t phys;
+        int i;
+
+        for (i = 0; i < dax_dev->num_resources; i++) {
+                res = &dax_dev->res[i];
+                phys = pgoff * PAGE_SIZE + res->start;
+                if (phys >= res->start && phys <= res->end)
+                        break;
+                pgoff -= PHYS_PFN(resource_size(res));
+        }
+
+        if (i < dax_dev->num_resources) {
+                res = &dax_dev->res[i];
+                if (phys + size - 1 <= res->end)
+                        return phys;
+        }
+
+        return -1;
+}
+
+static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+                struct vm_fault *vmf)
+{
+        unsigned long vaddr = (unsigned long) vmf->virtual_address;
+        struct device *dev = dax_dev->dev;
+        struct dax_region *dax_region;
+        int rc = VM_FAULT_SIGBUS;
+        phys_addr_t phys;
+        pfn_t pfn;
+
+        if (check_vma(dax_dev, vma, __func__))
+                return VM_FAULT_SIGBUS;
+
+        dax_region = dax_dev->region;
+        if (dax_region->align > PAGE_SIZE) {
+                dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+                return VM_FAULT_SIGBUS;
+        }
+
+        phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
+        if (phys == -1) {
+                dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+                                vmf->pgoff);
+                return VM_FAULT_SIGBUS;
+        }
+
+        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+        rc = vm_insert_mixed(vma, vaddr, pfn);
+
+        if (rc == -ENOMEM)
+                return VM_FAULT_OOM;
+        if (rc < 0 && rc != -EBUSY)
+                return VM_FAULT_SIGBUS;
+
+        return VM_FAULT_NOPAGE;
+}
+
+static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        int rc;
+        struct file *filp = vma->vm_file;
+        struct dax_dev *dax_dev = filp->private_data;
+
+        dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+                        current->comm, (vmf->flags & FAULT_FLAG_WRITE)
+                        ? "write" : "read", vma->vm_start, vma->vm_end);
+        rcu_read_lock();
+        rc = __dax_dev_fault(dax_dev, vma, vmf);
+        rcu_read_unlock();
+
+        return rc;
+}
+
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
+                struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
+                unsigned int flags)
+{
+        unsigned long pmd_addr = addr & PMD_MASK;
+        struct device *dev = dax_dev->dev;
+        struct dax_region *dax_region;
+        phys_addr_t phys;
+        pgoff_t pgoff;
+        pfn_t pfn;
+
+        if (check_vma(dax_dev, vma, __func__))
+                return VM_FAULT_SIGBUS;
+
+        dax_region = dax_dev->region;
+        if (dax_region->align > PMD_SIZE) {
+                dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+                return VM_FAULT_SIGBUS;
+        }
+
+        /* dax pmd mappings require pfn_t_devmap() */
+        if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
+                dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+                return VM_FAULT_SIGBUS;
+        }
+
+        pgoff = linear_page_index(vma, pmd_addr);
+        phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE);
+        if (phys == -1) {
+                dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+                                pgoff);
+                return VM_FAULT_SIGBUS;
+        }
+
+        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+        return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
+                        flags & FAULT_FLAG_WRITE);
+}
+
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+                pmd_t *pmd, unsigned int flags)
+{
+        int rc;
+        struct file *filp = vma->vm_file;
+        struct dax_dev *dax_dev = filp->private_data;
+
+        dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+                        current->comm, (flags & FAULT_FLAG_WRITE)
+                        ? "write" : "read", vma->vm_start, vma->vm_end);
+
+        rcu_read_lock();
+        rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+        rcu_read_unlock();
+
+        return rc;
+}
+
+static void dax_dev_vm_open(struct vm_area_struct *vma)
+{
+        struct file *filp = vma->vm_file;
+        struct dax_dev *dax_dev = filp->private_data;
+
+        dev_dbg(dax_dev->dev, "%s\n", __func__);
+        kref_get(&dax_dev->kref);
+}
+
+static void dax_dev_vm_close(struct vm_area_struct *vma)
+{
+        struct file *filp = vma->vm_file;
+        struct dax_dev *dax_dev = filp->private_data;
+
+        dev_dbg(dax_dev->dev, "%s\n", __func__);
+        dax_dev_put(dax_dev);
+}
+
+static const struct vm_operations_struct dax_dev_vm_ops = {
+        .fault = dax_dev_fault,
+        .pmd_fault = dax_dev_pmd_fault,
+        .open = dax_dev_vm_open,
+        .close = dax_dev_vm_close,
+};
+
+static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+        struct dax_dev *dax_dev = filp->private_data;
+        int rc;
+
+        dev_dbg(dax_dev->dev, "%s\n", __func__);
+
+        rc = check_vma(dax_dev, vma, __func__);
+        if (rc)
+                return rc;
+
+        kref_get(&dax_dev->kref);
+        vma->vm_ops = &dax_dev_vm_ops;
+        vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+        return 0;
+}
+
 static const struct file_operations dax_fops = {
         .llseek = noop_llseek,
         .owner = THIS_MODULE,
+        .open = dax_dev_open,
+        .release = dax_dev_release,
+        .get_unmapped_area = dax_dev_get_unmapped_area,
+        .mmap = dax_dev_mmap,
 };
 
 static int __init dax_init(void)
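
One subtlety worth unpacking is how dax_dev_get_unmapped_area()
guarantees alignment: it over-allocates the search length by one
alignment unit, then slides the returned address forward until it is
congruent with the file offset modulo the alignment. Below is a
standalone sketch of just that arithmetic, with made-up example
values; the real helper is in the hunk above.

        /* Alignment arithmetic from dax_dev_get_unmapped_area(), redone
         * in userspace with invented example values. */
        #include <stdio.h>

        int main(void)
        {
                unsigned long align = 2UL << 20;        /* 2MB region alignment */
                unsigned long off = 0;                  /* file offset being mapped */
                unsigned long len = 4UL << 20;          /* caller wants 4MB */

                /* search for len + align so there is room to slide forward */
                unsigned long len_align = len + align;

                /* pretend the core allocator handed back an unaligned address */
                unsigned long addr = 0x7f0000001000UL;

                /* slide so that (addr_align - off) is a multiple of align */
                unsigned long addr_align = addr + ((off - addr) & (align - 1));

                printf("raw %#lx -> aligned %#lx, using %lu KiB of %lu KiB slack\n",
                                addr, addr_align, (addr_align - addr) >> 10,
                                (len_align - len) >> 10);
                return 0;
        }
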

mm/huge_memory.c

Lines changed: 1 addition & 0 deletions

@@ -1013,6 +1013,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
         insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
         return VM_FAULT_NOPAGE;
 }
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
                 pmd_t *pmd)

mm/hugetlb.c

Lines changed: 1 addition & 0 deletions

@@ -624,6 +624,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
 {
         return vma_hugecache_offset(hstate_vma(vma), vma, address);
 }
+EXPORT_SYMBOL_GPL(linear_hugepage_index);
 
 /*
  * Return the size of the pages allocated when backing a VMA. In the majority
