
Commit 5c2c258

djbw authored and torvalds committed
mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup
get_dev_pagemap() enables paths like get_user_pages() to pin a dynamically mapped pfn-range (devm_memremap_pages()) while the resulting struct page objects are in use. Unlike get_page(), it may fail if the device is, or is in the process of being, disabled. While the initial lookup of the range may be an expensive list walk, the result is cached to speed up subsequent lookups, which are likely to be in the same mapped range.

devm_memremap_pages() now requires a reference counter to be specified at init time. For pmem this means moving request_queue allocation into pmem_alloc() so the existing queue usage counter can track "device pages".

ZONE_DEVICE pages always have an elevated count and will never be on an lru reclaim list. That space in 'struct page' can be redirected for other uses, but for safety introduce a poison value that will always trip __list_add() to assert. This allows half of the struct list_head storage to be reclaimed, with some assurance backing up the assumption that the page count never goes to zero and a list_add() is never attempted.

Signed-off-by: Dan Williams <[email protected]>
Tested-by: Logan Gunthorpe <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Ross Zwisler <[email protected]>
Cc: Alexander Viro <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 468ded0 commit 5c2c258
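
The caller pattern described in the message above can be sketched as follows. This is an illustrative example only, not code from this commit; walk_device_pfns() and its arguments are hypothetical stand-ins for the get_user_pages() paths that later patches convert. It shows the get_dev_pagemap()/put_dev_pagemap() pairing and the cached-pgmap hint that avoids repeated lookups within one mapping.

#include <linux/memremap.h>
#include <linux/mm.h>

/* Hypothetical helper: pin a run of device pfns while their pages are used. */
static int walk_device_pfns(unsigned long pfn, unsigned long nr_pages,
                struct page **pages)
{
        struct dev_pagemap *pgmap = NULL;
        unsigned long i;

        for (i = 0; i < nr_pages; i++, pfn++) {
                struct dev_pagemap *prev = pgmap;

                /*
                 * Fast path: a blind percpu_ref_get() while @pfn stays inside
                 * the cached mapping; otherwise an rcu-protected lookup plus
                 * percpu_ref_tryget_live(), which fails once the device is
                 * disabled or in the process of being disabled.
                 */
                pgmap = get_dev_pagemap(pfn, prev);
                /* drop the reference taken for the previous pfn */
                put_dev_pagemap(prev);
                if (!pgmap)
                        return -ENODEV;

                /* real callers also take a page reference here (get_page()) */
                pages[i] = pfn_to_page(pfn);
        }
        put_dev_pagemap(pgmap);
        return 0;
}

The final put_dev_pagemap() drops the reference taken for the last pfn, keeping gets and puts balanced across the walk.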

6 files changed: +125, -8 lines

drivers/nvdimm/pmem.c (4 additions, 2 deletions)

@@ -184,7 +184,7 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 	pmem->pfn_flags = PFN_DEV;
 	if (pmem_should_map_pages(dev)) {
 		pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
-				NULL);
+				&q->q_usage_counter, NULL);
 		pmem->pfn_flags |= PFN_MAP;
 	} else
 		pmem->virt_addr = (void __pmem *) devm_memremap(dev,
@@ -365,6 +365,7 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 	struct vmem_altmap *altmap;
 	struct nd_pfn_sb *pfn_sb;
 	struct pmem_device *pmem;
+	struct request_queue *q;
 	phys_addr_t offset;
 	int rc;
 	struct vmem_altmap __altmap = {
@@ -406,9 +407,10 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 
 	/* establish pfn range for lookup, and switch to direct map */
 	pmem = dev_get_drvdata(dev);
+	q = pmem->pmem_queue;
 	devm_memunmap(dev, (void __force *) pmem->virt_addr);
 	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
-			altmap);
+			&q->q_usage_counter, altmap);
 	pmem->pfn_flags |= PFN_MAP;
 	if (IS_ERR(pmem->virt_addr)) {
 		rc = PTR_ERR(pmem->virt_addr);

include/linux/list.h (11 additions, 0 deletions)

@@ -113,6 +113,17 @@ extern void __list_del_entry(struct list_head *entry);
 extern void list_del(struct list_head *entry);
 #endif
 
+#ifdef CONFIG_DEBUG_LIST
+/*
+ * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one
+ * of the pages it allocates is ever passed to list_add()
+ */
+extern void list_force_poison(struct list_head *entry);
+#else
+/* fallback to the less strict LIST_POISON* definitions */
+#define list_force_poison list_del
+#endif
+
 /**
  * list_replace - replace old entry by new one
  * @old : the element to be replaced

include/linux/memremap.h (47 additions, 2 deletions)

@@ -1,6 +1,8 @@
 #ifndef _LINUX_MEMREMAP_H_
 #define _LINUX_MEMREMAP_H_
 #include <linux/mm.h>
+#include <linux/ioport.h>
+#include <linux/percpu-refcount.h>
 
 struct resource;
 struct device;
@@ -36,21 +38,25 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
+ * @res: physical address range covered by @ref
+ * @ref: reference count that pins the devm_memremap_pages() mapping
  * @dev: host device of the mapping for debug
  */
 struct dev_pagemap {
 	struct vmem_altmap *altmap;
 	const struct resource *res;
+	struct percpu_ref *ref;
 	struct device *dev;
 };
 
 #ifdef CONFIG_ZONE_DEVICE
 void *devm_memremap_pages(struct device *dev, struct resource *res,
-		struct vmem_altmap *altmap);
+		struct percpu_ref *ref, struct vmem_altmap *altmap);
 struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
 #else
 static inline void *devm_memremap_pages(struct device *dev,
-		struct resource *res, struct vmem_altmap *altmap)
+		struct resource *res, struct percpu_ref *ref,
+		struct vmem_altmap *altmap)
 {
 	/*
 	 * Fail attempts to call devm_memremap_pages() without
@@ -66,4 +72,43 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
 	return NULL;
 }
 #endif
+
+/**
+ * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+ * @pfn: page frame number to lookup page_map
+ * @pgmap: optional known pgmap that already has a reference
+ *
+ * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the
+ * same mapping.
+ */
+static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+		struct dev_pagemap *pgmap)
+{
+	const struct resource *res = pgmap ? pgmap->res : NULL;
+	resource_size_t phys = PFN_PHYS(pfn);
+
+	/*
+	 * In the cached case we're already holding a live reference so
+	 * we can simply do a blind increment
+	 */
+	if (res && phys >= res->start && phys <= res->end) {
+		percpu_ref_get(pgmap->ref);
+		return pgmap;
+	}
+
+	/* fall back to slow path lookup */
+	rcu_read_lock();
+	pgmap = find_dev_pagemap(phys);
+	if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+		pgmap = NULL;
+	rcu_read_unlock();
+
+	return pgmap;
+}
+
+static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
+{
+	if (pgmap)
+		percpu_ref_put(pgmap->ref);
+}
 #endif /* _LINUX_MEMREMAP_H_ */

include/linux/mm_types.h (5 additions, 0 deletions)

@@ -116,6 +116,11 @@ struct page {
 					 * Can be used as a generic list
 					 * by the page owner.
 					 */
+		struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
+					    * lru or handled by a slab
+					    * allocator, this points to the
+					    * hosting device page map.
+					    */
 		struct {		/* slub per cpu partial pages */
 			struct page *next;	/* Next partial slab */
 #ifdef CONFIG_64BIT

kernel/memremap.c (49 additions, 4 deletions)

@@ -179,13 +179,41 @@ static void pgmap_radix_release(struct resource *res)
 	mutex_unlock(&pgmap_lock);
 }
 
+static unsigned long pfn_first(struct page_map *page_map)
+{
+	struct dev_pagemap *pgmap = &page_map->pgmap;
+	const struct resource *res = &page_map->res;
+	struct vmem_altmap *altmap = pgmap->altmap;
+	unsigned long pfn;
+
+	pfn = res->start >> PAGE_SHIFT;
+	if (altmap)
+		pfn += vmem_altmap_offset(altmap);
+	return pfn;
+}
+
+static unsigned long pfn_end(struct page_map *page_map)
+{
+	const struct resource *res = &page_map->res;
+
+	return (res->start + resource_size(res)) >> PAGE_SHIFT;
+}
+
+#define for_each_device_pfn(pfn, map) \
+	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
+
 static void devm_memremap_pages_release(struct device *dev, void *data)
 {
 	struct page_map *page_map = data;
 	struct resource *res = &page_map->res;
 	resource_size_t align_start, align_size;
 	struct dev_pagemap *pgmap = &page_map->pgmap;
 
+	if (percpu_ref_tryget_live(pgmap->ref)) {
+		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+		percpu_ref_put(pgmap->ref);
+	}
+
 	pgmap_radix_release(res);
 
 	/* pages are dead and unused, undo the arch mapping */
@@ -211,20 +239,26 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
  * devm_memremap_pages - remap and provide memmap backing for the given resource
  * @dev: hosting device for @res
  * @res: "host memory" address range
+ * @ref: a live per-cpu reference count
  * @altmap: optional descriptor for allocating the memmap from @res
  *
- * Note, the expectation is that @res is a host memory range that could
- * feasibly be treated as a "System RAM" range, i.e. not a device mmio
- * range, but this is not enforced.
+ * Notes:
+ * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
+ *    (or devm release event).
+ *
+ * 2/ @res is expected to be a host memory range that could feasibly be
+ *    treated as a "System RAM" range, i.e. not a device mmio range, but
+ *    this is not enforced.
  */
 void *devm_memremap_pages(struct device *dev, struct resource *res,
-		struct vmem_altmap *altmap)
+		struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
 	int is_ram = region_intersects(res->start, resource_size(res),
 			"System RAM");
 	resource_size_t key, align_start, align_size;
 	struct dev_pagemap *pgmap;
 	struct page_map *page_map;
+	unsigned long pfn;
 	int error, nid;
 
 	if (is_ram == REGION_MIXED) {
@@ -242,6 +276,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		return ERR_PTR(-ENXIO);
 	}
 
+	if (!ref)
+		return ERR_PTR(-EINVAL);
+
 	page_map = devres_alloc_node(devm_memremap_pages_release,
 			sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
 	if (!page_map)
@@ -255,6 +292,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		memcpy(&page_map->altmap, altmap, sizeof(*altmap));
 		pgmap->altmap = &page_map->altmap;
 	}
+	pgmap->ref = ref;
 	pgmap->res = &page_map->res;
 
 	mutex_lock(&pgmap_lock);
@@ -292,6 +330,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	if (error)
 		goto err_add_memory;
 
+	for_each_device_pfn(pfn, page_map) {
+		struct page *page = pfn_to_page(pfn);
+
+		/* ZONE_DEVICE pages must never appear on a slab lru */
+		list_force_poison(&page->lru);
+		page->pgmap = pgmap;
+	}
 	devres_add(dev, page_map);
 	return __va(res->start);

lib/list_debug.c (9 additions, 0 deletions)

@@ -12,6 +12,13 @@
 #include <linux/kernel.h>
 #include <linux/rculist.h>
 
+static struct list_head force_poison;
+void list_force_poison(struct list_head *entry)
+{
+	entry->next = &force_poison;
+	entry->prev = &force_poison;
+}
+
 /*
  * Insert a new entry between two known consecutive entries.
  *
@@ -23,6 +30,8 @@ void __list_add(struct list_head *new,
 			  struct list_head *prev,
 			  struct list_head *next)
 {
+	WARN(new->next == &force_poison || new->prev == &force_poison,
+		"list_add attempted on force-poisoned entry\n");
 	WARN(next->prev != prev,
 		"list_add corruption. next->prev should be "
 		"prev (%p), but was %p. (next=%p).\n",
