Skip to content

Commit 033fbae

Browse files
committed
mm: ZONE_DEVICE for "device memory"
While pmem is usable as a block device or via DAX mappings to userspace, there are several usage scenarios that cannot target pmem due to its lack of struct page coverage. In preparation for "hot plugging" pmem into the vmemmap, add ZONE_DEVICE as a new zone to tag these pages separately from the ones that are subject to standard page allocations. Importantly, "device memory" can be removed at will by userspace unbinding the driver of the device. Having a separate zone prevents allocation from these pages and otherwise marks them as distinct from typical uniform memory. Device memory has different lifetime and performance characteristics than RAM. However, since we have run out of ZONES_SHIFT bits, this functionality currently depends on sacrificing ZONE_DMA. Cc: H. Peter Anvin <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Dave Hansen <[email protected]> Cc: Rik van Riel <[email protected]> Cc: Mel Gorman <[email protected]> Cc: Jerome Glisse <[email protected]> [hch: various simplifications in the arch interface] Signed-off-by: Christoph Hellwig <[email protected]> Signed-off-by: Dan Williams <[email protected]>
1 parent 012dcef commit 033fbae

File tree

12 files changed

+70
-17
lines changed

12 files changed

+70
-17
lines changed

arch/ia64/mm/init.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,7 @@ mem_init (void)
645645
}
646646

647647
#ifdef CONFIG_MEMORY_HOTPLUG
648-
int arch_add_memory(int nid, u64 start, u64 size)
648+
int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
649649
{
650650
pg_data_t *pgdat;
651651
struct zone *zone;
@@ -656,7 +656,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
656656
pgdat = NODE_DATA(nid);
657657

658658
zone = pgdat->node_zones +
659-
zone_for_memory(nid, start, size, ZONE_NORMAL);
659+
zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
660660
ret = __add_pages(nid, zone, start_pfn, nr_pages);
661661

662662
if (ret)

arch/powerpc/mm/mem.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start)
113113
}
114114
#endif
115115

116-
int arch_add_memory(int nid, u64 start, u64 size)
116+
int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
117117
{
118118
struct pglist_data *pgdata;
119119
struct zone *zone;
@@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
128128

129129
/* this should work for most non-highmem platforms */
130130
zone = pgdata->node_zones +
131-
zone_for_memory(nid, start, size, 0);
131+
zone_for_memory(nid, start, size, 0, for_device);
132132

133133
return __add_pages(nid, zone, start_pfn, nr_pages);
134134
}

arch/s390/mm/init.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
168168
#endif
169169

170170
#ifdef CONFIG_MEMORY_HOTPLUG
171-
int arch_add_memory(int nid, u64 start, u64 size)
171+
int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
172172
{
173173
unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
174174
unsigned long start_pfn = PFN_DOWN(start);

arch/sh/mm/init.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
485485
#endif
486486

487487
#ifdef CONFIG_MEMORY_HOTPLUG
488-
int arch_add_memory(int nid, u64 start, u64 size)
488+
int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
489489
{
490490
pg_data_t *pgdat;
491491
unsigned long start_pfn = start >> PAGE_SHIFT;
@@ -496,7 +496,8 @@ int arch_add_memory(int nid, u64 start, u64 size)
496496

497497
/* We only have ZONE_NORMAL, so this is easy.. */
498498
ret = __add_pages(nid, pgdat->node_zones +
499-
zone_for_memory(nid, start, size, ZONE_NORMAL),
499+
zone_for_memory(nid, start, size, ZONE_NORMAL,
500+
for_device),
500501
start_pfn, nr_pages);
501502
if (unlikely(ret))
502503
printk("%s: Failed, __add_pages() == %d\n", __func__, ret);

arch/tile/mm/init.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -863,7 +863,7 @@ void __init mem_init(void)
863863
* memory to the highmem for now.
864864
*/
865865
#ifndef CONFIG_NEED_MULTIPLE_NODES
866-
int arch_add_memory(u64 start, u64 size)
866+
int arch_add_memory(u64 start, u64 size, bool for_device)
867867
{
868868
struct pglist_data *pgdata = &contig_page_data;
869869
struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;

arch/x86/mm/init_32.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -822,11 +822,11 @@ void __init mem_init(void)
822822
}
823823

824824
#ifdef CONFIG_MEMORY_HOTPLUG
825-
int arch_add_memory(int nid, u64 start, u64 size)
825+
int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
826826
{
827827
struct pglist_data *pgdata = NODE_DATA(nid);
828828
struct zone *zone = pgdata->node_zones +
829-
zone_for_memory(nid, start, size, ZONE_HIGHMEM);
829+
zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
830830
unsigned long start_pfn = start >> PAGE_SHIFT;
831831
unsigned long nr_pages = size >> PAGE_SHIFT;
832832

arch/x86/mm/init_64.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
687687
* Memory is added always to NORMAL zone. This means you will never get
688688
* additional DMA/DMA32 memory.
689689
*/
690-
int arch_add_memory(int nid, u64 start, u64 size)
690+
int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
691691
{
692692
struct pglist_data *pgdat = NODE_DATA(nid);
693693
struct zone *zone = pgdat->node_zones +
694-
zone_for_memory(nid, start, size, ZONE_NORMAL);
694+
zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
695695
unsigned long start_pfn = start >> PAGE_SHIFT;
696696
unsigned long nr_pages = size >> PAGE_SHIFT;
697697
int ret;

include/linux/memory_hotplug.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,8 +266,9 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
266266
extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
267267
void *arg, int (*func)(struct memory_block *, void *));
268268
extern int add_memory(int nid, u64 start, u64 size);
269-
extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default);
270-
extern int arch_add_memory(int nid, u64 start, u64 size);
269+
extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
270+
bool for_device);
271+
extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
271272
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
272273
extern bool is_memblock_offlined(struct memory_block *mem);
273274
extern void remove_memory(int nid, u64 start, u64 size);

include/linux/mmzone.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,11 @@ enum zone_type {
319319
ZONE_HIGHMEM,
320320
#endif
321321
ZONE_MOVABLE,
322+
#ifdef CONFIG_ZONE_DEVICE
323+
ZONE_DEVICE,
324+
#endif
322325
__MAX_NR_ZONES
326+
323327
};
324328

325329
#ifndef __GENERATING_BOUNDS_H
@@ -794,6 +798,25 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
794798
return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
795799
}
796800

/*
 * zone_id - return the index of @zone within its node's zone array.
 *
 * The owning node is taken from zone->zone_pgdat and the result is the
 * pointer difference between @zone and the start of pgdat->node_zones.
 * NOTE(review): callers in this change compare the result against
 * enum zone_type constants (e.g. ZONE_DEVICE), so the index is presumably
 * expected to match that enum's ordering - confirm against zone setup code.
 */
801+
static inline int zone_id(const struct zone *zone)
802+
{
803+
struct pglist_data *pgdat = zone->zone_pgdat;
804+
805+
return zone - pgdat->node_zones;
806+
}
807+
/*
 * is_dev_zone - report whether @zone is the ZONE_DEVICE zone.
 *
 * With CONFIG_ZONE_DEVICE enabled, the zone's index (zone_id()) is compared
 * against ZONE_DEVICE. Without it the ZONE_DEVICE enum constant is not
 * defined, so a stub that always returns false is provided instead; this
 * lets callers test for device zones without their own #ifdef.
 */
808+
#ifdef CONFIG_ZONE_DEVICE
809+
static inline bool is_dev_zone(const struct zone *zone)
810+
{
811+
return zone_id(zone) == ZONE_DEVICE;
812+
}
813+
#else
814+
static inline bool is_dev_zone(const struct zone *zone)
815+
{
816+
return false;
817+
}
818+
#endif
819+
797820
#include <linux/memory_hotplug.h>
798821

799822
extern struct mutex zonelists_mutex;

mm/Kconfig

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,3 +654,20 @@ config DEFERRED_STRUCT_PAGE_INIT
654654
when kswapd starts. This has a potential performance impact on
655655
processes running early in the lifetime of the system until kswapd
656656
finishes the initialisation.
657+
658+
config ZONE_DEVICE
659+
bool "Device memory (pmem, etc...) hotplug support" if EXPERT
660+
default !ZONE_DMA
661+
depends on !ZONE_DMA
662+
depends on MEMORY_HOTPLUG
663+
depends on MEMORY_HOTREMOVE
664+
depends on X86_64 #arch_add_memory() comprehends device memory
665+
666+
help
667+
Device memory hotplug support allows for establishing pmem,
668+
or other device driver discovered memory regions, in the
669+
memmap. This allows pfn_to_page() lookups of otherwise
670+
"device-physical" addresses which is needed for using a DAX
671+
mapping in an O_DIRECT operation, among other things.
672+
673+
If FS_DAX is enabled, then say Y.

mm/memory_hotplug.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -770,7 +770,10 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
770770

771771
start = phys_start_pfn << PAGE_SHIFT;
772772
size = nr_pages * PAGE_SIZE;
773-
ret = release_mem_region_adjustable(&iomem_resource, start, size);
773+
774+
/* in the ZONE_DEVICE case device driver owns the memory region */
775+
if (!is_dev_zone(zone))
776+
ret = release_mem_region_adjustable(&iomem_resource, start, size);
774777
if (ret) {
775778
resource_size_t endres = start + size - 1;
776779

@@ -1207,8 +1210,13 @@ static int should_add_memory_movable(int nid, u64 start, u64 size)
12071210
return 0;
12081211
}
12091212

1210-
int zone_for_memory(int nid, u64 start, u64 size, int zone_default)
1213+
int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
1214+
bool for_device)
12111215
{
1216+
#ifdef CONFIG_ZONE_DEVICE
1217+
if (for_device)
1218+
return ZONE_DEVICE;
1219+
#endif
12121220
if (should_add_memory_movable(nid, start, size))
12131221
return ZONE_MOVABLE;
12141222

@@ -1249,7 +1257,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
12491257
}
12501258

12511259
/* call arch's memory hotadd */
1252-
ret = arch_add_memory(nid, start, size);
1260+
ret = arch_add_memory(nid, start, size, false);
12531261

12541262
if (ret < 0)
12551263
goto error;

mm/page_alloc.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
207207
"HighMem",
208208
#endif
209209
"Movable",
210+
#ifdef CONFIG_ZONE_DEVICE
211+
"Device",
212+
#endif
210213
};
211214

212215
int min_free_kbytes = 1024;

0 commit comments

Comments
 (0)