Skip to content

Commit 449dd69

Browse files
hnaz authored and torvalds committed
mm: keep page cache radix tree nodes in check
Previously, page cache radix tree nodes were freed after reclaim emptied out their page pointers. But now reclaim stores shadow entries in their place, which are only reclaimed when the inodes themselves are reclaimed. This is problematic for bigger files that are still in use after they have a significant amount of their cache reclaimed, without any of those pages actually refaulting. The shadow entries will just sit there and waste memory. In the worst case, the shadow entries will accumulate until the machine runs out of memory.

To get this under control, the VM will track radix tree nodes exclusively containing shadow entries on a per-NUMA node list. Per-NUMA rather than global because we expect the radix tree nodes themselves to be allocated node-locally and we want to reduce cross-node references of otherwise independent cache workloads. A simple shrinker will then reclaim these nodes on memory pressure.

A few things need to be stored in the radix tree node to implement the shadow node LRU and allow tree deletions coming from the list:

1. There is no index available that would describe the reverse path from the node up to the tree root, which is needed to perform a deletion. To solve this, encode in each node its offset inside the parent. This can be stored in the unused upper bits of the same member that stores the node's height at no extra space cost.

2. The number of shadow entries needs to be counted in addition to the regular entries, to quickly detect when the node is ready to go to the shadow node LRU list. The current entry count is an unsigned int but the maximum number of entries is 64, so a shadow counter can easily be stored in the unused upper bits.

3. Tree modification needs tree lock and tree root, which are located in the address space, so store an address_space backpointer in the node. The parent pointer of the node is in a union with the 2-word rcu_head, so the backpointer comes at no extra cost as well.

4. The node needs to be linked to an LRU list, which requires a list head inside the node. This does increase the size of the node, but it does not change the number of objects that fit into a slab page.

[[email protected]: export the right function]
Signed-off-by: Johannes Weiner <[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
Reviewed-by: Minchan Kim <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Bob Liu <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Greg Thelen <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Jan Kara <[email protected]>
Cc: KOSAKI Motohiro <[email protected]>
Cc: Luigi Semenzato <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Metin Doslu <[email protected]>
Cc: Michel Lespinasse <[email protected]>
Cc: Ozgun Erdogan <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Ryan Mallon <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 139e561 commit 449dd69

File tree

10 files changed

+359
-43
lines changed

10 files changed

+359
-43
lines changed

include/linux/list_lru.h

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@
1313
/* list_lru_walk_cb has to always return one of those */
1414
enum lru_status {
1515
LRU_REMOVED, /* item removed from list */
16+
LRU_REMOVED_RETRY, /* item removed, but lock has been
17+
dropped and reacquired */
1618
LRU_ROTATE, /* item referenced, give another pass */
1719
LRU_SKIP, /* item cannot be locked, skip */
1820
LRU_RETRY, /* item not freeable. May drop the lock
@@ -32,7 +34,11 @@ struct list_lru {
3234
};
3335

3436
void list_lru_destroy(struct list_lru *lru);
35-
int list_lru_init(struct list_lru *lru);
37+
int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
38+
static inline int list_lru_init(struct list_lru *lru)
39+
{
40+
return list_lru_init_key(lru, NULL);
41+
}
3642

3743
/**
3844
* list_lru_add: add an element to the lru list's tail

include/linux/mmzone.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -144,6 +144,7 @@ enum zone_stat_item {
144144
#endif
145145
WORKINGSET_REFAULT,
146146
WORKINGSET_ACTIVATE,
147+
WORKINGSET_NODERECLAIM,
147148
NR_ANON_TRANSPARENT_HUGEPAGES,
148149
NR_FREE_CMA_PAGES,
149150
NR_VM_ZONE_STAT_ITEMS };

include/linux/radix-tree.h

Lines changed: 24 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
7272
#define RADIX_TREE_TAG_LONGS \
7373
((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
7474

75+
#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
76+
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
77+
RADIX_TREE_MAP_SHIFT))
78+
79+
/* Height component in node->path */
80+
#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1)
81+
#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)
82+
83+
/* Internally used bits of node->count */
84+
#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
85+
#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
86+
7587
struct radix_tree_node {
76-
unsigned int height; /* Height from the bottom */
88+
unsigned int path; /* Offset in parent & height from the bottom */
7789
unsigned int count;
7890
union {
79-
struct radix_tree_node *parent; /* Used when ascending tree */
80-
struct rcu_head rcu_head; /* Used when freeing node */
91+
struct {
92+
/* Used when ascending tree */
93+
struct radix_tree_node *parent;
94+
/* For tree user */
95+
void *private_data;
96+
};
97+
/* Used when freeing node */
98+
struct rcu_head rcu_head;
8199
};
100+
/* For tree user */
101+
struct list_head private_list;
82102
void __rcu *slots[RADIX_TREE_MAP_SIZE];
83103
unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
84104
};
85105

86-
#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
87-
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
88-
RADIX_TREE_MAP_SHIFT))
89-
90106
/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
91107
struct radix_tree_root {
92108
unsigned int height;
@@ -251,7 +267,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
251267
struct radix_tree_node **nodep, void ***slotp);
252268
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
253269
void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
254-
bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
270+
bool __radix_tree_delete_node(struct radix_tree_root *root,
255271
struct radix_tree_node *node);
256272
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
257273
void *radix_tree_delete(struct radix_tree_root *, unsigned long);

include/linux/swap.h

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -264,6 +264,37 @@ struct swap_list_t {
264264
void *workingset_eviction(struct address_space *mapping, struct page *page);
265265
bool workingset_refault(void *shadow);
266266
void workingset_activation(struct page *page);
267+
extern struct list_lru workingset_shadow_nodes;
268+
269+
static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
270+
{
271+
return node->count & RADIX_TREE_COUNT_MASK;
272+
}
273+
274+
static inline void workingset_node_pages_inc(struct radix_tree_node *node)
275+
{
276+
node->count++;
277+
}
278+
279+
static inline void workingset_node_pages_dec(struct radix_tree_node *node)
280+
{
281+
node->count--;
282+
}
283+
284+
static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
285+
{
286+
return node->count >> RADIX_TREE_COUNT_SHIFT;
287+
}
288+
289+
static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
290+
{
291+
node->count += 1U << RADIX_TREE_COUNT_SHIFT;
292+
}
293+
294+
static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
295+
{
296+
node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
297+
}
267298

268299
/* linux/mm/page_alloc.c */
269300
extern unsigned long totalram_pages;

lib/radix-tree.c

Lines changed: 22 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -342,7 +342,8 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
342342

343343
/* Increase the height. */
344344
newheight = root->height+1;
345-
node->height = newheight;
345+
BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
346+
node->path = newheight;
346347
node->count = 1;
347348
node->parent = NULL;
348349
slot = root->rnode;
@@ -400,11 +401,12 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
400401
/* Have to add a child node. */
401402
if (!(slot = radix_tree_node_alloc(root)))
402403
return -ENOMEM;
403-
slot->height = height;
404+
slot->path = height;
404405
slot->parent = node;
405406
if (node) {
406407
rcu_assign_pointer(node->slots[offset], slot);
407408
node->count++;
409+
slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
408410
} else
409411
rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
410412
}
@@ -498,7 +500,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
498500
}
499501
node = indirect_to_ptr(node);
500502

501-
height = node->height;
503+
height = node->path & RADIX_TREE_HEIGHT_MASK;
502504
if (index > radix_tree_maxindex(height))
503505
return NULL;
504506

@@ -704,7 +706,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
704706
return (index == 0);
705707
node = indirect_to_ptr(node);
706708

707-
height = node->height;
709+
height = node->path & RADIX_TREE_HEIGHT_MASK;
708710
if (index > radix_tree_maxindex(height))
709711
return 0;
710712

@@ -741,7 +743,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
741743
{
742744
unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
743745
struct radix_tree_node *rnode, *node;
744-
unsigned long index, offset;
746+
unsigned long index, offset, height;
745747

746748
if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
747749
return NULL;
@@ -772,7 +774,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
772774
return NULL;
773775

774776
restart:
775-
shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT;
777+
height = rnode->path & RADIX_TREE_HEIGHT_MASK;
778+
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
776779
offset = index >> shift;
777780

778781
/* Index outside of the tree */
@@ -1142,7 +1145,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
11421145
unsigned int shift, height;
11431146
unsigned long i;
11441147

1145-
height = slot->height;
1148+
height = slot->path & RADIX_TREE_HEIGHT_MASK;
11461149
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
11471150

11481151
for ( ; height > 1; height--) {
@@ -1205,7 +1208,8 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
12051208
}
12061209

12071210
node = indirect_to_ptr(node);
1208-
max_index = radix_tree_maxindex(node->height);
1211+
max_index = radix_tree_maxindex(node->path &
1212+
RADIX_TREE_HEIGHT_MASK);
12091213
if (cur_index > max_index) {
12101214
rcu_read_unlock();
12111215
break;
@@ -1301,7 +1305,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
13011305
*
13021306
* Returns %true if @node was freed, %false otherwise.
13031307
*/
1304-
bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
1308+
bool __radix_tree_delete_node(struct radix_tree_root *root,
13051309
struct radix_tree_node *node)
13061310
{
13071311
bool deleted = false;
@@ -1320,9 +1324,10 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
13201324

13211325
parent = node->parent;
13221326
if (parent) {
1323-
index >>= RADIX_TREE_MAP_SHIFT;
1327+
unsigned int offset;
13241328

1325-
parent->slots[index & RADIX_TREE_MAP_MASK] = NULL;
1329+
offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
1330+
parent->slots[offset] = NULL;
13261331
parent->count--;
13271332
} else {
13281333
root_tag_clear_all(root);
@@ -1386,7 +1391,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
13861391
node->slots[offset] = NULL;
13871392
node->count--;
13881393

1389-
__radix_tree_delete_node(root, index, node);
1394+
__radix_tree_delete_node(root, node);
13901395

13911396
return entry;
13921397
}
@@ -1419,9 +1424,12 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
14191424
EXPORT_SYMBOL(radix_tree_tagged);
14201425

14211426
static void
1422-
radix_tree_node_ctor(void *node)
1427+
radix_tree_node_ctor(void *arg)
14231428
{
1424-
memset(node, 0, sizeof(struct radix_tree_node));
1429+
struct radix_tree_node *node = arg;
1430+
1431+
memset(node, 0, sizeof(*node));
1432+
INIT_LIST_HEAD(&node->private_list);
14251433
}
14261434

14271435
static __init unsigned long __maxindex(unsigned int height)

mm/filemap.c

Lines changed: 74 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -110,11 +110,17 @@
110110
static void page_cache_tree_delete(struct address_space *mapping,
111111
struct page *page, void *shadow)
112112
{
113-
if (shadow) {
114-
void **slot;
113+
struct radix_tree_node *node;
114+
unsigned long index;
115+
unsigned int offset;
116+
unsigned int tag;
117+
void **slot;
115118

116-
slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
117-
radix_tree_replace_slot(slot, shadow);
119+
VM_BUG_ON(!PageLocked(page));
120+
121+
__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
122+
123+
if (shadow) {
118124
mapping->nrshadows++;
119125
/*
120126
* Make sure the nrshadows update is committed before
@@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping,
123129
* same time and miss a shadow entry.
124130
*/
125131
smp_wmb();
126-
} else
127-
radix_tree_delete(&mapping->page_tree, page->index);
132+
}
128133
mapping->nrpages--;
134+
135+
if (!node) {
136+
/* Clear direct pointer tags in root node */
137+
mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
138+
radix_tree_replace_slot(slot, shadow);
139+
return;
140+
}
141+
142+
/* Clear tree tags for the removed page */
143+
index = page->index;
144+
offset = index & RADIX_TREE_MAP_MASK;
145+
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
146+
if (test_bit(offset, node->tags[tag]))
147+
radix_tree_tag_clear(&mapping->page_tree, index, tag);
148+
}
149+
150+
/* Delete page, swap shadow entry */
151+
radix_tree_replace_slot(slot, shadow);
152+
workingset_node_pages_dec(node);
153+
if (shadow)
154+
workingset_node_shadows_inc(node);
155+
else
156+
if (__radix_tree_delete_node(&mapping->page_tree, node))
157+
return;
158+
159+
/*
160+
* Track node that only contains shadow entries.
161+
*
162+
* Avoid acquiring the list_lru lock if already tracked. The
163+
* list_empty() test is safe as node->private_list is
164+
* protected by mapping->tree_lock.
165+
*/
166+
if (!workingset_node_pages(node) &&
167+
list_empty(&node->private_list)) {
168+
node->private_data = mapping;
169+
list_lru_add(&workingset_shadow_nodes, &node->private_list);
170+
}
129171
}
130172

131173
/*
@@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
471513
static int page_cache_tree_insert(struct address_space *mapping,
472514
struct page *page, void **shadowp)
473515
{
516+
struct radix_tree_node *node;
474517
void **slot;
475518
int error;
476519

477-
slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
478-
if (slot) {
520+
error = __radix_tree_create(&mapping->page_tree, page->index,
521+
&node, &slot);
522+
if (error)
523+
return error;
524+
if (*slot) {
479525
void *p;
480526

481527
p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
482528
if (!radix_tree_exceptional_entry(p))
483529
return -EEXIST;
484-
radix_tree_replace_slot(slot, page);
485-
mapping->nrshadows--;
486-
mapping->nrpages++;
487530
if (shadowp)
488531
*shadowp = p;
489-
return 0;
532+
mapping->nrshadows--;
533+
if (node)
534+
workingset_node_shadows_dec(node);
490535
}
491-
error = radix_tree_insert(&mapping->page_tree, page->index, page);
492-
if (!error)
493-
mapping->nrpages++;
494-
return error;
536+
radix_tree_replace_slot(slot, page);
537+
mapping->nrpages++;
538+
if (node) {
539+
workingset_node_pages_inc(node);
540+
/*
541+
* Don't track node that contains actual pages.
542+
*
543+
* Avoid acquiring the list_lru lock if already
544+
* untracked. The list_empty() test is safe as
545+
* node->private_list is protected by
546+
* mapping->tree_lock.
547+
*/
548+
if (!list_empty(&node->private_list))
549+
list_lru_del(&workingset_shadow_nodes,
550+
&node->private_list);
551+
}
552+
return 0;
495553
}
496554

497555
static int __add_to_page_cache_locked(struct page *page,

0 commit comments

Comments
 (0)