diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 3ce541753c88..f3434533fbf8 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -13,6 +13,8 @@ /* list_lru_walk_cb has to always return one of those */ enum lru_status { LRU_REMOVED, /* item removed from list */ + LRU_REMOVED_RETRY, /* item removed, but lock has been + dropped and reacquired */ LRU_ROTATE, /* item referenced, give another pass */ LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock @@ -32,7 +34,11 @@ struct list_lru { }; void list_lru_destroy(struct list_lru *lru); -int list_lru_init(struct list_lru *lru); +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key); +static inline int list_lru_init(struct list_lru *lru) +{ + return list_lru_init_key(lru, NULL); +} /** * list_lru_add: add an element to the lru list's tail diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f25db1d74a21..fac5509c18f0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -144,6 +144,7 @@ enum zone_stat_item { #endif WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, + WORKINGSET_NODERECLAIM, NR_ANON_TRANSPARENT_HUGEPAGES, NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 13636c40bc42..33170dbd9db4 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_TAG_LONGS \ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) -struct radix_tree_node { - unsigned int height; /* Height from the bottom */ - unsigned int count; - union { - struct radix_tree_node *parent; /* Used when ascending tree */ - struct rcu_head rcu_head; /* Used when freeing node */ - }; - void __rcu *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; -}; - #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) +/* Height component in node->path */ +#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1) +#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1) + +/* Internally used bits of node->count */ +#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) +#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) + +struct radix_tree_node { + unsigned int path; /* Offset in parent & height from the bottom */ + unsigned int count; + union { + struct { + /* Used when ascending tree */ + struct radix_tree_node *parent; + /* For tree user */ + void *private_data; + }; + /* Used when freeing node */ + struct rcu_head rcu_head; + }; + /* For tree user */ + struct list_head private_list; + void __rcu *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; +}; + /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { unsigned int height; @@ -251,7 +267,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); -bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, +bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); diff --git a/include/linux/swap.h b/include/linux/swap.h index b83cf61403ed..350711560753 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -264,6 +264,37 @@ struct swap_list_t { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); +extern struct list_lru workingset_shadow_nodes; + +static inline unsigned int workingset_node_pages(struct radix_tree_node *node) +{ + return node->count & RADIX_TREE_COUNT_MASK; +} + +static inline void workingset_node_pages_inc(struct radix_tree_node *node) +{ + node->count++; +} + +static inline void workingset_node_pages_dec(struct radix_tree_node *node) +{ + node->count--; +} + +static inline unsigned int workingset_node_shadows(struct radix_tree_node *node) +{ + return node->count >> RADIX_TREE_COUNT_SHIFT; +} + +static inline void workingset_node_shadows_inc(struct radix_tree_node *node) +{ + node->count += 1U << RADIX_TREE_COUNT_SHIFT; +} + +static inline void workingset_node_shadows_dec(struct radix_tree_node *node) +{ + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; +} /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d60be40c111b..9599aa72d7a0 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -342,7 +342,8 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) /* Increase the height. */ newheight = root->height+1; - node->height = newheight; + BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK); + node->path = newheight; node->count = 1; node->parent = NULL; slot = root->rnode; @@ -400,11 +401,12 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, /* Have to add a child node. */ if (!(slot = radix_tree_node_alloc(root))) return -ENOMEM; - slot->height = height; + slot->path = height; slot->parent = node; if (node) { rcu_assign_pointer(node->slots[offset], slot); node->count++; + slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT; } else rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); } @@ -498,7 +500,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, } node = indirect_to_ptr(node); - height = node->height; + height = node->path & RADIX_TREE_HEIGHT_MASK; if (index > radix_tree_maxindex(height)) return NULL; @@ -704,7 +706,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, return (index == 0); node = indirect_to_ptr(node); - height = node->height; + height = node->path & RADIX_TREE_HEIGHT_MASK; if (index > radix_tree_maxindex(height)) return 0; @@ -741,7 +743,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, { unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *rnode, *node; - unsigned long index, offset; + unsigned long index, offset, height; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; @@ -772,7 +774,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, return NULL; restart: - shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT; + height = rnode->path & RADIX_TREE_HEIGHT_MASK; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; offset = index >> shift; /* Index outside of the tree */ @@ -1142,7 +1145,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, unsigned int shift, height; unsigned long i; - height = slot->height; + height = slot->path & RADIX_TREE_HEIGHT_MASK; shift = (height-1) * RADIX_TREE_MAP_SHIFT; for ( ; height > 1; height--) { @@ -1205,7 +1208,8 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) } node = indirect_to_ptr(node); - max_index = radix_tree_maxindex(node->height); + max_index = radix_tree_maxindex(node->path & + RADIX_TREE_HEIGHT_MASK); if (cur_index > max_index) { rcu_read_unlock(); break; @@ -1301,7 +1305,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) * * Returns %true if @node was freed, %false otherwise. */ -bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, +bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { bool deleted = false; @@ -1320,9 +1324,10 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, parent = node->parent; if (parent) { - index >>= RADIX_TREE_MAP_SHIFT; + unsigned int offset; - parent->slots[index & RADIX_TREE_MAP_MASK] = NULL; + offset = node->path >> RADIX_TREE_HEIGHT_SHIFT; + parent->slots[offset] = NULL; parent->count--; } else { root_tag_clear_all(root); @@ -1386,7 +1391,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node->slots[offset] = NULL; node->count--; - __radix_tree_delete_node(root, index, node); + __radix_tree_delete_node(root, node); return entry; } @@ -1419,9 +1424,12 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) EXPORT_SYMBOL(radix_tree_tagged); static void -radix_tree_node_ctor(void *node) +radix_tree_node_ctor(void *arg) { - memset(node, 0, sizeof(struct radix_tree_node)); + struct radix_tree_node *node = arg; + + memset(node, 0, sizeof(*node)); + INIT_LIST_HEAD(&node->private_list); } static __init unsigned long __maxindex(unsigned int height) diff --git a/mm/filemap.c b/mm/filemap.c index a603c4d7d3c9..d6df3bacb0fb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -110,11 +110,17 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - if (shadow) { - void **slot; + struct radix_tree_node *node; + unsigned long index; + unsigned int offset; + unsigned int tag; + void **slot; - slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); - radix_tree_replace_slot(slot, shadow); + VM_BUG_ON(!PageLocked(page)); + + __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + + if (shadow) { mapping->nrshadows++; /* * Make sure the nrshadows update is committed before @@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping, * same time and miss a shadow entry. */ smp_wmb(); - } else - radix_tree_delete(&mapping->page_tree, page->index); + } mapping->nrpages--; + + if (!node) { + /* Clear direct pointer tags in root node */ + mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; + radix_tree_replace_slot(slot, shadow); + return; + } + + /* Clear tree tags for the removed page */ + index = page->index; + offset = index & RADIX_TREE_MAP_MASK; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (test_bit(offset, node->tags[tag])) + radix_tree_tag_clear(&mapping->page_tree, index, tag); + } + + /* Delete page, swap shadow entry */ + radix_tree_replace_slot(slot, shadow); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + return; + + /* + * Track node that only contains shadow entries. + * + * Avoid acquiring the list_lru lock if already tracked. The + * list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, &node->private_list); + } } /* @@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page); static int page_cache_tree_insert(struct address_space *mapping, struct page *page, void **shadowp) { + struct radix_tree_node *node; void **slot; int error; - slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); - if (slot) { + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { void *p; p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; - radix_tree_replace_slot(slot, page); - mapping->nrshadows--; - mapping->nrpages++; if (shadowp) *shadowp = p; - return 0; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); } - error = radix_tree_insert(&mapping->page_tree, page->index, page); - if (!error) - mapping->nrpages++; - return error; + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; } static int __add_to_page_cache_locked(struct page *page, diff --git a/mm/list_lru.c b/mm/list_lru.c index 72f9decb0104..f1a0db194173 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -87,11 +87,20 @@ restart: ret = isolate(item, &nlru->lock, cb_arg); switch (ret) { + case LRU_REMOVED_RETRY: + assert_spin_locked(&nlru->lock); case LRU_REMOVED: if (--nlru->nr_items == 0) node_clear(nid, lru->active_nodes); WARN_ON_ONCE(nlru->nr_items < 0); isolated++; + /* + * If the lru lock has been dropped, our list + * traversal is now invalid and so we have to + * restart from scratch. + */ + if (ret == LRU_REMOVED_RETRY) + goto restart; break; case LRU_ROTATE: list_move_tail(item, &nlru->list); @@ -103,6 +112,7 @@ restart: * The lru lock has been dropped, our list traversal is * now invalid and so we have to restart from scratch. */ + assert_spin_locked(&nlru->lock); goto restart; default: BUG(); @@ -114,7 +124,7 @@ restart: } EXPORT_SYMBOL_GPL(list_lru_walk_node); -int list_lru_init(struct list_lru *lru) +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; @@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru) nodes_clear(lru->active_nodes); for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); + if (key) + lockdep_set_class(&lru->node[i].lock, key); INIT_LIST_HEAD(&lru->node[i].list); lru->node[i].nr_items = 0; } return 0; } -EXPORT_SYMBOL_GPL(list_lru_init); +EXPORT_SYMBOL_GPL(list_lru_init_key); void list_lru_destroy(struct list_lru *lru) { diff --git a/mm/truncate.c b/mm/truncate.c index 0db9258319f0..e5cc39ab0751 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -25,6 +25,9 @@ static void clear_exceptional_entry(struct address_space *mapping, pgoff_t index, void *entry) { + struct radix_tree_node *node; + void **slot; + /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; @@ -35,8 +38,27 @@ static void clear_exceptional_entry(struct address_space *mapping, * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry) - mapping->nrshadows--; + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrshadows--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); +unlock: spin_unlock_irq(&mapping->tree_lock); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 843125a0e255..f3155d51acfd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -772,6 +772,7 @@ const char * const vmstat_text[] = { #endif "workingset_refault", "workingset_activate", + "workingset_nodereclaim", "nr_anon_transparent_hugepages", "nr_free_cma", "nr_dirty_threshold", diff --git a/mm/workingset.c b/mm/workingset.c index 8a6c7cff4923..f7216fa7da27 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -251,3 +251,164 @@ void workingset_activation(struct page *page) { atomic_long_inc(&page_zone(page)->inactive_age); } + +/* + * Shadow entries reflect the share of the working set that does not + * fit into memory, so their number depends on the access pattern of + * the workload. In most cases, they will refault or get reclaimed + * along with the inode, but a (malicious) workload that streams + * through files with a total size several times that of available + * memory, while preventing the inodes from being reclaimed, can + * create excessive amounts of shadow nodes. To keep a lid on this, + * track shadow nodes and reclaim them when they grow way past the + * point where they would still be useful. + */ + +struct list_lru workingset_shadow_nodes; + +static unsigned long count_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long shadow_nodes; + unsigned long max_nodes; + unsigned long pages; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + local_irq_enable(); + + pages = node_present_pages(sc->nid); + /* + * Active cache pages are limited to 50% of memory, and shadow + * entries that represent a refault distance bigger than that + * do not have any effect. Limit the number of shadow nodes + * such that shadow entries do not exceed the number of active + * cache pages, assuming a worst-case node population density + * of 1/8th on average. + * + * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * each, this will reclaim shadow entries when they consume + * ~2% of available memory: + * + * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + */ + max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + + if (shadow_nodes <= max_nodes) + return 0; + + return shadow_nodes - max_nodes; +} + +static enum lru_status shadow_lru_isolate(struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct address_space *mapping; + struct radix_tree_node *node; + unsigned int i; + int ret; + + /* + * Page cache insertions and deletions synchroneously maintain + * the shadow node LRU under the mapping->tree_lock and the + * lru_lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the lru_lock pins any + * address_space that has radix tree nodes on the LRU. + * + * We can then safely transition to the mapping->tree_lock to + * pin only the address_space of the particular node we want + * to reclaim, take the node off-LRU, and drop the lru_lock. + */ + + node = container_of(item, struct radix_tree_node, private_list); + mapping = node->private_data; + + /* Coming from the list, invert the lock order */ + if (!spin_trylock(&mapping->tree_lock)) { + spin_unlock(lru_lock); + ret = LRU_RETRY; + goto out; + } + + list_del_init(item); + spin_unlock(lru_lock); + + /* + * The nodes should only contain one or more shadow entries, + * no pages, so we expect to be able to remove them all and + * delete and free the empty node afterwards. + */ + + BUG_ON(!node->count); + BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[i]) { + BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + node->slots[i] = NULL; + BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + BUG_ON(!mapping->nrshadows); + mapping->nrshadows--; + } + } + BUG_ON(node->count); + inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); + if (!__radix_tree_delete_node(&mapping->page_tree, node)) + BUG(); + + spin_unlock(&mapping->tree_lock); + ret = LRU_REMOVED_RETRY; +out: + local_irq_enable(); + cond_resched(); + local_irq_disable(); + spin_lock(lru_lock); + return ret; +} + +static unsigned long scan_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long ret; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, + shadow_lru_isolate, NULL, &sc->nr_to_scan); + local_irq_enable(); + return ret; +} + +static struct shrinker workingset_shadow_shrinker = { + .count_objects = count_shadow_nodes, + .scan_objects = scan_shadow_nodes, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + +/* + * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe + * mapping->tree_lock. + */ +static struct lock_class_key shadow_nodes_key; + +static int __init workingset_init(void) +{ + int ret; + + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + if (ret) + goto err; + ret = register_shrinker(&workingset_shadow_shrinker); + if (ret) + goto err_list_lru; + return 0; +err_list_lru: + list_lru_destroy(&workingset_shadow_nodes); +err: + return ret; +} +module_init(workingset_init);