forked from Minki/linux
72abc8f4b4
I hit the same assert failed as Dolev Raviv reported in Kernel v3.10 shows like this: [ 9641.164028] UBIFS assert failed in shrink_tnc at 131 (pid 13297) [ 9641.234078] CPU: 1 PID: 13297 Comm: mmap.test Tainted: G O 3.10.40 #1 [ 9641.234116] [<c0011a6c>] (unwind_backtrace+0x0/0x12c) from [<c000d0b0>] (show_stack+0x20/0x24) [ 9641.234137] [<c000d0b0>] (show_stack+0x20/0x24) from [<c0311134>] (dump_stack+0x20/0x28) [ 9641.234188] [<c0311134>] (dump_stack+0x20/0x28) from [<bf22425c>] (shrink_tnc_trees+0x25c/0x350 [ubifs]) [ 9641.234265] [<bf22425c>] (shrink_tnc_trees+0x25c/0x350 [ubifs]) from [<bf2245ac>] (ubifs_shrinker+0x25c/0x310 [ubifs]) [ 9641.234307] [<bf2245ac>] (ubifs_shrinker+0x25c/0x310 [ubifs]) from [<c00cdad8>] (shrink_slab+0x1d4/0x2f8) [ 9641.234327] [<c00cdad8>] (shrink_slab+0x1d4/0x2f8) from [<c00d03d0>] (do_try_to_free_pages+0x300/0x544) [ 9641.234344] [<c00d03d0>] (do_try_to_free_pages+0x300/0x544) from [<c00d0a44>] (try_to_free_pages+0x2d0/0x398) [ 9641.234363] [<c00d0a44>] (try_to_free_pages+0x2d0/0x398) from [<c00c6a60>] (__alloc_pages_nodemask+0x494/0x7e8) [ 9641.234382] [<c00c6a60>] (__alloc_pages_nodemask+0x494/0x7e8) from [<c00f62d8>] (new_slab+0x78/0x238) [ 9641.234400] [<c00f62d8>] (new_slab+0x78/0x238) from [<c031081c>] (__slab_alloc.constprop.42+0x1a4/0x50c) [ 9641.234419] [<c031081c>] (__slab_alloc.constprop.42+0x1a4/0x50c) from [<c00f80e8>] (kmem_cache_alloc_trace+0x54/0x188) [ 9641.234459] [<c00f80e8>] (kmem_cache_alloc_trace+0x54/0x188) from [<bf227908>] (do_readpage+0x168/0x468 [ubifs]) [ 9641.234553] [<bf227908>] (do_readpage+0x168/0x468 [ubifs]) from [<bf2296a0>] (ubifs_readpage+0x424/0x464 [ubifs]) [ 9641.234606] [<bf2296a0>] (ubifs_readpage+0x424/0x464 [ubifs]) from [<c00c17c0>] (filemap_fault+0x304/0x418) [ 9641.234638] [<c00c17c0>] (filemap_fault+0x304/0x418) from [<c00de694>] (__do_fault+0xd4/0x530) [ 9641.234665] [<c00de694>] (__do_fault+0xd4/0x530) from [<c00e10c0>] (handle_pte_fault+0x480/0xf54) [ 9641.234690] [<c00e10c0>] 
(handle_pte_fault+0x480/0xf54) from [<c00e2bf8>] (handle_mm_fault+0x140/0x184) [ 9641.234716] [<c00e2bf8>] (handle_mm_fault+0x140/0x184) from [<c0316688>] (do_page_fault+0x150/0x3ac) [ 9641.234737] [<c0316688>] (do_page_fault+0x150/0x3ac) from [<c000842c>] (do_DataAbort+0x3c/0xa0) [ 9641.234759] [<c000842c>] (do_DataAbort+0x3c/0xa0) from [<c0314e38>] (__dabt_usr+0x38/0x40) After analyzing the code, I found a condition that may cause this failed in correct operations. Thus, I think this assertion is wrong and should be removed. Suppose there are two clean znodes and one dirty znode in TNC. So the per-filesystem atomic_t @clean_zn_cnt is (2). If commit start, dirty_znode is set to COW_ZNODE in get_znodes_to_commit() in case of potentially ops on this znode. We clear COW bit and DIRTY bit in write_index() without @tnc_mutex locked. We don't increase @clean_zn_cnt in this place. As the comments in write_index() shows, if another process hold @tnc_mutex and dirty this znode after we clean it, @clean_zn_cnt would be decreased to (1). We will increase @clean_zn_cnt to (2) with @tnc_mutex locked in free_obsolete_znodes() to keep it right. If shrink_tnc() performs between decrease and increase, it will release other 2 clean znodes it holds and found @clean_zn_cnt is less than zero (1 - 2 = -1), then hit the assertion. Because free_obsolete_znodes() will soon correct @clean_zn_cnt and no harm to fs in this case, I think this assertion could be removed. 
2 clean znodes and 1 dirty znode, @clean_zn_cnt == 2 Thread A (commit) Thread B (write or others) Thread C (shrinker) ->write_index ->clear_bit(DIRTY_NODE) ->clear_bit(COW_ZNODE) @clean_zn_cnt == 2 ->mutex_locked(&tnc_mutex) ->dirty_cow_znode ->!ubifs_zn_cow(znode) ->!test_and_set_bit(DIRTY_NODE) ->atomic_dec(&clean_zn_cnt) ->mutex_unlocked(&tnc_mutex) @clean_zn_cnt == 1 ->mutex_locked(&tnc_mutex) ->shrink_tnc ->destroy_tnc_subtree ->atomic_sub(&clean_zn_cnt, 2) ->ubifs_assert <- hit ->mutex_unlocked(&tnc_mutex) @clean_zn_cnt == -1 ->mutex_lock(&tnc_mutex) ->free_obsolete_znodes ->atomic_inc(&clean_zn_cnt) ->mutex_unlock(&tnc_mutex) @clean_zn_cnt == 0 (correct after shrink) Signed-off-by: hujianyang <hujianyang@huawei.com> Cc: stable@vger.kernel.org Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
332 lines
9.6 KiB
C
332 lines
9.6 KiB
C
/*
|
|
* This file is part of UBIFS.
|
|
*
|
|
* Copyright (C) 2006-2008 Nokia Corporation.
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify it
|
|
* under the terms of the GNU General Public License version 2 as published by
|
|
* the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
* more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License along with
|
|
* this program; if not, write to the Free Software Foundation, Inc., 51
|
|
* Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*
|
|
* Authors: Artem Bityutskiy (Битюцкий Артём)
|
|
* Adrian Hunter
|
|
*/
|
|
|
|
/*
|
|
* This file implements UBIFS shrinker which evicts clean znodes from the TNC
|
|
* tree when Linux VM needs more RAM.
|
|
*
|
|
* We do not implement any LRU lists to find oldest znodes to free because it
|
|
* would add additional overhead to the file system fast paths. So the shrinker
|
|
* just walks the TNC tree when searching for znodes to free.
|
|
*
|
|
* If the root of a TNC sub-tree is clean and old enough, then the children are
|
|
* also clean and old enough. So the shrinker walks the TNC in level order and
|
|
* dumps entire sub-trees.
|
|
*
|
|
* The age of znodes is just the time-stamp when they were last looked at.
|
|
* The current shrinker first tries to evict old znodes, then young ones.
|
|
*
|
|
* Since the shrinker is global, it has to protect against races with FS
|
|
* un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'.
|
|
*/
|
|
|
|
#include "ubifs.h"
|
|
|
|
/* List of all UBIFS file-system instances */
|
|
/*
 * Entries are 'struct ubifs_info' objects linked through their 'infos_list'
 * member. shrink_tnc_trees() and kick_a_thread() walk this list and rotate
 * entries to its tail with list_move_tail().
 */
LIST_HEAD(ubifs_infos);
|
|
|
|
/*
|
|
* We number each shrinker run and record the number on the ubifs_info structure
|
|
* so that we can easily work out which ubifs_info structures have already been
|
|
* done by the current run.
|
|
*/
|
|
/*
 * Incremented by shrink_tnc_trees() while it holds 'ubifs_infos_lock'; the
 * value 0 is skipped there, so a run is never numbered 0.
 */
static unsigned int shrinker_run_no;
|
|
|
|
/* Protects 'ubifs_infos' list */
|
|
/*
 * Taken by shrink_tnc_trees() and kick_a_thread(). shrink_tnc_trees() drops it
 * around the call to shrink_tnc(), and kick_a_thread() drops it before calling
 * ubifs_request_bg_commit().
 */
DEFINE_SPINLOCK(ubifs_infos_lock);
|
|
|
|
/* Global clean znode counter (for all mounted UBIFS instances) */
|
|
/*
 * shrink_tnc() subtracts the number of znodes it frees from this counter;
 * ubifs_shrink_count() and ubifs_shrink_scan() read it. The value may be
 * temporarily negative, which ubifs_shrink_count() accounts for.
 */
atomic_long_t ubifs_clean_zn_cnt;
|
|
|
|
/**
|
|
* shrink_tnc - shrink TNC tree.
|
|
* @c: UBIFS file-system description object
|
|
* @nr: number of znodes to free
|
|
* @age: the age of znodes to free
|
|
* @contention: if any contention, this is set to %1
|
|
*
|
|
* This function traverses TNC tree and frees clean znodes. It does not free
|
|
* clean znodes which younger then @age. Returns number of freed znodes.
|
|
*/
|
|
static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
|
|
{
|
|
int total_freed = 0;
|
|
struct ubifs_znode *znode, *zprev;
|
|
int time = get_seconds();
|
|
|
|
ubifs_assert(mutex_is_locked(&c->umount_mutex));
|
|
ubifs_assert(mutex_is_locked(&c->tnc_mutex));
|
|
|
|
if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0)
|
|
return 0;
|
|
|
|
/*
|
|
* Traverse the TNC tree in levelorder manner, so that it is possible
|
|
* to destroy large sub-trees. Indeed, if a znode is old, then all its
|
|
* children are older or of the same age.
|
|
*
|
|
* Note, we are holding 'c->tnc_mutex', so we do not have to lock the
|
|
* 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is
|
|
* changed only when the 'c->tnc_mutex' is held.
|
|
*/
|
|
zprev = NULL;
|
|
znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
|
|
while (znode && total_freed < nr &&
|
|
atomic_long_read(&c->clean_zn_cnt) > 0) {
|
|
int freed;
|
|
|
|
/*
|
|
* If the znode is clean, but it is in the 'c->cnext' list, this
|
|
* means that this znode has just been written to flash as a
|
|
* part of commit and was marked clean. They will be removed
|
|
* from the list at end commit. We cannot change the list,
|
|
* because it is not protected by any mutex (design decision to
|
|
* make commit really independent and parallel to main I/O). So
|
|
* we just skip these znodes.
|
|
*
|
|
* Note, the 'clean_zn_cnt' counters are not updated until
|
|
* after the commit, so the UBIFS shrinker does not report
|
|
* the znodes which are in the 'c->cnext' list as freeable.
|
|
*
|
|
* Also note, if the root of a sub-tree is not in 'c->cnext',
|
|
* then the whole sub-tree is not in 'c->cnext' as well, so it
|
|
* is safe to dump whole sub-tree.
|
|
*/
|
|
|
|
if (znode->cnext) {
|
|
/*
|
|
* Very soon these znodes will be removed from the list
|
|
* and become freeable.
|
|
*/
|
|
*contention = 1;
|
|
} else if (!ubifs_zn_dirty(znode) &&
|
|
abs(time - znode->time) >= age) {
|
|
if (znode->parent)
|
|
znode->parent->zbranch[znode->iip].znode = NULL;
|
|
else
|
|
c->zroot.znode = NULL;
|
|
|
|
freed = ubifs_destroy_tnc_subtree(znode);
|
|
atomic_long_sub(freed, &ubifs_clean_zn_cnt);
|
|
atomic_long_sub(freed, &c->clean_zn_cnt);
|
|
total_freed += freed;
|
|
znode = zprev;
|
|
}
|
|
|
|
if (unlikely(!c->zroot.znode))
|
|
break;
|
|
|
|
zprev = znode;
|
|
znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
|
|
cond_resched();
|
|
}
|
|
|
|
return total_freed;
|
|
}
|
|
|
|
/**
|
|
* shrink_tnc_trees - shrink UBIFS TNC trees.
|
|
* @nr: number of znodes to free
|
|
* @age: the age of znodes to free
|
|
* @contention: if any contention, this is set to %1
|
|
*
|
|
* This function walks the list of mounted UBIFS file-systems and frees clean
|
|
* znodes which are older than @age, until at least @nr znodes are freed.
|
|
* Returns the number of freed znodes.
|
|
*/
|
|
static int shrink_tnc_trees(int nr, int age, int *contention)
|
|
{
|
|
struct ubifs_info *c;
|
|
struct list_head *p;
|
|
unsigned int run_no;
|
|
int freed = 0;
|
|
|
|
spin_lock(&ubifs_infos_lock);
|
|
do {
|
|
run_no = ++shrinker_run_no;
|
|
} while (run_no == 0);
|
|
/* Iterate over all mounted UBIFS file-systems and try to shrink them */
|
|
p = ubifs_infos.next;
|
|
while (p != &ubifs_infos) {
|
|
c = list_entry(p, struct ubifs_info, infos_list);
|
|
/*
|
|
* We move the ones we do to the end of the list, so we stop
|
|
* when we see one we have already done.
|
|
*/
|
|
if (c->shrinker_run_no == run_no)
|
|
break;
|
|
if (!mutex_trylock(&c->umount_mutex)) {
|
|
/* Some un-mount is in progress, try next FS */
|
|
*contention = 1;
|
|
p = p->next;
|
|
continue;
|
|
}
|
|
/*
|
|
* We're holding 'c->umount_mutex', so the file-system won't go
|
|
* away.
|
|
*/
|
|
if (!mutex_trylock(&c->tnc_mutex)) {
|
|
mutex_unlock(&c->umount_mutex);
|
|
*contention = 1;
|
|
p = p->next;
|
|
continue;
|
|
}
|
|
spin_unlock(&ubifs_infos_lock);
|
|
/*
|
|
* OK, now we have TNC locked, the file-system cannot go away -
|
|
* it is safe to reap the cache.
|
|
*/
|
|
c->shrinker_run_no = run_no;
|
|
freed += shrink_tnc(c, nr, age, contention);
|
|
mutex_unlock(&c->tnc_mutex);
|
|
spin_lock(&ubifs_infos_lock);
|
|
/* Get the next list element before we move this one */
|
|
p = p->next;
|
|
/*
|
|
* Move this one to the end of the list to provide some
|
|
* fairness.
|
|
*/
|
|
list_move_tail(&c->infos_list, &ubifs_infos);
|
|
mutex_unlock(&c->umount_mutex);
|
|
if (freed >= nr)
|
|
break;
|
|
}
|
|
spin_unlock(&ubifs_infos_lock);
|
|
return freed;
|
|
}
|
|
|
|
/**
|
|
* kick_a_thread - kick a background thread to start commit.
|
|
*
|
|
* This function kicks a background thread to start background commit. Returns
|
|
* %-1 if a thread was kicked or there is another reason to assume the memory
|
|
* will soon be freed or become freeable. If there are no dirty znodes, returns
|
|
* %0.
|
|
*/
|
|
static int kick_a_thread(void)
|
|
{
|
|
int i;
|
|
struct ubifs_info *c;
|
|
|
|
/*
|
|
* Iterate over all mounted UBIFS file-systems and find out if there is
|
|
* already an ongoing commit operation there. If no, then iterate for
|
|
* the second time and initiate background commit.
|
|
*/
|
|
spin_lock(&ubifs_infos_lock);
|
|
for (i = 0; i < 2; i++) {
|
|
list_for_each_entry(c, &ubifs_infos, infos_list) {
|
|
long dirty_zn_cnt;
|
|
|
|
if (!mutex_trylock(&c->umount_mutex)) {
|
|
/*
|
|
* Some un-mount is in progress, it will
|
|
* certainly free memory, so just return.
|
|
*/
|
|
spin_unlock(&ubifs_infos_lock);
|
|
return -1;
|
|
}
|
|
|
|
dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
|
|
|
|
if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
|
|
c->ro_mount || c->ro_error) {
|
|
mutex_unlock(&c->umount_mutex);
|
|
continue;
|
|
}
|
|
|
|
if (c->cmt_state != COMMIT_RESTING) {
|
|
spin_unlock(&ubifs_infos_lock);
|
|
mutex_unlock(&c->umount_mutex);
|
|
return -1;
|
|
}
|
|
|
|
if (i == 1) {
|
|
list_move_tail(&c->infos_list, &ubifs_infos);
|
|
spin_unlock(&ubifs_infos_lock);
|
|
|
|
ubifs_request_bg_commit(c);
|
|
mutex_unlock(&c->umount_mutex);
|
|
return -1;
|
|
}
|
|
mutex_unlock(&c->umount_mutex);
|
|
}
|
|
}
|
|
spin_unlock(&ubifs_infos_lock);
|
|
|
|
return 0;
|
|
}
|
|
|
|
unsigned long ubifs_shrink_count(struct shrinker *shrink,
|
|
struct shrink_control *sc)
|
|
{
|
|
long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
|
|
|
|
/*
|
|
* Due to the way UBIFS updates the clean znode counter it may
|
|
* temporarily be negative.
|
|
*/
|
|
return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
|
|
}
|
|
|
|
unsigned long ubifs_shrink_scan(struct shrinker *shrink,
|
|
struct shrink_control *sc)
|
|
{
|
|
unsigned long nr = sc->nr_to_scan;
|
|
int contention = 0;
|
|
unsigned long freed;
|
|
long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
|
|
|
|
if (!clean_zn_cnt) {
|
|
/*
|
|
* No clean znodes, nothing to reap. All we can do in this case
|
|
* is to kick background threads to start commit, which will
|
|
* probably make clean znodes which, in turn, will be freeable.
|
|
* And we return -1 which means will make VM call us again
|
|
* later.
|
|
*/
|
|
dbg_tnc("no clean znodes, kick a thread");
|
|
return kick_a_thread();
|
|
}
|
|
|
|
freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention);
|
|
if (freed >= nr)
|
|
goto out;
|
|
|
|
dbg_tnc("not enough old znodes, try to free young ones");
|
|
freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention);
|
|
if (freed >= nr)
|
|
goto out;
|
|
|
|
dbg_tnc("not enough young znodes, free all");
|
|
freed += shrink_tnc_trees(nr - freed, 0, &contention);
|
|
|
|
if (!freed && contention) {
|
|
dbg_tnc("freed nothing, but contention");
|
|
return SHRINK_STOP;
|
|
}
|
|
|
|
out:
|
|
dbg_tnc("%lu znodes were freed, requested %lu", freed, nr);
|
|
return freed;
|
|
}
|