diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8f441dab0396..7116fda7077f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1726,16 +1726,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted. option description. memmap=nn[KMG]@ss[KMG] - [KNL] Force usage of a specific region of memory - Region of memory to be used, from ss to ss+nn. + [KNL] Force usage of a specific region of memory. + Region of memory to be used is from ss to ss+nn. memmap=nn[KMG]#ss[KMG] [KNL,ACPI] Mark specific memory as ACPI data. - Region of memory to be used, from ss to ss+nn. + Region of memory to be marked is from ss to ss+nn. memmap=nn[KMG]$ss[KMG] [KNL,ACPI] Mark specific memory as reserved. - Region of memory to be used, from ss to ss+nn. + Region of memory to be reserved is from ss to ss+nn. Example: Exclude memory from 0x18690000-0x1869ffff memmap=64K$0x18690000 or diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 81b2750f3666..27aa0455fab3 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) struct numa_memblk *mb = &mi->blk[i]; memblock_set_node(mb->start, mb->end - mb->start, &memblock.memory, mb->nid); - - /* - * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. - */ - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); } /* @@ -565,10 +557,21 @@ static void __init numa_init_array(void) static void __init numa_clear_kernel_node_hotplug(void) { int i, nid; - nodemask_t numa_kernel_nodes; + nodemask_t numa_kernel_nodes = NODE_MASK_NONE; unsigned long start, end; struct memblock_type *type = &memblock.reserved; + /* + * At this time, all memory regions reserved by memblock are + * used by the kernel. Set the nid in memblock.reserved will + * mark out all the nodes the kernel resides in. + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = &numa_meminfo.blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.reserved, mb->nid); + } + /* Mark all kernel nodes. */ for (i = 0; i < type->cnt; i++) node_set(type->regions[i].nid, numa_kernel_nodes); diff --git a/fs/buffer.c b/fs/buffer.c index 651dba10b9c2..27265a8b43c1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -654,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - spin_lock_irq(&mapping->tree_lock); + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 8750ae1b8636..aada5801567a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, enum ocfs2_alloc_restarted *reason_ret) { int status = 0, err = 0; + int need_free = 0; int free_extents; enum ocfs2_alloc_restarted reason = RESTART_NONE; u32 bit_off, num_bits; @@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } block = ocfs2_clusters_to_blocks(osb->sb, bit_off); @@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, num_bits, flags, meta_ac); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } ocfs2_journal_dirty(handle, et->et_root_bh); @@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, reason = RESTART_TRANS; } +bail: + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num_bits); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num_bits); + } + leave: if (reason_ret) *reason_ret = reason; @@ -6805,6 +6821,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { int ret, i, has_data, num_pages = 0; + int need_free = 0; + u32 bit_off, num; handle_t *handle; u64 uninitialized_var(block); struct ocfs2_inode_info *oi = OCFS2_I(inode); @@ -6850,7 +6868,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } if (has_data) { - u32 bit_off, num; unsigned int page_end; u64 phys; @@ -6886,6 +6903,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6896,6 +6914,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_read_inline_data(inode, pages[0], di_bh); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6927,6 +6946,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6938,6 +6958,18 @@ out_commit: dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num); + } + ocfs2_commit_trans(osb, handle); out_unlock: diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index cd5496b7a0a3..044013455621 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -781,6 +781,48 @@ bail: return status; } +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits) +{ + int status, start; + u32 clear_bits; + struct inode *local_alloc_inode; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + bitmap = la->la_bitmap; + start = bit_off - le32_to_cpu(la->la_bm_off); + clear_bits = num_bits; + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while (clear_bits--) + ocfs2_clear_bit(start++, bitmap); + + le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: + return status; +} + static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) { u32 count; diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 1be9b5864460..44a7d1fb2dec 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, u32 *bit_off, u32 *num_bits); +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits); + void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters); void ocfs2_la_enable_worker(struct work_struct *work); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e464b4e987e8..d1fe1a761047 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -228,9 +228,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) PAGEFLAG(MappedToDisk, mappedtodisk) -/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ +/* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */ +PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) #ifdef CONFIG_HIGHMEM /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2d30e2cfe804..7106cb1aca8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2173,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page) if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; + unsigned long flags; if (!mapping) return 1; - spin_lock_irq(&mapping->tree_lock); + spin_lock_irqsave(&mapping->tree_lock, flags); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); @@ -2186,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/mm/swap_state.c b/mm/swap_state.c index 98e85e9c2b2d..e76ace30d436 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) return ret; } +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); - if (page) + if (page) { INC_CACHE_INFO(find_success); + if (TestClearPageReadahead(page)) + atomic_inc(&swapin_readahead_hits); + } INC_CACHE_INFO(find_total); return page; @@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int pages, max_pages, last_ra; + static atomic_t last_readahead_pages; + + max_pages = 1 << ACCESS_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). + */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory @@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long offset = swp_offset(entry); + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask; struct blk_plug plug; + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; + if (offset != entry_offset) + SetPageReadahead(page); page_cache_release(page); } blk_finish_plug(&plug); lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } diff --git a/mm/swapfile.c b/mm/swapfile.c index c6c13b050a58..4a7f7e6992b6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1923,7 +1923,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; - p->flags = 0; frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -1949,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&inode->i_mutex); } filp_close(swap_file, NULL); + + /* + * Clear the SWP_USED flag after all resources are freed so that swapon + * can reuse this swap_info in alloc_swap_info() safely. It is ok to + * not hold p->lock after we cleared its SWP_WRITEOK. + */ + spin_lock(&swap_lock); + p->flags = 0; + spin_unlock(&swap_lock); + err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait);