Btrfs: Fix streaming read performance with checksumming on

Large streaming reads make for large bios, which means each entry on the async work queue lists represents a large amount of data. IO congestion throttling on the device was kicking in before the async worker threads decided a single thread was busy and needed some help. The end result was that a streaming read would result in a single CPU running at 100% instead of balancing the work off to other CPUs. This patch also changes the pre-IO checksum lookup done by reads to work on a per-bio basis instead of a per-page basis. The per-page lookup resulted in many extra btree lookups on large streaming reads. Doing the checksum lookup right before bio submit allows us to reuse searches while processing adjacent offsets. Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-07-31 15:42:53 -04:00 · 2008-07-31 15:42:53 -04:00 · 61b4944018
commit 61b4944018
parent 37d1aeee39
5 changed files with 99 additions and 54 deletions
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@ -160,7 +160,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
 	INIT_LIST_HEAD(&workers->idle_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
-	workers->idle_thresh = 64;
+	workers->idle_thresh = 32;
 }

 /*
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@ -1613,6 +1613,8 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       struct btrfs_key *location, int mod);

 /* file-item.c */
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 objectid, u64 pos, u64 disk_offset,
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@ -1357,10 +1357,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
+
+	/* a higher idle thresh on the submit workers makes it much more
+	 * likely that bios will be sent down in a sane order to the
+	 * devices
+	 */
+	fs_info->submit_workers.idle_thresh = 64;
+
 	btrfs_init_workers(&fs_info->fixup_workers, 1);
 	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_write_workers,
 			   fs_info->thread_pool_size);
+
+	/*
+	 * endios are largely parallel and should have a very
+	 * low idle thresh
+	 */
+	fs_info->endio_workers.idle_thresh = 4;
+	fs_info->endio_write_workers.idle_thresh = 4;
+
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@ -134,6 +134,83 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }

+/*
+ * Look up the data checksum for every bio_vec of a read bio and record it
+ * in the inode's io_tree with set_state_private(), keyed by file offset.
+ *
+ * A sum reported by btrfs_find_ordered_sum() is used first.  Otherwise
+ * the csum item is searched for in the btree, and the item that was found
+ * is reused for every following offset it covers, so adjacent pages share
+ * one btrfs_lookup_csum() search.
+ *
+ * A failed csum lookup is logged and recorded as a checksum of 0; it does
+ * not stop processing of the remaining bio_vecs.
+ *
+ * Returns 0 on success, or -ENOMEM if the path could not be allocated.
+ */
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio)
+{
+	u32 sum;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	int bio_index = 0;
+	u64 offset;
+	u64 item_start_offset = 0;
+	u64 item_last_offset = 0;
+	u32 diff;
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_csum_item *item = NULL;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 2;
+
+	WARN_ON(bio->bi_vcnt <= 0);
+
+	while (bio_index < bio->bi_vcnt) {
+		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		ret = btrfs_find_ordered_sum(inode, offset, &sum);
+		if (ret == 0)
+			goto found;
+
+		/* only search the btree when the cached item can't be reused */
+		if (!item || offset < item_start_offset ||
+		    offset >= item_last_offset) {
+			struct btrfs_key found_key;
+			u32 item_size;
+
+			if (item)
+				btrfs_release_path(root, path);
+			item = btrfs_lookup_csum(NULL, root, path,
+						 inode->i_ino, offset, 0);
+			if (IS_ERR(item)) {
+				sum = 0;
+				printk("no csum found for inode %lu start "
+				       "%llu\n", inode->i_ino,
+				       (unsigned long long)offset);
+				/*
+				 * forget the failed lookup: the cached
+				 * range may still describe an older item,
+				 * and item must never be used while it
+				 * holds an error pointer
+				 */
+				item = NULL;
+				btrfs_release_path(root, path);
+				goto found;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+
+			/* remember the file range covered by this item */
+			item_start_offset = found_key.offset;
+			item_size = btrfs_item_size_nr(path->nodes[0],
+						       path->slots[0]);
+			item_last_offset = item_start_offset +
+				(item_size / BTRFS_CRC32_SIZE) *
+				root->sectorsize;
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_csum_item);
+		}
+		/*
+		 * this byte range must be able to fit inside
+		 * a single leaf so it will also fit inside a u32
+		 */
+		diff = offset - item_start_offset;
+		diff = diff / root->sectorsize;
+		diff = diff * BTRFS_CRC32_SIZE;
+
+		read_extent_buffer(path->nodes[0], &sum,
+				   (unsigned long)item + diff,
+				   BTRFS_CRC32_SIZE);
+found:
+		set_state_private(io_tree, offset, sum);
+		bio_index++;
+		bvec++;
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio)
 {
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@ -374,6 +374,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	BUG_ON(ret);

 	if (!(rw & (1 << BIO_RW))) {
+		if (!btrfs_test_opt(root, NODATASUM) &&
+		    !btrfs_test_flag(inode, NODATASUM)) {
+			btrfs_lookup_bio_sums(root, inode, bio);
+		}
 		goto mapit;
 	}

@ -598,58 +602,6 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }

-int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
-{
-	int ret = 0;
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_csum_item *item;
-	struct btrfs_path *path = NULL;
-	u32 csum;
-
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM))
-		return 0;
-
-	/*
-	 * It is possible there is an ordered extent that has
-	 * not yet finished for this range in the file.  If so,
-	 * that extent will have a csum cached, and it will insert
-	 * the sum after all the blocks in the extent are fully
-	 * on disk.  So, look for an ordered extent and use the
-	 * sum if found.  We have to do this before looking in the
-	 * btree because csum items are pre-inserted based on
-	 * the file size.  btrfs_lookup_csum might find an item
-	 * that still hasn't been fully filled.
-	 */
-	ret = btrfs_find_ordered_sum(inode, start, &csum);
-	if (ret == 0)
-		goto found;
-
-	ret = 0;
-	path = btrfs_alloc_path();
-	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
-	if (IS_ERR(item)) {
-		ret = PTR_ERR(item);
-		/* a csum that isn't present is a preallocated region. */
-		if (ret == -ENOENT || ret == -EFBIG)
-			ret = 0;
-		csum = 0;
-		printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
-		       start);
-		goto out;
-	}
-	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
-			   BTRFS_CRC32_SIZE);
-found:
-	set_state_private(io_tree, start, csum);
-out:
-	if (path)
-		btrfs_free_path(path);
-	return ret;
-}
-
 struct io_failure_record {
 	struct page *page;
 	u64 start;
@ -3613,7 +3565,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
 	.submit_bio_hook = btrfs_submit_bio_hook,
 	.merge_bio_hook = btrfs_merge_bio_hook,
-	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,