vfs-6.12.netfs

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZuQEvgAKCRCRxhvAZXjc
 onQWAQD6IxAKPU0zom2FoWNilvSzPs7WglTtvddX9pu/lT1RNAD/YC/wOLW8mvAv
 9oTAmigQDQQhEWdJA9RgLZBiw7k+DAw=
 =zWFb
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.12.netfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull netfs updates from Christian Brauner:
 "This contains the work to improve read/write performance for the new
  netfs library.

  The main performance enhancing changes are:

   - Define a structure, struct folio_queue, and a new iterator type,
     ITER_FOLIOQ, to hold a buffer as a replacement for ITER_XARRAY. See
     that patch for questions about naming and form.

     ITER_FOLIOQ is provided as a replacement for ITER_XARRAY. The
     problem with an xarray is that accessing it requires the use of a
     lock (typically the RCU read lock) - and this means that we can't
     supply iterate_and_advance() with a step function that might sleep
     (crypto for example) without having to drop the lock between pages.
     ITER_FOLIOQ is the iterator for a chain of folio_queue structs,
     where each folio_queue holds a small list of folios. A folio_queue
     struct is a simpler structure than xarray and is not subject to
     concurrent manipulation by the VM. folio_queue is used rather than
     a bvec[] as it can form lists of indefinite size, adding to one end
     and removing from the other on the fly.

   - Provide a copy_folio_from_iter() wrapper.

   - Make cifs RDMA support ITER_FOLIOQ.

   - Use folio queues in the write-side helpers instead of xarrays.

   - Add a function to reset the iterator in a subrequest.

   - Simplify the write-side helpers to use sheaves to skip gaps rather
     than trying to work out where gaps are.

   - In afs, make the read subrequests asynchronous, putting them into
     work items to allow the next patch to do progressive
     unlocking/reading.

   - Overhaul the read-side helpers to improve performance.

   - Fix the caching of a partial block at the end of a file.

   - Allow a store to be cancelled.

  Then some changes for cifs to make it use folio queues instead of
  xarrays for crypto bufferage:

   - Use raw iteration functions rather than manually coding iteration
     when hashing data.

   - Switch to using folio_queue for crypto buffers.

   - Remove the xarray bits.

  Make some adjustments to the /proc/fs/netfs/stats file:

   - All the netfs stats lines currently begin 'Netfs:'; change this
     prefix to something a bit more useful.

   - Add a couple of stats counters to track the numbers of skips and
     waits on the per-inode writeback serialisation lock to make it
     easier to check for this as a source of performance loss.

  Miscellaneous work:

   - Ensure that the sb_writers lock is taken around
     vfs_{set,remove}xattr() in the cachefiles code.

   - Reduce the number of conditional branches in netfs_perform_write().

   - Move the CIFS_INO_MODIFIED_ATTR flag to the netfs_inode struct and
     remove cifs_post_modify().

   - Move the max_len/max_nr_segs members from netfs_io_subrequest to
     netfs_io_stream as they're only needed for one subreq at a time.

   - Add an 'unknown' source value for tracing purposes.

   - Remove NETFS_COPY_TO_CACHE as it's no longer used.

   - Set the request work function up front at allocation time.

   - Use bh-disabling spinlocks for rreq->lock as cachefiles completion
     may be run from block-filesystem DIO completion in softirq context.

   - Remove fs/netfs/io.c"
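
As a rough illustration of the folio_queue/ITER_FOLIOQ design described
above, the sketch below strings together the helpers that appear in the
diffs further down (folioq_init(), folioq_append(), iov_iter_folio_queue()).
It is not taken from the series itself; the header location, the demo_*
naming and the error handling are assumptions.

    #include <linux/folio_queue.h>  /* assumed home of the folioq_*() helpers */
    #include <linux/uio.h>
    #include <linux/slab.h>
    #include <linux/gfp.h>

    /*
     * Build a one-segment folio_queue filled with order-0 folios and wrap it
     * in an ITER_DEST iov_iter.  Unlike ITER_XARRAY, nothing here takes the
     * RCU read lock, so a step function walking the result is free to sleep.
     * Further segments would be chained through fq->next/fq->prev.
     */
    static int demo_make_folioq_iter(struct iov_iter *iter, struct folio_queue **_fq)
    {
        struct folio_queue *fq;
        size_t size = 0;
        unsigned int i;

        fq = kmalloc(sizeof(*fq), GFP_KERNEL);
        if (!fq)
            return -ENOMEM;
        folioq_init(fq);

        for (i = 0; i < folioq_nr_slots(fq); i++) {
            struct folio *folio = folio_alloc(GFP_KERNEL, 0);

            if (!folio)
                break;              /* use however many we managed to get */
            folioq_append(fq, folio);
            size += folio_size(folio);
        }

        iov_iter_folio_queue(iter, ITER_DEST, fq, 0, 0, size);
        *_fq = fq;
        return 0;
    }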

* tag 'vfs-6.12.netfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (25 commits)
  docs: filesystems: corrected grammar of netfs page
  cifs: Don't support ITER_XARRAY
  cifs: Switch crypto buffer to use a folio_queue rather than an xarray
  cifs: Use iterate_and_advance*() routines directly for hashing
  netfs: Cancel dirty folios that have no storage destination
  cachefiles, netfs: Fix write to partial block at EOF
  netfs: Remove fs/netfs/io.c
  netfs: Speed up buffered reading
  afs: Make read subreqs async
  netfs: Simplify the writeback code
  netfs: Provide an iterator-reset function
  netfs: Use new folio_queue data type and iterator instead of xarray iter
  cifs: Provide the capability to extract from ITER_FOLIOQ to RDMA SGEs
  iov_iter: Provide copy_folio_from_iter()
  mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
  netfs: Use bh-disabling spinlocks for rreq->lock
  netfs: Set the request work function upon allocation
  netfs: Remove NETFS_COPY_TO_CACHE
  netfs: Reserve netfs_sreq_source 0 as unset/unknown
  netfs: Move max_len/max_nr_segs from netfs_io_subrequest to netfs_io_stream
  ...
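
The "bh-disabling spinlocks for rreq->lock" item above comes down to the
standard pattern sketched here.  This is not code from the series and the
demo_* names are made up; it only shows why the process-context side must
use spin_lock_bh() when the completion side may run from block-filesystem
DIO completion in softirq context on the same CPU.

    #include <linux/spinlock.h>
    #include <linux/list.h>

    struct demo_request {
        spinlock_t       lock;          /* spin_lock_init()'ed at allocation */
        struct list_head subrequests;
    };

    /* Process context: adding a subrequest must keep softirqs off while the
     * lock is held, or a completion interrupting us here would deadlock. */
    static void demo_add_subreq(struct demo_request *rreq, struct list_head *link)
    {
        spin_lock_bh(&rreq->lock);
        list_add_tail(link, &rreq->subrequests);
        spin_unlock_bh(&rreq->lock);
    }

    /* Completion path: may already be in softirq context, where a plain
     * spin_lock() is sufficient for SMP protection. */
    static void demo_complete_subreq(struct demo_request *rreq, struct list_head *link)
    {
        spin_lock(&rreq->lock);
        list_del(link);
        spin_unlock(&rreq->lock);
    }
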
Linus Torvalds 2024-09-16 12:13:31 +02:00
commit 35219bc5c7
42 changed files with 3537 additions and 2000 deletions


@ -116,7 +116,7 @@ The following services are provided:
* Handle local caching, allowing cached data and server-read data to be
interleaved for a single request.
* Handle clearing of bufferage that aren't on the server.
* Handle clearing of bufferage that isn't on the server.
* Handle retrying of reads that failed, switching reads from the cache to the
server as necessary.


@ -68,17 +68,22 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
unsigned long long pos = subreq->start + subreq->transferred;
int total, err;
total = p9_client_read(fid, subreq->start + subreq->transferred,
&subreq->io_iter, &err);
total = p9_client_read(fid, pos, &subreq->io_iter, &err);
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
if (subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (pos + total >= i_size_read(rreq->inode))
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
netfs_subreq_terminated(subreq, err ?: total, false);
if (!err)
subreq->transferred += total;
netfs_read_subreq_terminated(subreq, err, false);
}
/**
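
The v9fs_issue_read() hunk above shows the new ->issue_read() completion
convention: the filesystem accounts whatever it read into subreq->transferred
(setting NETFS_SREQ_HIT_EOF if the read reached the file size) and then
passes only an error code to netfs_read_subreq_terminated().  A stripped-down
sketch of the same pattern, with a hypothetical myfs_read_from_server()
standing in for the transport call:

    #include <linux/netfs.h>

    /* Hypothetical transport helper: fills the iterator and returns the
     * number of bytes read or a negative error. */
    static ssize_t myfs_read_from_server(void *priv, unsigned long long pos,
                                         struct iov_iter *iter);

    static void myfs_issue_read(struct netfs_io_subrequest *subreq)
    {
        struct netfs_io_request *rreq = subreq->rreq;
        unsigned long long pos = subreq->start + subreq->transferred;
        ssize_t got;
        int err = 0;

        got = myfs_read_from_server(rreq->netfs_priv, pos, &subreq->io_iter);
        if (got < 0) {
            err = got;
        } else {
            subreq->transferred += got;
            if (pos + got >= i_size_read(rreq->inode))
                __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
        }
        netfs_read_subreq_terminated(subreq, err, false);
    }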


@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/netfs.h>
#include <trace/events/netfs.h>
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
@ -242,9 +243,10 @@ static void afs_fetch_data_notify(struct afs_operation *op)
req->error = error;
if (subreq) {
if (subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
subreq->rreq->i_size = req->file_size;
if (req->pos + req->actual_len >= req->file_size)
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
netfs_read_subreq_terminated(subreq, error, false);
req->subreq = NULL;
} else if (req->done) {
req->done(req);
@ -262,6 +264,12 @@ static void afs_fetch_data_success(struct afs_operation *op)
afs_fetch_data_notify(op);
}
static void afs_fetch_data_aborted(struct afs_operation *op)
{
afs_check_for_remote_deletion(op);
afs_fetch_data_notify(op);
}
static void afs_fetch_data_put(struct afs_operation *op)
{
op->fetch.req->error = afs_op_error(op);
@ -272,7 +280,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = {
.issue_afs_rpc = afs_fs_fetch_data,
.issue_yfs_rpc = yfs_fs_fetch_data,
.success = afs_fetch_data_success,
.aborted = afs_check_for_remote_deletion,
.aborted = afs_fetch_data_aborted,
.failed = afs_fetch_data_notify,
.put = afs_fetch_data_put,
};
@ -294,7 +302,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
op = afs_alloc_operation(req->key, vnode->volume);
if (IS_ERR(op)) {
if (req->subreq)
netfs_subreq_terminated(req->subreq, PTR_ERR(op), false);
netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false);
return PTR_ERR(op);
}
@ -305,14 +313,15 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
return afs_do_sync_operation(op);
}
static void afs_issue_read(struct netfs_io_subrequest *subreq)
static void afs_read_worker(struct work_struct *work)
{
struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work);
struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
struct afs_read *fsreq;
fsreq = afs_alloc_read(GFP_NOFS);
if (!fsreq)
return netfs_subreq_terminated(subreq, -ENOMEM, false);
return netfs_read_subreq_terminated(subreq, -ENOMEM, false);
fsreq->subreq = subreq;
fsreq->pos = subreq->start + subreq->transferred;
@ -321,10 +330,17 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->vnode = vnode;
fsreq->iter = &subreq->io_iter;
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
}
static void afs_issue_read(struct netfs_io_subrequest *subreq)
{
INIT_WORK(&subreq->work, afs_read_worker);
queue_work(system_long_wq, &subreq->work);
}
static int afs_symlink_read_folio(struct file *file, struct folio *folio)
{
struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host);


@ -304,6 +304,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
size_t count_before;
int ret;
_enter("{%u,%zu,%zu/%llu}",
@ -345,10 +346,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
iov_iter_count(call->iter), req->actual_len);
count_before = call->iov_len;
_debug("extract data %zu/%llu", count_before, req->actual_len);
ret = afs_extract_data(call, true);
if (req->subreq) {
req->subreq->transferred += count_before - call->iov_len;
netfs_read_subreq_progress(req->subreq, false);
}
if (ret < 0)
return ret;


@ -89,10 +89,12 @@ static const struct afs_operation_ops afs_store_data_operation = {
*/
void afs_prepare_write(struct netfs_io_subrequest *subreq)
{
struct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr];
//if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags))
// subreq->max_len = 512 * 1024;
//else
subreq->max_len = 256 * 1024 * 1024;
stream->sreq_max_len = 256 * 1024 * 1024;
}
/*


@ -355,6 +355,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
size_t count_before;
int ret;
_enter("{%u,%zu, %zu/%llu}",
@ -391,10 +392,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
iov_iter_count(call->iter), req->actual_len);
count_before = call->iov_len;
_debug("extract data %zu/%llu", count_before, req->actual_len);
ret = afs_extract_data(call, true);
if (req->subreq) {
req->subreq->transferred += count_before - call->iov_len;
netfs_read_subreq_progress(req->subreq, false);
}
if (ret < 0)
return ret;


@ -627,11 +627,12 @@ static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *wreq = subreq->rreq;
struct netfs_cache_resources *cres = &wreq->cache_resources;
struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
_enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start);
subreq->max_len = MAX_RW_COUNT;
subreq->max_nr_segs = BIO_MAX_VECS;
stream->sreq_max_len = MAX_RW_COUNT;
stream->sreq_max_segs = BIO_MAX_VECS;
if (!cachefiles_cres_file(cres)) {
if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
@ -647,6 +648,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
struct netfs_cache_resources *cres = &wreq->cache_resources;
struct cachefiles_object *object = cachefiles_cres_object(cres);
struct cachefiles_cache *cache = object->volume->cache;
struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
const struct cred *saved_cred;
size_t off, pre, post, len = subreq->len;
loff_t start = subreq->start;
@ -660,6 +662,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
if (off) {
pre = CACHEFILES_DIO_BLOCK_SIZE - off;
if (pre >= len) {
fscache_count_dio_misfit();
netfs_write_subrequest_terminated(subreq, len, false);
return;
}
@ -670,10 +673,22 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
}
/* We also need to end on the cache granularity boundary */
if (start + len == wreq->i_size) {
size_t part = len % CACHEFILES_DIO_BLOCK_SIZE;
size_t need = CACHEFILES_DIO_BLOCK_SIZE - part;
if (part && stream->submit_extendable_to >= need) {
len += need;
subreq->len += need;
subreq->io_iter.count += need;
}
}
post = len & (CACHEFILES_DIO_BLOCK_SIZE - 1);
if (post) {
len -= post;
if (len == 0) {
fscache_count_dio_misfit();
netfs_write_subrequest_terminated(subreq, post, false);
return;
}
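
A quick worked example of the DIO-block mask arithmetic used in the
cachefiles_issue_write() hunk above.  The 4096-byte block size is an
assumption standing in for CACHEFILES_DIO_BLOCK_SIZE, and only the
head/tail computation is shown, not what is then done with it:

    #include <stdio.h>

    #define DIO_BLOCK 4096ULL       /* stand-in for CACHEFILES_DIO_BLOCK_SIZE */

    int main(void)
    {
        unsigned long long start = 5000, i_size = 20000;
        unsigned long long len = 10000, off, pre, post;

        off = start & (DIO_BLOCK - 1);          /* 904: start not block-aligned */
        if (off) {
            pre = DIO_BLOCK - off;              /* 3192 bytes before the first full block */
            start += pre;                       /* 8192 */
            len -= pre;                         /* 6808 */
        }

        post = len & (DIO_BLOCK - 1);           /* 2712 trailing bytes */
        if (post && start + len != i_size)
            len -= post;                        /* trimmed to 4096; at EOF the tail could
                                                 * instead be padded out (submit_extendable_to) */

        printf("start=%llu len=%llu post=%llu\n", start, len, post);
        return 0;
    }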


@ -64,9 +64,15 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object)
memcpy(buf->data, fscache_get_aux(object->cookie), len);
ret = cachefiles_inject_write_error();
if (ret == 0)
ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache,
buf, sizeof(struct cachefiles_xattr) + len, 0);
if (ret == 0) {
ret = mnt_want_write_file(file);
if (ret == 0) {
ret = vfs_setxattr(&nop_mnt_idmap, dentry,
cachefiles_xattr_cache, buf,
sizeof(struct cachefiles_xattr) + len, 0);
mnt_drop_write_file(file);
}
}
if (ret < 0) {
trace_cachefiles_vfs_error(object, file_inode(file), ret,
cachefiles_trace_setxattr_error);
@ -151,8 +157,14 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
int ret;
ret = cachefiles_inject_remove_error();
if (ret == 0)
ret = vfs_removexattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache);
if (ret == 0) {
ret = mnt_want_write(cache->mnt);
if (ret == 0) {
ret = vfs_removexattr(&nop_mnt_idmap, dentry,
cachefiles_xattr_cache);
mnt_drop_write(cache->mnt);
}
}
if (ret < 0) {
trace_cachefiles_vfs_error(object, d_inode(dentry), ret,
cachefiles_trace_remxattr_error);
@ -208,9 +220,15 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
memcpy(buf->data, p, volume->vcookie->coherency_len);
ret = cachefiles_inject_write_error();
if (ret == 0)
ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache,
if (ret == 0) {
ret = mnt_want_write(volume->cache->mnt);
if (ret == 0) {
ret = vfs_setxattr(&nop_mnt_idmap, dentry,
cachefiles_xattr_cache,
buf, len, 0);
mnt_drop_write(volume->cache->mnt);
}
}
if (ret < 0) {
trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret,
cachefiles_trace_setxattr_error);


@ -13,6 +13,7 @@
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/netfs.h>
#include <trace/events/netfs.h>
#include "super.h"
#include "mds_client.h"
@ -205,21 +206,6 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
}
}
static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
struct inode *inode = subreq->rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
u32 xlen;
/* Truncate the extent at the end of the current block */
ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
&objno, &objoff, &xlen);
subreq->len = min(xlen, fsc->mount_options->rsize);
return true;
}
static void finish_netfs_read(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
@ -264,7 +250,12 @@ static void finish_netfs_read(struct ceph_osd_request *req)
calc_pages_for(osd_data->alignment,
osd_data->length), false);
}
netfs_subreq_terminated(subreq, err, false);
if (err > 0) {
subreq->transferred = err;
err = 0;
}
trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
netfs_read_subreq_terminated(subreq, err, false);
iput(req->r_inode);
ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
@ -278,7 +269,6 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
struct iov_iter iter;
ssize_t err = 0;
size_t len;
int mode;
@ -301,6 +291,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
req->r_num_caps = 2;
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0)
goto out;
@ -314,17 +305,36 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
}
len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
if (err == 0)
err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter);
if (err == 0) {
err = -EFAULT;
} else {
subreq->transferred += err;
err = 0;
}
ceph_mdsc_put_request(req);
out:
netfs_subreq_terminated(subreq, err, false);
netfs_read_subreq_terminated(subreq, err, false);
return true;
}
static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
u64 objno, objoff;
u32 xlen;
/* Truncate the extent at the end of the current block */
ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
&objno, &objoff, &xlen);
rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize);
return 0;
}
static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
@ -334,9 +344,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
struct iov_iter iter;
int err = 0;
u64 len = subreq->len;
int err;
u64 len;
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 off = subreq->start;
int extent_cnt;
@ -349,6 +358,12 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
return;
// TODO: This rounding here is slightly dodgy. It *should* work, for
// now, as the cache only deals in blocks that are a multiple of
// PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to
// happen is for the fscrypt driving to be moved into netfslib and the
// data in the cache also to be stored encrypted.
len = subreq->len;
ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
@ -371,8 +386,6 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
ceph_vinop(inode), subreq->start, subreq->len, len);
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
/*
* FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
* encrypted inodes. We'd need infrastructure that handles an iov_iter
@ -384,7 +397,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct page **pages;
size_t page_off;
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
if (err < 0) {
doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
ceph_vinop(inode), err);
@ -399,7 +412,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
false);
} else {
osd_req_op_extent_osd_iter(req, 0, &iter);
osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter);
}
if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
err = -EIO;
@ -410,17 +423,19 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
req->r_inode = inode;
ihold(inode);
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
ceph_osdc_start_request(req->r_osdc, req);
out:
ceph_osdc_put_request(req);
if (err)
netfs_subreq_terminated(subreq, err, false);
netfs_read_subreq_terminated(subreq, err, false);
doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}
static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct inode *inode = rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = ceph_inode_to_client(inode);
int got = 0, want = CEPH_CAP_FILE_CACHE;
struct ceph_netfs_request_data *priv;
@ -472,6 +487,7 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
priv->caps = got;
rreq->netfs_priv = priv;
rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize;
out:
if (ret < 0)
@ -496,9 +512,9 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
.free_request = ceph_netfs_free_request,
.prepare_read = ceph_netfs_prepare_read,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
};
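
The ceph hunks above illustrate the shape of the interface change: the old
->clamp_length() hook that trimmed subreq->len directly is replaced by
->prepare_read(), which caps the stream via rreq->io_streams[0].sreq_max_len
and lets netfslib slice the subrequests to fit.  A minimal sketch of that
hook for a hypothetical filesystem (the myfs_* names and the 4MiB limit are
made up):

    #include <linux/netfs.h>

    #define MYFS_MAX_READ_BYTES     (4 * 1024 * 1024)   /* stand-in for a mount rsize */

    static int myfs_prepare_read(struct netfs_io_subrequest *subreq)
    {
        struct netfs_io_request *rreq = subreq->rreq;

        /* Never ask the server for more than this per subrequest. */
        rreq->io_streams[0].sreq_max_len = MYFS_MAX_READ_BYTES;
        return 0;
    }

    static const struct netfs_request_ops myfs_netfs_ops = {
        .prepare_read   = myfs_prepare_read,
        /* .issue_read is the one mandatory hook; omitted from this sketch. */
    };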


@ -5,12 +5,14 @@ netfs-y := \
buffered_write.o \
direct_read.o \
direct_write.o \
io.o \
iterator.o \
locking.o \
main.o \
misc.o \
objects.o \
read_collect.o \
read_pgpriv2.o \
read_retry.o \
write_collect.o \
write_issue.o


@ -9,214 +9,6 @@
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
/*
* [DEPRECATED] Unlock the folios in a read operation for when the filesystem
* is using PG_private_2 and direct writing to the cache from here rather than
* marking the page for writeback.
*
* Note that we don't touch folio->private in this code.
*/
static void netfs_rreq_unlock_folios_pgpriv2(struct netfs_io_request *rreq,
size_t *account)
{
struct netfs_io_subrequest *subreq;
struct folio *folio;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
bool subreq_failed = false;
XA_STATE(xas, &rreq->mapping->i_pages, start_page);
/* Walk through the pagecache and the I/O request lists simultaneously.
* We may have a mixture of cached and uncached sections and we only
* really want to write out the uncached sections. This is slightly
* complicated by the possibility that we might have huge pages with a
* mixture inside.
*/
subreq = list_first_entry(&rreq->subrequests,
struct netfs_io_subrequest, rreq_link);
subreq_failed = (subreq->error < 0);
trace_netfs_rreq(rreq, netfs_rreq_trace_unlock_pgpriv2);
rcu_read_lock();
xas_for_each(&xas, folio, last_page) {
loff_t pg_end;
bool pg_failed = false;
bool folio_started = false;
if (xas_retry(&xas, folio))
continue;
pg_end = folio_pos(folio) + folio_size(folio) - 1;
for (;;) {
loff_t sreq_end;
if (!subreq) {
pg_failed = true;
break;
}
if (!folio_started &&
test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags) &&
fscache_operation_valid(&rreq->cache_resources)) {
trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_start_private_2(folio);
folio_started = true;
}
pg_failed |= subreq_failed;
sreq_end = subreq->start + subreq->len - 1;
if (pg_end < sreq_end)
break;
*account += subreq->transferred;
if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
subreq = list_next_entry(subreq, rreq_link);
subreq_failed = (subreq->error < 0);
} else {
subreq = NULL;
subreq_failed = false;
}
if (pg_end == sreq_end)
break;
}
if (!pg_failed) {
flush_dcache_folio(folio);
folio_mark_uptodate(folio);
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
_debug("no unlock");
else
folio_unlock(folio);
}
}
rcu_read_unlock();
}
/*
* Unlock the folios in a read operation. We need to set PG_writeback on any
* folios we're going to write back before we unlock them.
*
* Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use
* PG_private_2 and do a direct write to the cache from here instead.
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_folio *finfo;
struct folio *folio;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
size_t account = 0;
bool subreq_failed = false;
XA_STATE(xas, &rreq->mapping->i_pages, start_page);
if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
}
}
/* Handle deprecated PG_private_2 case. */
if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
netfs_rreq_unlock_folios_pgpriv2(rreq, &account);
goto out;
}
/* Walk through the pagecache and the I/O request lists simultaneously.
* We may have a mixture of cached and uncached sections and we only
* really want to write out the uncached sections. This is slightly
* complicated by the possibility that we might have huge pages with a
* mixture inside.
*/
subreq = list_first_entry(&rreq->subrequests,
struct netfs_io_subrequest, rreq_link);
subreq_failed = (subreq->error < 0);
trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
rcu_read_lock();
xas_for_each(&xas, folio, last_page) {
loff_t pg_end;
bool pg_failed = false;
bool wback_to_cache = false;
if (xas_retry(&xas, folio))
continue;
pg_end = folio_pos(folio) + folio_size(folio) - 1;
for (;;) {
loff_t sreq_end;
if (!subreq) {
pg_failed = true;
break;
}
wback_to_cache |= test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
pg_failed |= subreq_failed;
sreq_end = subreq->start + subreq->len - 1;
if (pg_end < sreq_end)
break;
account += subreq->transferred;
if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
subreq = list_next_entry(subreq, rreq_link);
subreq_failed = (subreq->error < 0);
} else {
subreq = NULL;
subreq_failed = false;
}
if (pg_end == sreq_end)
break;
}
if (!pg_failed) {
flush_dcache_folio(folio);
finfo = netfs_folio_info(folio);
if (finfo) {
trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
if (finfo->netfs_group)
folio_change_private(folio, finfo->netfs_group);
else
folio_detach_private(folio);
kfree(finfo);
}
folio_mark_uptodate(folio);
if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
filemap_dirty_folio(folio->mapping, folio);
}
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
_debug("no unlock");
else
folio_unlock(folio);
}
}
rcu_read_unlock();
out:
task_io_account_read(account);
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
}
static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
unsigned long long *_start,
unsigned long long *_len,
@ -271,6 +63,336 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}
/*
* Decant the list of folios to read into a rolling buffer.
*/
static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
struct folio_queue *folioq)
{
unsigned int order, nr;
size_t size = 0;
nr = __readahead_batch(rreq->ractl, (struct page **)folioq->vec.folios,
ARRAY_SIZE(folioq->vec.folios));
folioq->vec.nr = nr;
for (int i = 0; i < nr; i++) {
struct folio *folio = folioq_folio(folioq, i);
trace_netfs_folio(folio, netfs_folio_trace_read);
order = folio_order(folio);
folioq->orders[i] = order;
size += PAGE_SIZE << order;
}
for (int i = nr; i < folioq_nr_slots(folioq); i++)
folioq_clear(folioq, i);
return size;
}
/*
* netfs_prepare_read_iterator - Prepare the subreq iterator for I/O
* @subreq: The subrequest to be set up
*
* Prepare the I/O iterator representing the read buffer on a subrequest for
* the filesystem to use for I/O (it can be passed directly to a socket). This
* is intended to be called from the ->issue_read() method once the filesystem
* has trimmed the request to the size it wants.
*
* Returns the limited size if successful and -ENOMEM if insufficient memory
* available.
*
* [!] NOTE: This must be run in the same thread as ->issue_read() was called
* in as we access the readahead_control struct.
*/
static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
size_t rsize = subreq->len;
if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER)
rsize = umin(rsize, rreq->io_streams[0].sreq_max_len);
if (rreq->ractl) {
/* If we don't have sufficient folios in the rolling buffer,
* extract a folioq's worth from the readahead region at a time
* into the buffer. Note that this acquires a ref on each page
* that we will need to release later - but we don't want to do
* that until after we've started the I/O.
*/
while (rreq->submitted < subreq->start + rsize) {
struct folio_queue *tail = rreq->buffer_tail, *new;
size_t added;
new = kmalloc(sizeof(*new), GFP_NOFS);
if (!new)
return -ENOMEM;
netfs_stat(&netfs_n_folioq);
folioq_init(new);
new->prev = tail;
tail->next = new;
rreq->buffer_tail = new;
added = netfs_load_buffer_from_ra(rreq, new);
rreq->iter.count += added;
rreq->submitted += added;
}
}
subreq->len = rsize;
if (unlikely(rreq->io_streams[0].sreq_max_segs)) {
size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize,
rreq->io_streams[0].sreq_max_segs);
if (limit < rsize) {
subreq->len = limit;
trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
}
}
subreq->io_iter = rreq->iter;
if (iov_iter_is_folioq(&subreq->io_iter)) {
if (subreq->io_iter.folioq_slot >= folioq_nr_slots(subreq->io_iter.folioq)) {
subreq->io_iter.folioq = subreq->io_iter.folioq->next;
subreq->io_iter.folioq_slot = 0;
}
subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq;
subreq->curr_folioq_slot = subreq->io_iter.folioq_slot;
subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot];
}
iov_iter_truncate(&subreq->io_iter, subreq->len);
iov_iter_advance(&rreq->iter, subreq->len);
return subreq->len;
}
static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq,
loff_t i_size)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
if (!cres->ops)
return NETFS_DOWNLOAD_FROM_SERVER;
return cres->ops->prepare_read(subreq, i_size);
}
static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
bool was_async)
{
struct netfs_io_subrequest *subreq = priv;
if (transferred_or_error < 0) {
netfs_read_subreq_terminated(subreq, transferred_or_error, was_async);
return;
}
if (transferred_or_error > 0)
subreq->transferred += transferred_or_error;
netfs_read_subreq_terminated(subreq, 0, was_async);
}
/*
* Issue a read against the cache.
* - Eats the caller's ref on subreq.
*/
static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
netfs_stat(&netfs_n_rh_read);
cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_IGNORE,
netfs_cache_read_terminated, subreq);
}
/*
* Perform a read to the pagecache from a series of sources of different types,
* slicing up the region to be read according to available cache blocks and
* network rsize.
*/
static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
{
struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned long long start = rreq->start;
ssize_t size = rreq->len;
int ret = 0;
atomic_inc(&rreq->nr_outstanding);
do {
struct netfs_io_subrequest *subreq;
enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
ssize_t slice;
subreq = netfs_alloc_subrequest(rreq);
if (!subreq) {
ret = -ENOMEM;
break;
}
subreq->start = start;
subreq->len = size;
atomic_inc(&rreq->nr_outstanding);
spin_lock_bh(&rreq->lock);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
subreq->prev_donated = rreq->prev_donated;
rreq->prev_donated = 0;
trace_netfs_sreq(subreq, netfs_sreq_trace_added);
spin_unlock_bh(&rreq->lock);
source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size);
subreq->source = source;
if (source == NETFS_DOWNLOAD_FROM_SERVER) {
unsigned long long zp = umin(ictx->zero_point, rreq->i_size);
size_t len = subreq->len;
if (subreq->start >= zp) {
subreq->source = source = NETFS_FILL_WITH_ZEROES;
goto fill_with_zeroes;
}
if (len > zp - subreq->start)
len = zp - subreq->start;
if (len == 0) {
pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx",
rreq->debug_id, subreq->debug_index,
subreq->len, size,
subreq->start, ictx->zero_point, rreq->i_size);
break;
}
subreq->len = len;
netfs_stat(&netfs_n_rh_download);
if (rreq->netfs_ops->prepare_read) {
ret = rreq->netfs_ops->prepare_read(subreq);
if (ret < 0) {
atomic_dec(&rreq->nr_outstanding);
netfs_put_subrequest(subreq, false,
netfs_sreq_trace_put_cancel);
break;
}
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
}
slice = netfs_prepare_read_iterator(subreq);
if (slice < 0) {
atomic_dec(&rreq->nr_outstanding);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
ret = slice;
break;
}
rreq->netfs_ops->issue_read(subreq);
goto done;
}
fill_with_zeroes:
if (source == NETFS_FILL_WITH_ZEROES) {
subreq->source = NETFS_FILL_WITH_ZEROES;
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
netfs_stat(&netfs_n_rh_zero);
slice = netfs_prepare_read_iterator(subreq);
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
netfs_read_subreq_terminated(subreq, 0, false);
goto done;
}
if (source == NETFS_READ_FROM_CACHE) {
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
slice = netfs_prepare_read_iterator(subreq);
netfs_read_cache_to_pagecache(rreq, subreq);
goto done;
}
pr_err("Unexpected read source %u\n", source);
WARN_ON_ONCE(1);
break;
done:
size -= slice;
start += slice;
cond_resched();
} while (size > 0);
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_terminated(rreq, false);
/* Defer error return as we may need to wait for outstanding I/O. */
cmpxchg(&rreq->error, 0, ret);
}
/*
* Wait for the read operation to complete, successfully or otherwise.
*/
static int netfs_wait_for_read(struct netfs_io_request *rreq)
{
int ret;
trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
ret = rreq->error;
if (ret == 0 && rreq->submitted < rreq->len) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
return ret;
}
/*
* Set up the initial folioq of buffer folios in the rolling buffer and set the
* iterator to refer to it.
*/
static int netfs_prime_buffer(struct netfs_io_request *rreq)
{
struct folio_queue *folioq;
size_t added;
folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
if (!folioq)
return -ENOMEM;
netfs_stat(&netfs_n_folioq);
folioq_init(folioq);
rreq->buffer = folioq;
rreq->buffer_tail = folioq;
rreq->submitted = rreq->start;
iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0);
added = netfs_load_buffer_from_ra(rreq, folioq);
rreq->iter.count += added;
rreq->submitted += added;
return 0;
}
/*
* Drop the ref on each folio that we inherited from the VM readahead code. We
* still have the folio locks to pin the page until we complete the I/O.
*
* Note that we can't just release the batch in each queue struct as we use the
* occupancy count in other places.
*/
static void netfs_put_ra_refs(struct folio_queue *folioq)
{
struct folio_batch fbatch;
folio_batch_init(&fbatch);
while (folioq) {
for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) {
struct folio *folio = folioq_folio(folioq, slot);
if (!folio)
continue;
trace_netfs_folio(folio, netfs_folio_trace_read_put);
if (!folio_batch_add(&fbatch, folio))
folio_batch_release(&fbatch);
}
folioq = folioq->next;
}
folio_batch_release(&fbatch);
}
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@ -289,22 +411,17 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
void netfs_readahead(struct readahead_control *ractl)
{
struct netfs_io_request *rreq;
struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
struct netfs_inode *ictx = netfs_inode(ractl->mapping->host);
unsigned long long start = readahead_pos(ractl);
size_t size = readahead_length(ractl);
int ret;
_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
if (readahead_count(ractl) == 0)
return;
rreq = netfs_alloc_request(ractl->mapping, ractl->file,
readahead_pos(ractl),
readahead_length(ractl),
rreq = netfs_alloc_request(ractl->mapping, ractl->file, start, size,
NETFS_READAHEAD);
if (IS_ERR(rreq))
return;
ret = netfs_begin_cache_read(rreq, ctx);
ret = netfs_begin_cache_read(rreq, ictx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto cleanup_free;
@ -314,18 +431,15 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
/* Set up the output buffer */
iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
rreq->start, rreq->len);
rreq->ractl = ractl;
if (netfs_prime_buffer(rreq) < 0)
goto cleanup_free;
netfs_read_to_pagecache(rreq);
/* Drop the refs on the folios here rather than in the cache or
* filesystem. The locks will be dropped in netfs_rreq_unlock().
*/
while (readahead_folio(ractl))
;
/* Release the folio refs whilst we're waiting for the I/O. */
netfs_put_ra_refs(rreq->buffer);
netfs_begin_read(rreq, false);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
return;
cleanup_free:
@ -334,6 +448,117 @@ cleanup_free:
}
EXPORT_SYMBOL(netfs_readahead);
/*
* Create a rolling buffer with a single occupying folio.
*/
static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio)
{
struct folio_queue *folioq;
folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
if (!folioq)
return -ENOMEM;
netfs_stat(&netfs_n_folioq);
folioq_init(folioq);
folioq_append(folioq, folio);
BUG_ON(folioq_folio(folioq, 0) != folio);
BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio));
rreq->buffer = folioq;
rreq->buffer_tail = folioq;
rreq->submitted = rreq->start + rreq->len;
iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, rreq->len);
rreq->ractl = (struct readahead_control *)1UL;
return 0;
}
/*
* Read into gaps in a folio partially filled by a streaming write.
*/
static int netfs_read_gaps(struct file *file, struct folio *folio)
{
struct netfs_io_request *rreq;
struct address_space *mapping = folio->mapping;
struct netfs_folio *finfo = netfs_folio_info(folio);
struct netfs_inode *ctx = netfs_inode(mapping->host);
struct folio *sink = NULL;
struct bio_vec *bvec;
unsigned int from = finfo->dirty_offset;
unsigned int to = from + finfo->dirty_len;
unsigned int off = 0, i = 0;
size_t flen = folio_size(folio);
size_t nr_bvec = flen / PAGE_SIZE + 2;
size_t part;
int ret;
_enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file, folio_pos(folio), flen, NETFS_READ_GAPS);
if (IS_ERR(rreq)) {
ret = PTR_ERR(rreq);
goto alloc_error;
}
ret = netfs_begin_cache_read(rreq, ctx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto discard;
netfs_stat(&netfs_n_rh_read_folio);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_read_gaps);
/* Fiddle the buffer so that a gap at the beginning and/or a gap at the
* end get copied to, but the middle is discarded.
*/
ret = -ENOMEM;
bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
if (!bvec)
goto discard;
sink = folio_alloc(GFP_KERNEL, 0);
if (!sink) {
kfree(bvec);
goto discard;
}
trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
rreq->direct_bv = bvec;
rreq->direct_bv_count = nr_bvec;
if (from > 0) {
bvec_set_folio(&bvec[i++], folio, from, 0);
off = from;
}
while (off < to) {
part = min_t(size_t, to - off, PAGE_SIZE);
bvec_set_folio(&bvec[i++], sink, part, 0);
off += part;
}
if (to < flen)
bvec_set_folio(&bvec[i++], folio, flen - to, to);
iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
rreq->submitted = rreq->start + flen;
netfs_read_to_pagecache(rreq);
if (sink)
folio_put(sink);
ret = netfs_wait_for_read(rreq);
if (ret == 0) {
flush_dcache_folio(folio);
folio_mark_uptodate(folio);
}
folio_unlock(folio);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
discard:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
folio_unlock(folio);
return ret;
}
/**
* netfs_read_folio - Helper to manage a read_folio request
* @file: The file to read from
@ -353,9 +578,13 @@ int netfs_read_folio(struct file *file, struct folio *folio)
struct address_space *mapping = folio->mapping;
struct netfs_io_request *rreq;
struct netfs_inode *ctx = netfs_inode(mapping->host);
struct folio *sink = NULL;
int ret;
if (folio_test_dirty(folio)) {
trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
return netfs_read_gaps(file, folio);
}
_enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
@ -374,54 +603,12 @@ int netfs_read_folio(struct file *file, struct folio *folio)
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
/* Set up the output buffer */
if (folio_test_dirty(folio)) {
/* Handle someone trying to read from an unflushed streaming
* write. We fiddle the buffer so that a gap at the beginning
* and/or a gap at the end get copied to, but the middle is
* discarded.
*/
struct netfs_folio *finfo = netfs_folio_info(folio);
struct bio_vec *bvec;
unsigned int from = finfo->dirty_offset;
unsigned int to = from + finfo->dirty_len;
unsigned int off = 0, i = 0;
size_t flen = folio_size(folio);
size_t nr_bvec = flen / PAGE_SIZE + 2;
size_t part;
ret = -ENOMEM;
bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
if (!bvec)
ret = netfs_create_singular_buffer(rreq, folio);
if (ret < 0)
goto discard;
sink = folio_alloc(GFP_KERNEL, 0);
if (!sink)
goto discard;
trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
rreq->direct_bv = bvec;
rreq->direct_bv_count = nr_bvec;
if (from > 0) {
bvec_set_folio(&bvec[i++], folio, from, 0);
off = from;
}
while (off < to) {
part = min_t(size_t, to - off, PAGE_SIZE);
bvec_set_folio(&bvec[i++], sink, part, 0);
off += part;
}
if (to < flen)
bvec_set_folio(&bvec[i++], folio, flen - to, to);
iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
} else {
iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
rreq->start, rreq->len);
}
ret = netfs_begin_read(rreq, true);
if (sink)
folio_put(sink);
netfs_read_to_pagecache(rreq);
ret = netfs_wait_for_read(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
@ -494,13 +681,10 @@ zero_out:
*
* Pre-read data for a write-begin request by drawing data from the cache if
* possible, or the netfs if not. Space beyond the EOF is zero-filled.
* Multiple I/O requests from different sources will get munged together. If
* necessary, the readahead window can be expanded in either direction to a
* more convenient alighment for RPC efficiency or to make storage in the cache
* feasible.
* Multiple I/O requests from different sources will get munged together.
*
* The calling netfs must provide a table of operations, only one of which,
* issue_op, is mandatory.
* issue_read, is mandatory.
*
* The check_write_begin() operation can be provided to check for and flush
* conflicting writes once the folio is grabbed and locked. It is passed a
@ -528,8 +712,6 @@ int netfs_write_begin(struct netfs_inode *ctx,
pgoff_t index = pos >> PAGE_SHIFT;
int ret;
DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
retry:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
@ -577,22 +759,13 @@ retry:
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
/* Expand the request to meet caching requirements and download
* preferences.
*/
ractl._nr_pages = folio_nr_pages(folio);
netfs_rreq_expand(rreq, &ractl);
/* Set up the output buffer */
iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
rreq->start, rreq->len);
ret = netfs_create_singular_buffer(rreq, folio);
if (ret < 0)
goto error_put;
/* We hold the folio locks, so we can drop the references */
folio_get(folio);
while (readahead_folio(&ractl))
;
ret = netfs_begin_read(rreq, true);
netfs_read_to_pagecache(rreq);
ret = netfs_wait_for_read(rreq);
if (ret < 0)
goto error;
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
@ -652,10 +825,13 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
/* Set up the output buffer */
iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
rreq->start, rreq->len);
ret = netfs_create_singular_buffer(rreq, folio);
if (ret < 0)
goto error_put;
ret = netfs_begin_read(rreq, true);
folioq_mark2(rreq->buffer, 0);
netfs_read_to_pagecache(rreq);
ret = netfs_wait_for_read(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return ret;


@ -13,91 +13,22 @@
#include <linux/pagevec.h>
#include "internal.h"
/*
* Determined write method. Adjust netfs_folio_traces if this is changed.
*/
enum netfs_how_to_modify {
NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */
NETFS_JUST_PREFETCH, /* We have to read the folio anyway */
NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */
NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */
NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */
NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */
NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
};
static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
if (netfs_group)
folio_attach_private(folio, netfs_get_group(netfs_group));
}
static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
void *priv = folio_get_private(folio);
if (unlikely(priv != netfs_group)) {
if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
folio_attach_private(folio, netfs_get_group(netfs_group));
else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
folio_detach_private(folio);
}
/*
* Decide how we should modify a folio. We might be attempting to do
* write-streaming, in which case we don't want to a local RMW cycle if we can
* avoid it. If we're doing local caching or content crypto, we award that
* priority over avoiding RMW. If the file is open readably, then we also
* assume that we may want to read what we wrote.
*/
static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
struct file *file,
struct folio *folio,
void *netfs_group,
size_t flen,
size_t offset,
size_t len,
bool maybe_trouble)
{
struct netfs_folio *finfo = netfs_folio_info(folio);
struct netfs_group *group = netfs_folio_group(folio);
loff_t pos = folio_pos(folio);
_enter("");
if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
return NETFS_FLUSH_CONTENT;
if (folio_test_uptodate(folio))
return NETFS_FOLIO_IS_UPTODATE;
if (pos >= ctx->zero_point)
return NETFS_MODIFY_AND_CLEAR;
if (!maybe_trouble && offset == 0 && len >= flen)
return NETFS_WHOLE_FOLIO_MODIFY;
if (file->f_mode & FMODE_READ)
goto no_write_streaming;
if (netfs_is_cache_enabled(ctx)) {
/* We don't want to get a streaming write on a file that loses
* caching service temporarily because the backing store got
* culled.
*/
goto no_write_streaming;
}
if (!finfo)
return NETFS_STREAMING_WRITE;
/* We can continue a streaming write only if it continues on from the
* previous. If it overlaps, we must flush lest we suffer a partial
* copy and disjoint dirty regions.
*/
if (offset == finfo->dirty_offset + finfo->dirty_len)
return NETFS_STREAMING_WRITE_CONT;
return NETFS_FLUSH_CONTENT;
no_write_streaming:
if (finfo) {
netfs_stat(&netfs_n_wh_wstream_conflict);
return NETFS_FLUSH_CONTENT;
}
return NETFS_JUST_PREFETCH;
}
/*
@ -177,13 +108,10 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
.range_end = iocb->ki_pos + iter->count,
};
struct netfs_io_request *wreq = NULL;
struct netfs_folio *finfo;
struct folio *folio, *writethrough = NULL;
enum netfs_how_to_modify howto;
enum netfs_folio_trace trace;
struct folio *folio = NULL, *writethrough = NULL;
unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
ssize_t written = 0, ret, ret2;
loff_t i_size, pos = iocb->ki_pos, from, to;
loff_t i_size, pos = iocb->ki_pos;
size_t max_chunk = mapping_max_folio_size(mapping);
bool maybe_trouble = false;
@ -213,15 +141,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
do {
struct netfs_folio *finfo;
struct netfs_group *group;
unsigned long long fpos;
size_t flen;
size_t offset; /* Offset into pagecache folio */
size_t part; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
if (unlikely(ret < 0))
break;
offset = pos & (max_chunk - 1);
part = min(max_chunk - offset, iov_iter_count(iter));
@ -247,7 +174,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
flen = folio_size(folio);
offset = pos & (flen - 1);
fpos = folio_pos(folio);
offset = pos - fpos;
part = min_t(size_t, flen - offset, part);
/* Wait for writeback to complete. The writeback engine owns
@ -265,71 +193,52 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
goto error_folio_unlock;
}
/* See if we need to prefetch the area we're going to modify.
* We need to do this before we get a lock on the folio in case
* there's more than one writer competing for the same cache
* block.
/* Decide how we should modify a folio. We might be attempting
* to do write-streaming, in which case we don't want to a
* local RMW cycle if we can avoid it. If we're doing local
* caching or content crypto, we award that priority over
* avoiding RMW. If the file is open readably, then we also
* assume that we may want to read what we wrote.
*/
howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
flen, offset, part, maybe_trouble);
_debug("howto %u", howto);
switch (howto) {
case NETFS_JUST_PREFETCH:
ret = netfs_prefetch_for_write(file, folio, offset, part);
if (ret < 0) {
_debug("prefetch = %zd", ret);
goto error_folio_unlock;
}
break;
case NETFS_FOLIO_IS_UPTODATE:
case NETFS_WHOLE_FOLIO_MODIFY:
case NETFS_STREAMING_WRITE_CONT:
break;
case NETFS_MODIFY_AND_CLEAR:
zero_user_segment(&folio->page, 0, offset);
break;
case NETFS_STREAMING_WRITE:
ret = -EIO;
if (WARN_ON(folio_get_private(folio)))
goto error_folio_unlock;
break;
case NETFS_FLUSH_CONTENT:
trace_netfs_folio(folio, netfs_flush_content);
from = folio_pos(folio);
to = from + folio_size(folio) - 1;
folio_unlock(folio);
folio_put(folio);
ret = filemap_write_and_wait_range(mapping, from, to);
if (ret < 0)
goto error_folio_unlock;
continue;
}
finfo = netfs_folio_info(folio);
group = netfs_folio_group(folio);
if (unlikely(group != netfs_group) &&
group != NETFS_FOLIO_COPY_TO_CACHE)
goto flush_content;
if (folio_test_uptodate(folio)) {
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
flush_dcache_folio(folio);
/* Deal with a (partially) failed copy */
if (copied == 0) {
ret = -EFAULT;
goto error_folio_unlock;
if (unlikely(copied == 0))
goto copy_failed;
netfs_set_group(folio, netfs_group);
trace_netfs_folio(folio, netfs_folio_is_uptodate);
goto copied;
}
trace = (enum netfs_folio_trace)howto;
switch (howto) {
case NETFS_FOLIO_IS_UPTODATE:
case NETFS_JUST_PREFETCH:
netfs_set_group(folio, netfs_group);
break;
case NETFS_MODIFY_AND_CLEAR:
/* If the page is above the zero-point then we assume that the
* server would just return a block of zeros or a short read if
* we try to read it.
*/
if (fpos >= ctx->zero_point) {
zero_user_segment(&folio->page, 0, offset);
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
zero_user_segment(&folio->page, offset + copied, flen);
netfs_set_group(folio, netfs_group);
__netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
break;
case NETFS_WHOLE_FOLIO_MODIFY:
trace_netfs_folio(folio, netfs_modify_and_clear);
goto copied;
}
/* See if we can write a whole folio in one go. */
if (!maybe_trouble && offset == 0 && part >= flen) {
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
if (unlikely(copied < part)) {
maybe_trouble = true;
iov_iter_revert(iter, copied);
@ -337,16 +246,53 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_unlock(folio);
goto retry;
}
netfs_set_group(folio, netfs_group);
__netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
break;
case NETFS_STREAMING_WRITE:
if (offset == 0 && copied == flen) {
netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
trace = netfs_streaming_filled_page;
break;
trace_netfs_folio(folio, netfs_whole_folio_modify);
goto copied;
}
/* We don't want to do a streaming write on a file that loses
* caching service temporarily because the backing store got
* culled and we don't really want to get a streaming write on
* a file that's open for reading as ->read_folio() then has to
* be able to flush it.
*/
if ((file->f_mode & FMODE_READ) ||
netfs_is_cache_enabled(ctx)) {
if (finfo) {
netfs_stat(&netfs_n_wh_wstream_conflict);
goto flush_content;
}
ret = netfs_prefetch_for_write(file, folio, offset, part);
if (ret < 0) {
_debug("prefetch = %zd", ret);
goto error_folio_unlock;
}
/* Note that copy-to-cache may have been set. */
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
netfs_set_group(folio, netfs_group);
trace_netfs_folio(folio, netfs_just_prefetch);
goto copied;
}
if (!finfo) {
ret = -EIO;
if (WARN_ON(folio_get_private(folio)))
goto error_folio_unlock;
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
if (offset == 0 && copied == flen) {
__netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
trace_netfs_folio(folio, netfs_streaming_filled_page);
goto copied;
}
finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
if (!finfo) {
iov_iter_revert(iter, copied);
@ -358,9 +304,18 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
finfo->dirty_len = copied;
folio_attach_private(folio, (void *)((unsigned long)finfo |
NETFS_FOLIO_INFO));
break;
case NETFS_STREAMING_WRITE_CONT:
finfo = netfs_folio_info(folio);
trace_netfs_folio(folio, netfs_streaming_write);
goto copied;
}
/* We can continue a streaming write only if it continues on
* from the previous. If it overlaps, we must flush lest we
* suffer a partial copy and disjoint dirty regions.
*/
if (offset == finfo->dirty_offset + finfo->dirty_len) {
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
finfo->dirty_len += copied;
if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
if (finfo->netfs_group)
@ -369,17 +324,25 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_detach_private(folio);
folio_mark_uptodate(folio);
kfree(finfo);
trace = netfs_streaming_cont_filled_page;
trace_netfs_folio(folio, netfs_streaming_cont_filled_page);
} else {
trace_netfs_folio(folio, netfs_streaming_write_cont);
}
break;
default:
WARN(true, "Unexpected modify type %u ix=%lx\n",
howto, folio->index);
ret = -EIO;
goto error_folio_unlock;
goto copied;
}
trace_netfs_folio(folio, trace);
/* Incompatible write; flush the folio and try again. */
flush_content:
trace_netfs_folio(folio, netfs_flush_content);
folio_unlock(folio);
folio_put(folio);
ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
if (ret < 0)
goto error_folio_unlock;
continue;
copied:
flush_dcache_folio(folio);
/* Update the inode size if we moved the EOF marker */
pos += copied;
@ -401,12 +364,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_put(folio);
folio = NULL;
ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
if (unlikely(ret < 0))
break;
cond_resched();
} while (iov_iter_count(iter));
out:
if (likely(written) && ctx->ops->post_modify)
if (likely(written)) {
/* Set indication that ctime and mtime got updated in case
* close is deferred.
*/
set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags);
if (unlikely(ctx->ops->post_modify))
ctx->ops->post_modify(inode);
}
if (unlikely(wreq)) {
ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
@ -421,6 +394,8 @@ out:
_leave(" = %zd [%zd]", written, ret);
return written ? written : ret;
copy_failed:
ret = -EFAULT;
error_folio_unlock:
folio_unlock(folio);
folio_put(folio);


@ -16,6 +16,143 @@
#include <linux/netfs.h>
#include "internal.h"
static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
size_t rsize;
rsize = umin(subreq->len, rreq->io_streams[0].sreq_max_len);
subreq->len = rsize;
if (unlikely(rreq->io_streams[0].sreq_max_segs)) {
size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize,
rreq->io_streams[0].sreq_max_segs);
if (limit < rsize) {
subreq->len = limit;
trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
}
}
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
subreq->io_iter = rreq->iter;
iov_iter_truncate(&subreq->io_iter, subreq->len);
iov_iter_advance(&rreq->iter, subreq->len);
}
/*
* Perform a read to a buffer from the server, slicing up the region to be read
* according to the network rsize.
*/
static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
{
unsigned long long start = rreq->start;
ssize_t size = rreq->len;
int ret = 0;
atomic_set(&rreq->nr_outstanding, 1);
do {
struct netfs_io_subrequest *subreq;
ssize_t slice;
subreq = netfs_alloc_subrequest(rreq);
if (!subreq) {
ret = -ENOMEM;
break;
}
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start;
subreq->len = size;
atomic_inc(&rreq->nr_outstanding);
spin_lock_bh(&rreq->lock);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
subreq->prev_donated = rreq->prev_donated;
rreq->prev_donated = 0;
trace_netfs_sreq(subreq, netfs_sreq_trace_added);
spin_unlock_bh(&rreq->lock);
netfs_stat(&netfs_n_rh_download);
if (rreq->netfs_ops->prepare_read) {
ret = rreq->netfs_ops->prepare_read(subreq);
if (ret < 0) {
atomic_dec(&rreq->nr_outstanding);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
break;
}
}
netfs_prepare_dio_read_iterator(subreq);
slice = subreq->len;
rreq->netfs_ops->issue_read(subreq);
size -= slice;
start += slice;
rreq->submitted += slice;
if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
break;
cond_resched();
} while (size > 0);
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_terminated(rreq, false);
return ret;
}
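The slicing loop above relies on the filesystem having filled in io_streams[0].sreq_max_len and sreq_max_segs. A minimal, hypothetical ->prepare_read() showing where those limits might come from (MYFS_RSIZE and MYFS_MAX_SEGS are placeholders, not real symbols):

static int myfs_prepare_read(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;

	/* Cap each subrequest by the negotiated transfer size and by the
	 * transport's scatter/gather limit; netfs_prepare_dio_read_iterator()
	 * then clamps subreq->len and the iterator accordingly.
	 */
	rreq->io_streams[0].sreq_max_len  = MYFS_RSIZE;
	rreq->io_streams[0].sreq_max_segs = MYFS_MAX_SEGS;
	return 0;
}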
/*
* Perform a read to an application buffer, bypassing the pagecache and the
* local disk cache.
*/
static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
{
int ret;
_enter("R=%x %llx-%llx",
rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
return -EIO;
}
// TODO: Use bounce buffer if requested
inode_dio_begin(rreq->inode);
ret = netfs_dispatch_unbuffered_reads(rreq);
if (!rreq->submitted) {
netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
inode_dio_end(rreq->inode);
ret = 0;
goto out;
}
if (sync) {
trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
ret = rreq->error;
if (ret == 0 && rreq->submitted < rreq->len &&
rreq->origin != NETFS_DIO_READ) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
} else {
ret = -EIOCBQUEUED;
}
out:
_leave(" = %d", ret);
return ret;
}
/**
* netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
* @iocb: The I/O control descriptor describing the read
@@ -31,7 +168,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
struct netfs_io_request *rreq;
ssize_t ret;
size_t orig_count = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
bool sync = is_sync_kiocb(iocb);
_enter("");
@@ -78,13 +215,13 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
// TODO: Set up bounce buffer if needed
if (async)
if (!sync)
rreq->iocb = iocb;
ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
ret = netfs_unbuffered_read(rreq, sync);
if (ret < 0)
goto out; /* May be -EIOCBQUEUED */
if (!async) {
if (sync) {
// TODO: Copy from bounce buffer
iocb->ki_pos += rreq->transferred;
ret = rreq->transferred;
@@ -94,8 +231,6 @@ out:
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
if (ret > 0)
orig_count -= ret;
if (ret != -EIOCBQUEUED)
iov_iter_revert(iter, orig_count - iov_iter_count(iter));
return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked);
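A sketch of how a filesystem's ->read_iter() might wrap the locked variant, assuming it uses the netfs direct-I/O locking helpers; the myfs_* name is illustrative:

static ssize_t myfs_direct_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(iter))
		return 0;

	ret = netfs_start_io_direct(inode);	/* serialise against buffered I/O */
	if (ret < 0)
		return ret;
	ret = netfs_unbuffered_read_iter_locked(iocb, iter);
	netfs_end_io_direct(inode);
	return ret;
}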


@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/folio_queue.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
#include <linux/fscache-cache.h>
@@ -22,15 +23,9 @@
/*
* buffered_read.c
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len);
/*
* io.c
*/
int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
/*
* main.c
*/
@@ -63,6 +58,11 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
/*
* misc.c
*/
int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
bool needs_put);
struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq);
void netfs_clear_buffer(struct netfs_io_request *rreq);
void netfs_reset_iter(struct netfs_io_subrequest *subreq);
/*
* objects.c
@@ -83,6 +83,28 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what);
}
/*
* read_collect.c
*/
void netfs_read_termination_worker(struct work_struct *work);
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async);
/*
* read_pgpriv2.c
*/
void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq,
struct folio_queue *folioq,
int slot);
void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq);
bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq);
/*
* read_retry.c
*/
void netfs_retry_reads(struct netfs_io_request *rreq);
void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq);
/*
* stats.c
*/
@@ -110,6 +132,7 @@ extern atomic_t netfs_n_wh_buffered_write;
extern atomic_t netfs_n_wh_writethrough;
extern atomic_t netfs_n_wh_dio_write;
extern atomic_t netfs_n_wh_writepages;
extern atomic_t netfs_n_wh_copy_to_cache;
extern atomic_t netfs_n_wh_wstream_conflict;
extern atomic_t netfs_n_wh_upload;
extern atomic_t netfs_n_wh_upload_done;
@@ -117,6 +140,9 @@ extern atomic_t netfs_n_wh_upload_failed;
extern atomic_t netfs_n_wh_write;
extern atomic_t netfs_n_wh_write_done;
extern atomic_t netfs_n_wh_write_failed;
extern atomic_t netfs_n_wb_lock_skip;
extern atomic_t netfs_n_wb_lock_wait;
extern atomic_t netfs_n_folioq;
int netfs_stats_show(struct seq_file *m, void *v);
@@ -150,7 +176,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
loff_t start,
enum netfs_io_origin origin);
void netfs_reissue_write(struct netfs_io_stream *stream,
struct netfs_io_subrequest *subreq);
struct netfs_io_subrequest *subreq,
struct iov_iter *source);
void netfs_issue_write(struct netfs_io_request *wreq,
struct netfs_io_stream *stream);
int netfs_advance_write(struct netfs_io_request *wreq,
struct netfs_io_stream *stream,
loff_t start, size_t len, bool to_eof);


@@ -1,804 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level read support.
*
* Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/module.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/sched/mm.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
/*
* Clear the unread part of an I/O request.
*/
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
}
static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
bool was_async)
{
struct netfs_io_subrequest *subreq = priv;
netfs_subreq_terminated(subreq, transferred_or_error, was_async);
}
/*
* Issue a read against the cache.
* - Eats the caller's ref on subreq.
*/
static void netfs_read_from_cache(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq,
enum netfs_read_from_hole read_hole)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
netfs_stat(&netfs_n_rh_read);
cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole,
netfs_cache_read_terminated, subreq);
}
/*
* Fill a subrequest region with zeroes.
*/
static void netfs_fill_with_zeroes(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
netfs_stat(&netfs_n_rh_zero);
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
netfs_subreq_terminated(subreq, 0, false);
}
/*
* Ask the netfs to issue a read request to the server for us.
*
* The netfs is expected to read from subreq->pos + subreq->transferred to
* subreq->pos + subreq->len - 1. It may not backtrack and write data into the
* buffer prior to the transferred point as it might clobber dirty data
* obtained from the cache.
*
* Alternatively, the netfs is allowed to indicate one of two things:
*
* - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
* make progress.
*
* - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
* cleared.
*/
static void netfs_read_from_server(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
netfs_stat(&netfs_n_rh_download);
if (rreq->origin != NETFS_DIO_READ &&
iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
rreq->debug_id, subreq->debug_index,
iov_iter_count(&subreq->io_iter), subreq->len,
subreq->transferred, subreq->flags);
rreq->netfs_ops->issue_read(subreq);
}
/*
* Release those waiting.
*/
static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
{
trace_netfs_rreq(rreq, netfs_rreq_trace_done);
netfs_clear_subrequests(rreq, was_async);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete);
}
/*
* [DEPRECATED] Deal with the completion of writing the data to the cache. We
* have to clear the PG_fscache bits on the folios involved and release the
* caller's ref.
*
* May be called in softirq mode and we inherit a ref from the caller.
*/
static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
bool was_async)
{
struct netfs_io_subrequest *subreq;
struct folio *folio;
pgoff_t unlocked = 0;
bool have_unlocked = false;
rcu_read_lock();
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
if (xas_retry(&xas, folio))
continue;
/* We might have multiple writes from the same huge
* folio, but we mustn't unlock a folio more than once.
*/
if (have_unlocked && folio->index <= unlocked)
continue;
unlocked = folio_next_index(folio) - 1;
trace_netfs_folio(folio, netfs_folio_trace_end_copy);
folio_end_private_2(folio);
have_unlocked = true;
}
}
rcu_read_unlock();
netfs_rreq_completed(rreq, was_async);
}
static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
bool was_async) /* [DEPRECATED] */
{
struct netfs_io_subrequest *subreq = priv;
struct netfs_io_request *rreq = subreq->rreq;
if (IS_ERR_VALUE(transferred_or_error)) {
netfs_stat(&netfs_n_rh_write_failed);
trace_netfs_failure(rreq, subreq, transferred_or_error,
netfs_fail_copy_to_cache);
} else {
netfs_stat(&netfs_n_rh_write_done);
}
trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
/* If we decrement nr_copy_ops to 0, the ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_copy_ops))
netfs_rreq_unmark_after_write(rreq, was_async);
netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
}
/*
* [DEPRECATED] Perform any outstanding writes to the cache. We inherit a ref
* from the caller.
*/
static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
struct netfs_io_subrequest *subreq, *next, *p;
struct iov_iter iter;
int ret;
trace_netfs_rreq(rreq, netfs_rreq_trace_copy);
/* We don't want terminating writes trying to wake us up whilst we're
* still going through the list.
*/
atomic_inc(&rreq->nr_copy_ops);
list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
list_del_init(&subreq->rreq_link);
netfs_put_subrequest(subreq, false,
netfs_sreq_trace_put_no_copy);
}
}
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
/* Amalgamate adjacent writes */
while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
next = list_next_entry(subreq, rreq_link);
if (next->start != subreq->start + subreq->len)
break;
subreq->len += next->len;
list_del_init(&next->rreq_link);
netfs_put_subrequest(next, false,
netfs_sreq_trace_put_merged);
}
ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
subreq->len, rreq->i_size, true);
if (ret < 0) {
trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
continue;
}
iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages,
subreq->start, subreq->len);
atomic_inc(&rreq->nr_copy_ops);
netfs_stat(&netfs_n_rh_write);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache);
trace_netfs_sreq(subreq, netfs_sreq_trace_write);
cres->ops->write(cres, subreq->start, &iter,
netfs_rreq_copy_terminated, subreq);
}
/* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_copy_ops))
netfs_rreq_unmark_after_write(rreq, false);
}
static void netfs_rreq_write_to_cache_work(struct work_struct *work) /* [DEPRECATED] */
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
netfs_rreq_do_write_to_cache(rreq);
}
static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) /* [DEPRECATED] */
{
rreq->work.func = netfs_rreq_write_to_cache_work;
if (!queue_work(system_unbound_wq, &rreq->work))
BUG();
}
/*
* Handle a short read.
*/
static void netfs_rreq_short_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
__clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
__set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
netfs_stat(&netfs_n_rh_short_read);
trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read);
atomic_inc(&rreq->nr_outstanding);
if (subreq->source == NETFS_READ_FROM_CACHE)
netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR);
else
netfs_read_from_server(rreq, subreq);
}
/*
* Reset the subrequest iterator prior to resubmission.
*/
static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
size_t remaining = subreq->len - subreq->transferred;
size_t count = iov_iter_count(&subreq->io_iter);
if (count == remaining)
return;
_debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x",
rreq->debug_id, subreq->debug_index,
iov_iter_count(&subreq->io_iter), subreq->transferred,
subreq->len, rreq->i_size,
subreq->io_iter.iter_type);
if (count < remaining)
iov_iter_revert(&subreq->io_iter, remaining - count);
else
iov_iter_advance(&subreq->io_iter, count - remaining);
}
/*
* Resubmit any short or failed operations. Returns true if we got the rreq
* ref back.
*/
static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
WARN_ON(in_interrupt());
trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
/* We don't want terminating submissions trying to wake us up whilst
* we're still going through the list.
*/
atomic_inc(&rreq->nr_outstanding);
__clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
if (subreq->error) {
if (subreq->source != NETFS_READ_FROM_CACHE)
break;
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->error = 0;
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
netfs_stat(&netfs_n_rh_download_instead);
trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
atomic_inc(&rreq->nr_outstanding);
netfs_reset_subreq_iter(rreq, subreq);
netfs_read_from_server(rreq, subreq);
} else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
netfs_reset_subreq_iter(rreq, subreq);
netfs_rreq_short_read(rreq, subreq);
}
}
/* If we decrement nr_outstanding to 0, the usage ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_outstanding))
return true;
wake_up_var(&rreq->nr_outstanding);
return false;
}
/*
* Check to see if the data read is still valid.
*/
static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
if (!rreq->netfs_ops->is_still_valid ||
rreq->netfs_ops->is_still_valid(rreq))
return;
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
if (subreq->source == NETFS_READ_FROM_CACHE) {
subreq->error = -ESTALE;
__set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
}
}
}
/*
* Determine how much we can admit to having read from a DIO read.
*/
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
unsigned int i;
size_t transferred = 0;
for (i = 0; i < rreq->direct_bv_count; i++) {
flush_dcache_page(rreq->direct_bv[i].bv_page);
// TODO: cifs marks pages in the destination buffer
// dirty under some circumstances after a read. Do we
// need to do that too?
set_page_dirty(rreq->direct_bv[i].bv_page);
}
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
if (subreq->error || subreq->transferred == 0)
break;
transferred += subreq->transferred;
if (subreq->transferred < subreq->len ||
test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags))
break;
}
for (i = 0; i < rreq->direct_bv_count; i++)
flush_dcache_page(rreq->direct_bv[i].bv_page);
rreq->transferred = transferred;
task_io_account_read(transferred);
if (rreq->iocb) {
rreq->iocb->ki_pos += transferred;
if (rreq->iocb->ki_complete)
rreq->iocb->ki_complete(
rreq->iocb, rreq->error ? rreq->error : transferred);
}
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
inode_dio_end(rreq->inode);
}
/*
* Assess the state of a read request and decide what to do next.
*
* Note that we could be in an ordinary kernel thread, on a workqueue or in
* softirq context at this point. We inherit a ref from the caller.
*/
static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async)
{
trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
again:
netfs_rreq_is_still_valid(rreq);
if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
if (netfs_rreq_perform_resubmissions(rreq))
goto again;
return;
}
if (rreq->origin != NETFS_DIO_READ)
netfs_rreq_unlock_folios(rreq);
else
netfs_rreq_assess_dio(rreq);
trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags) &&
test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags))
return netfs_rreq_write_to_cache(rreq);
netfs_rreq_completed(rreq, was_async);
}
static void netfs_rreq_work(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
netfs_rreq_assess(rreq, false);
}
/*
* Handle the completion of all outstanding I/O operations on a read request.
* We inherit a ref from the caller.
*/
static void netfs_rreq_terminated(struct netfs_io_request *rreq,
bool was_async)
{
if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
was_async) {
if (!queue_work(system_unbound_wq, &rreq->work))
BUG();
} else {
netfs_rreq_assess(rreq, was_async);
}
}
/**
* netfs_subreq_terminated - Note the termination of an I/O operation.
* @subreq: The I/O request that has terminated.
* @transferred_or_error: The amount of data transferred or an error code.
* @was_async: The termination was asynchronous
*
* This tells the read helper that a contributory I/O operation has terminated,
* one way or another, and that it should integrate the results.
*
* The caller indicates in @transferred_or_error the outcome of the operation,
* supplying a positive value to indicate the number of bytes transferred, 0 to
* indicate a failure to transfer anything that should be retried or a negative
* error code. The helper will look after reissuing I/O operations as
* appropriate and writing downloaded data to the cache.
*
* If @was_async is true, the caller might be running in softirq or interrupt
* context and we can't sleep.
*/
void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
ssize_t transferred_or_error,
bool was_async)
{
struct netfs_io_request *rreq = subreq->rreq;
int u;
_enter("R=%x[%x]{%llx,%lx},%zd",
rreq->debug_id, subreq->debug_index,
subreq->start, subreq->flags, transferred_or_error);
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
netfs_stat(&netfs_n_rh_read_done);
break;
case NETFS_DOWNLOAD_FROM_SERVER:
netfs_stat(&netfs_n_rh_download_done);
break;
default:
break;
}
if (IS_ERR_VALUE(transferred_or_error)) {
subreq->error = transferred_or_error;
trace_netfs_failure(rreq, subreq, transferred_or_error,
netfs_fail_read);
goto failed;
}
if (WARN(transferred_or_error > subreq->len - subreq->transferred,
"Subreq overread: R%x[%x] %zd > %zu - %zu",
rreq->debug_id, subreq->debug_index,
transferred_or_error, subreq->len, subreq->transferred))
transferred_or_error = subreq->len - subreq->transferred;
subreq->error = 0;
subreq->transferred += transferred_or_error;
if (subreq->transferred < subreq->len &&
!test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags))
goto incomplete;
complete:
__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
out:
trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
/* If we decrement nr_outstanding to 0, the ref belongs to us. */
u = atomic_dec_return(&rreq->nr_outstanding);
if (u == 0)
netfs_rreq_terminated(rreq, was_async);
else if (u == 1)
wake_up_var(&rreq->nr_outstanding);
netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
return;
incomplete:
if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
netfs_clear_unread(subreq);
subreq->transferred = subreq->len;
goto complete;
}
if (transferred_or_error == 0) {
if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
if (rreq->origin != NETFS_DIO_READ)
subreq->error = -ENODATA;
goto failed;
}
} else {
__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
}
__set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
goto out;
failed:
if (subreq->source == NETFS_READ_FROM_CACHE) {
netfs_stat(&netfs_n_rh_read_failed);
set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
} else {
netfs_stat(&netfs_n_rh_download_failed);
set_bit(NETFS_RREQ_FAILED, &rreq->flags);
rreq->error = subreq->error;
}
goto out;
}
EXPORT_SYMBOL(netfs_subreq_terminated);
static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq,
loff_t i_size)
{
struct netfs_io_request *rreq = subreq->rreq;
struct netfs_cache_resources *cres = &rreq->cache_resources;
if (cres->ops)
return cres->ops->prepare_read(subreq, i_size);
if (subreq->start >= rreq->i_size)
return NETFS_FILL_WITH_ZEROES;
return NETFS_DOWNLOAD_FROM_SERVER;
}
/*
* Work out what sort of subrequest the next one will be.
*/
static enum netfs_io_source
netfs_rreq_prepare_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq,
struct iov_iter *io_iter)
{
enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
struct netfs_inode *ictx = netfs_inode(rreq->inode);
size_t lsize;
_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
if (rreq->origin != NETFS_DIO_READ) {
source = netfs_cache_prepare_read(subreq, rreq->i_size);
if (source == NETFS_INVALID_READ)
goto out;
}
if (source == NETFS_DOWNLOAD_FROM_SERVER) {
/* Call out to the netfs to let it shrink the request to fit
* its own I/O sizes and boundaries. If it shrinks it here, it
* will be called again to make simultaneous calls; if it wants
* to make serial calls, it can indicate a short read and then
* we will call it again.
*/
if (rreq->origin != NETFS_DIO_READ) {
if (subreq->start >= ictx->zero_point) {
source = NETFS_FILL_WITH_ZEROES;
goto set;
}
if (subreq->len > ictx->zero_point - subreq->start)
subreq->len = ictx->zero_point - subreq->start;
/* We limit buffered reads to the EOF, but let the
* server deal with larger-than-EOF DIO/unbuffered
* reads.
*/
if (subreq->len > rreq->i_size - subreq->start)
subreq->len = rreq->i_size - subreq->start;
}
if (rreq->rsize && subreq->len > rreq->rsize)
subreq->len = rreq->rsize;
if (rreq->netfs_ops->clamp_length &&
!rreq->netfs_ops->clamp_length(subreq)) {
source = NETFS_INVALID_READ;
goto out;
}
if (subreq->max_nr_segs) {
lsize = netfs_limit_iter(io_iter, 0, subreq->len,
subreq->max_nr_segs);
if (subreq->len > lsize) {
subreq->len = lsize;
trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
}
}
}
set:
if (subreq->len > rreq->len)
pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n",
rreq->debug_id, subreq->debug_index,
subreq->len, rreq->len);
if (WARN_ON(subreq->len == 0)) {
source = NETFS_INVALID_READ;
goto out;
}
subreq->source = source;
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
subreq->io_iter = *io_iter;
iov_iter_truncate(&subreq->io_iter, subreq->len);
iov_iter_advance(io_iter, subreq->len);
out:
subreq->source = source;
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
return source;
}
/*
* Slice off a piece of a read request and submit an I/O request for it.
*/
static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
struct iov_iter *io_iter)
{
struct netfs_io_subrequest *subreq;
enum netfs_io_source source;
subreq = netfs_alloc_subrequest(rreq);
if (!subreq)
return false;
subreq->start = rreq->start + rreq->submitted;
subreq->len = io_iter->count;
_debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
/* Call out to the cache to find out what it can do with the remaining
* subset. It tells us in subreq->flags what it decided should be done
* and adjusts subreq->len down if the subset crosses a cache boundary.
*
* Then when we hand the subset, it can choose to take a subset of that
* (the starts must coincide), in which case, we go around the loop
* again and ask it to download the next piece.
*/
source = netfs_rreq_prepare_read(rreq, subreq, io_iter);
if (source == NETFS_INVALID_READ)
goto subreq_failed;
atomic_inc(&rreq->nr_outstanding);
rreq->submitted += subreq->len;
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
switch (source) {
case NETFS_FILL_WITH_ZEROES:
netfs_fill_with_zeroes(rreq, subreq);
break;
case NETFS_DOWNLOAD_FROM_SERVER:
netfs_read_from_server(rreq, subreq);
break;
case NETFS_READ_FROM_CACHE:
netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE);
break;
default:
BUG();
}
return true;
subreq_failed:
rreq->error = subreq->error;
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed);
return false;
}
/*
* Begin the process of reading in a chunk of data, where that data may be
* stitched together from multiple sources, including multiple servers and the
* local cache.
*/
int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
{
struct iov_iter io_iter;
int ret;
_enter("R=%x %llx-%llx",
rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
return -EIO;
}
if (rreq->origin == NETFS_DIO_READ)
inode_dio_begin(rreq->inode);
// TODO: Use bounce buffer if requested
rreq->io_iter = rreq->iter;
INIT_WORK(&rreq->work, netfs_rreq_work);
/* Chop the read into slices according to what the cache and the netfs
* want and submit each one.
*/
netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
atomic_set(&rreq->nr_outstanding, 1);
io_iter = rreq->io_iter;
do {
_debug("submit %llx + %llx >= %llx",
rreq->start, rreq->submitted, rreq->i_size);
if (!netfs_rreq_submit_slice(rreq, &io_iter))
break;
if (test_bit(NETFS_SREQ_NO_PROGRESS, &rreq->flags))
break;
if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
break;
} while (rreq->submitted < rreq->len);
if (!rreq->submitted) {
netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
if (rreq->origin == NETFS_DIO_READ)
inode_dio_end(rreq->inode);
ret = 0;
goto out;
}
if (sync) {
/* Keep nr_outstanding incremented so that the ref always
* belongs to us, and the service code isn't punted off to a
* random thread pool to process. Note that this might start
* further work, such as writing to the cache.
*/
wait_var_event(&rreq->nr_outstanding,
atomic_read(&rreq->nr_outstanding) == 1);
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
ret = rreq->error;
if (ret == 0) {
if (rreq->origin == NETFS_DIO_READ) {
ret = rreq->transferred;
} else if (rreq->submitted < rreq->len) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
}
} else {
/* If we decrement nr_outstanding to 0, the ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
ret = -EIOCBQUEUED;
}
out:
return ret;
}


@@ -188,9 +188,59 @@ static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offse
return min(span, max_size);
}
/*
* Select the span of a folio queue iterator we're going to use. Limit it by
* both maximum size and maximum number of segments. Returns the size of the
* span in bytes.
*/
static size_t netfs_limit_folioq(const struct iov_iter *iter, size_t start_offset,
size_t max_size, size_t max_segs)
{
const struct folio_queue *folioq = iter->folioq;
unsigned int nsegs = 0;
unsigned int slot = iter->folioq_slot;
size_t span = 0, n = iter->count;
if (WARN_ON(!iov_iter_is_folioq(iter)) ||
WARN_ON(start_offset > n) ||
n == 0)
return 0;
max_size = umin(max_size, n - start_offset);
if (slot >= folioq_nr_slots(folioq)) {
folioq = folioq->next;
slot = 0;
}
start_offset += iter->iov_offset;
do {
size_t flen = folioq_folio_size(folioq, slot);
if (start_offset < flen) {
span += flen - start_offset;
nsegs++;
start_offset = 0;
} else {
start_offset -= flen;
}
if (span >= max_size || nsegs >= max_segs)
break;
slot++;
if (slot >= folioq_nr_slots(folioq)) {
folioq = folioq->next;
slot = 0;
}
} while (folioq);
return umin(span, max_size);
}
size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
size_t max_size, size_t max_segs)
{
if (iov_iter_is_folioq(iter))
return netfs_limit_folioq(iter, start_offset, max_size, max_segs);
if (iov_iter_is_bvec(iter))
return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
if (iov_iter_is_xarray(iter))


@@ -36,13 +36,14 @@ DEFINE_SPINLOCK(netfs_proc_lock);
static const char *netfs_origins[nr__netfs_io_origin] = {
[NETFS_READAHEAD] = "RA",
[NETFS_READPAGE] = "RP",
[NETFS_READ_GAPS] = "RG",
[NETFS_READ_FOR_WRITE] = "RW",
[NETFS_COPY_TO_CACHE] = "CC",
[NETFS_DIO_READ] = "DR",
[NETFS_WRITEBACK] = "WB",
[NETFS_WRITETHROUGH] = "WT",
[NETFS_UNBUFFERED_WRITE] = "UW",
[NETFS_DIO_READ] = "DR",
[NETFS_DIO_WRITE] = "DW",
[NETFS_PGPRIV2_COPY_TO_CACHE] = "2C",
};
/*
@@ -62,7 +63,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
rreq = list_entry(v, struct netfs_io_request, proc_link);
seq_printf(m,
"%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx",
"%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx",
rreq->debug_id,
netfs_origins[rreq->origin],
refcount_read(&rreq->ref),


@@ -8,6 +8,100 @@
#include <linux/swap.h>
#include "internal.h"
/*
* Append a folio to the rolling queue.
*/
int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
bool needs_put)
{
struct folio_queue *tail = rreq->buffer_tail;
unsigned int slot, order = folio_order(folio);
if (WARN_ON_ONCE(!rreq->buffer && tail) ||
WARN_ON_ONCE(rreq->buffer && !tail))
return -EIO;
if (!tail || folioq_full(tail)) {
tail = kmalloc(sizeof(*tail), GFP_NOFS);
if (!tail)
return -ENOMEM;
netfs_stat(&netfs_n_folioq);
folioq_init(tail);
tail->prev = rreq->buffer_tail;
if (tail->prev)
tail->prev->next = tail;
rreq->buffer_tail = tail;
if (!rreq->buffer) {
rreq->buffer = tail;
iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0);
}
rreq->buffer_tail_slot = 0;
}
rreq->io_iter.count += PAGE_SIZE << order;
slot = folioq_append(tail, folio);
/* Store the counter after setting the slot. */
smp_store_release(&rreq->buffer_tail_slot, slot);
return 0;
}
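Taken together with netfs_clear_buffer() below, the intended pattern for the internal callers is roughly the following (condensed and illustrative; compare netfs_pgpriv2_copy_folio() later in this series):

/* Stage folios on the request's rolling buffer before issuing I/O. */
static int example_stage_folios(struct netfs_io_request *wreq,
				struct folio **folios, unsigned int nr)
{
	int ret;

	for (unsigned int i = 0; i < nr; i++) {
		ret = netfs_buffer_append_folio(wreq, folios[i], false);
		if (ret < 0)
			return ret;	/* wreq->io_iter already covers 0..i-1 */
	}
	/* ... I/O is then issued against wreq->io_iter ... */
	return 0;
}

On teardown, netfs_clear_buffer(wreq) walks the whole queue, drops any folio references indicated by the first mark and frees each folio_queue segment in one pass.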
/*
* Delete the head of a rolling queue.
*/
struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq)
{
struct folio_queue *head = wreq->buffer, *next = head->next;
if (next)
next->prev = NULL;
netfs_stat_d(&netfs_n_folioq);
kfree(head);
wreq->buffer = next;
return next;
}
/*
* Clear out a rolling queue.
*/
void netfs_clear_buffer(struct netfs_io_request *rreq)
{
struct folio_queue *p;
while ((p = rreq->buffer)) {
rreq->buffer = p->next;
for (int slot = 0; slot < folioq_nr_slots(p); slot++) {
struct folio *folio = folioq_folio(p, slot);
if (!folio)
continue;
if (folioq_is_marked(p, slot)) {
trace_netfs_folio(folio, netfs_folio_trace_put);
folio_put(folio);
}
}
netfs_stat_d(&netfs_n_folioq);
kfree(p);
}
}
/*
* Reset the subrequest iterator to refer just to the region remaining to be
* read. The iterator may or may not have been advanced by socket ops or
* extraction ops to an extent that may or may not match the amount actually
* read.
*/
void netfs_reset_iter(struct netfs_io_subrequest *subreq)
{
struct iov_iter *io_iter = &subreq->io_iter;
size_t remain = subreq->len - subreq->transferred;
if (io_iter->count > remain)
iov_iter_advance(io_iter, io_iter->count - remain);
else if (io_iter->count < remain)
iov_iter_revert(io_iter, remain - io_iter->count);
iov_iter_truncate(&subreq->io_iter, remain);
}
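For instance (illustrative numbers): if subreq->len is 64 KiB, subreq->transferred is 16 KiB and a partial socket extraction left io_iter->count at 40 KiB, then remain is 48 KiB, the iterator is reverted by 8 KiB back to 48 KiB and the truncate is a no-op, leaving the iterator describing exactly the unread tail.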
/**
* netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
* @mapping: The mapping the folio belongs to.


@@ -36,7 +36,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
memset(rreq, 0, kmem_cache_size(cache));
rreq->start = start;
rreq->len = len;
rreq->upper_len = len;
rreq->origin = origin;
rreq->netfs_ops = ctx->ops;
rreq->mapping = mapping;
@@ -44,13 +43,23 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
rreq->wsize = INT_MAX;
rreq->io_streams[0].sreq_max_len = ULONG_MAX;
rreq->io_streams[0].sreq_max_segs = 0;
spin_lock_init(&rreq->lock);
INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
INIT_LIST_HEAD(&rreq->subrequests);
INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
if (origin == NETFS_READAHEAD ||
origin == NETFS_READPAGE ||
origin == NETFS_READ_GAPS ||
origin == NETFS_READ_FOR_WRITE ||
origin == NETFS_DIO_READ)
INIT_WORK(&rreq->work, netfs_read_termination_worker);
else
INIT_WORK(&rreq->work, netfs_write_collection_worker);
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
if (file && file->f_flags & O_NONBLOCK)
__set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
@@ -134,6 +143,7 @@ static void netfs_free_request(struct work_struct *work)
}
kvfree(rreq->direct_bv);
}
netfs_clear_buffer(rreq);
if (atomic_dec_and_test(&ictx->io_count))
wake_up_var(&ictx->io_count);
@@ -155,7 +165,7 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
if (was_async) {
rreq->work.func = netfs_free_request;
if (!queue_work(system_unbound_wq, &rreq->work))
BUG();
WARN_ON(1);
} else {
netfs_free_request(&rreq->work);
}

fs/netfs/read_collect.c (new file, 544 lines)

@@ -0,0 +1,544 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem read subrequest result collection, assessment and
* retrying.
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
/*
* Clear the unread part of an I/O request.
*/
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
netfs_reset_iter(subreq);
WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
}
/*
* Flush, mark and unlock a folio that's now completely read. If we want to
* cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
* dirty and let writeback handle it.
*/
static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq,
struct folio_queue *folioq,
int slot)
{
struct netfs_folio *finfo;
struct folio *folio = folioq_folio(folioq, slot);
flush_dcache_folio(folio);
folio_mark_uptodate(folio);
if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
finfo = netfs_folio_info(folio);
if (finfo) {
trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
if (finfo->netfs_group)
folio_change_private(folio, finfo->netfs_group);
else
folio_detach_private(folio);
kfree(finfo);
}
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
folio_mark_dirty(folio);
}
} else {
trace_netfs_folio(folio, netfs_folio_trace_read_done);
}
} else {
// TODO: Use of PG_private_2 is deprecated.
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
_debug("no unlock");
} else {
trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
folio_unlock(folio);
}
}
}
/*
* Unlock any folios that are now completely read. Returns true if the
* subrequest is removed from the list.
*/
static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async)
{
struct netfs_io_subrequest *prev, *next;
struct netfs_io_request *rreq = subreq->rreq;
struct folio_queue *folioq = subreq->curr_folioq;
size_t avail, prev_donated, next_donated, fsize, part, excess;
loff_t fpos, start;
loff_t fend;
int slot = subreq->curr_folioq_slot;
if (WARN(subreq->transferred > subreq->len,
"Subreq overread: R%x[%x] %zu > %zu",
rreq->debug_id, subreq->debug_index,
subreq->transferred, subreq->len))
subreq->transferred = subreq->len;
next_folio:
fsize = PAGE_SIZE << subreq->curr_folio_order;
fpos = round_down(subreq->start + subreq->consumed, fsize);
fend = fpos + fsize;
if (WARN_ON_ONCE(!folioq) ||
WARN_ON_ONCE(!folioq_folio(folioq, slot)) ||
WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) {
pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n",
rreq->debug_id, subreq->debug_index,
subreq->start, subreq->start + subreq->transferred - 1,
subreq->consumed, subreq->transferred, subreq->len,
slot);
if (folioq) {
struct folio *folio = folioq_folio(folioq, slot);
pr_err("folioq: orders=%02x%02x%02x%02x\n",
folioq->orders[0], folioq->orders[1],
folioq->orders[2], folioq->orders[3]);
if (folio)
pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
fpos, fend - 1, folio_pos(folio), folio_order(folio),
folioq_folio_order(folioq, slot));
}
}
donation_changed:
/* Try to consume the current folio if we've hit or passed the end of
* it. There's a possibility that this subreq doesn't start at the
* beginning of the folio, in which case we need to donate to/from the
* preceding subreq.
*
* We also need to include any potential donation back from the
* following subreq.
*/
prev_donated = READ_ONCE(subreq->prev_donated);
next_donated = READ_ONCE(subreq->next_donated);
if (prev_donated || next_donated) {
spin_lock_bh(&rreq->lock);
prev_donated = subreq->prev_donated;
next_donated = subreq->next_donated;
subreq->start -= prev_donated;
subreq->len += prev_donated;
subreq->transferred += prev_donated;
prev_donated = subreq->prev_donated = 0;
if (subreq->transferred == subreq->len) {
subreq->len += next_donated;
subreq->transferred += next_donated;
next_donated = subreq->next_donated = 0;
}
trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
spin_unlock_bh(&rreq->lock);
}
avail = subreq->transferred;
if (avail == subreq->len)
avail += next_donated;
start = subreq->start;
if (subreq->consumed == 0) {
start -= prev_donated;
avail += prev_donated;
} else {
start += subreq->consumed;
avail -= subreq->consumed;
}
part = umin(avail, fsize);
trace_netfs_progress(subreq, start, avail, part);
if (start + avail >= fend) {
if (fpos == start) {
/* Flush, unlock and mark for caching any folio we've just read. */
subreq->consumed = fend - subreq->start;
netfs_unlock_read_folio(subreq, rreq, folioq, slot);
folioq_mark2(folioq, slot);
if (subreq->consumed >= subreq->len)
goto remove_subreq;
} else if (fpos < start) {
excess = fend - subreq->start;
spin_lock_bh(&rreq->lock);
/* If we complete first on a folio split with the
* preceding subreq, donate to that subreq - otherwise
* we get the responsibility.
*/
if (subreq->prev_donated != prev_donated) {
spin_unlock_bh(&rreq->lock);
goto donation_changed;
}
if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
spin_unlock_bh(&rreq->lock);
pr_err("Can't donate prior to front\n");
goto bad;
}
prev = list_prev_entry(subreq, rreq_link);
WRITE_ONCE(prev->next_donated, prev->next_donated + excess);
subreq->start += excess;
subreq->len -= excess;
subreq->transferred -= excess;
trace_netfs_donate(rreq, subreq, prev, excess,
netfs_trace_donate_tail_to_prev);
trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
if (subreq->consumed >= subreq->len)
goto remove_subreq_locked;
spin_unlock_bh(&rreq->lock);
} else {
pr_err("fpos > start\n");
goto bad;
}
/* Advance the rolling buffer to the next folio. */
slot++;
if (slot >= folioq_nr_slots(folioq)) {
slot = 0;
folioq = folioq->next;
subreq->curr_folioq = folioq;
}
subreq->curr_folioq_slot = slot;
if (folioq && folioq_folio(folioq, slot))
subreq->curr_folio_order = folioq->orders[slot];
if (!was_async)
cond_resched();
goto next_folio;
}
/* Deal with partial progress. */
if (subreq->transferred < subreq->len)
return false;
/* Donate the remaining downloaded data to one of the neighbouring
* subrequests. Note that we may race with them doing the same thing.
*/
spin_lock_bh(&rreq->lock);
if (subreq->prev_donated != prev_donated ||
subreq->next_donated != next_donated) {
spin_unlock_bh(&rreq->lock);
cond_resched();
goto donation_changed;
}
/* Deal with the trickiest case: that this subreq is in the middle of a
* folio, not touching either edge, but finishes first. In such a
* case, we donate to the previous subreq, if there is one, so that the
* donation is only handled when that completes - and remove this
* subreq from the list.
*
* If the previous subreq finished first, we will have acquired their
* donation and should be able to unlock folios and/or donate nextwards.
*/
if (!subreq->consumed &&
!prev_donated &&
!list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
prev = list_prev_entry(subreq, rreq_link);
WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
subreq->start += subreq->len;
subreq->len = 0;
subreq->transferred = 0;
trace_netfs_donate(rreq, subreq, prev, subreq->len,
netfs_trace_donate_to_prev);
trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
goto remove_subreq_locked;
}
/* If we can't donate down the chain, donate up the chain instead. */
excess = subreq->len - subreq->consumed + next_donated;
if (!subreq->consumed)
excess += prev_donated;
if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
rreq->prev_donated = excess;
trace_netfs_donate(rreq, subreq, NULL, excess,
netfs_trace_donate_to_deferred_next);
} else {
next = list_next_entry(subreq, rreq_link);
WRITE_ONCE(next->prev_donated, excess);
trace_netfs_donate(rreq, subreq, next, excess,
netfs_trace_donate_to_next);
}
trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
subreq->len = subreq->consumed;
subreq->transferred = subreq->consumed;
goto remove_subreq_locked;
remove_subreq:
spin_lock_bh(&rreq->lock);
remove_subreq_locked:
subreq->consumed = subreq->len;
list_del(&subreq->rreq_link);
spin_unlock_bh(&rreq->lock);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
return true;
bad:
/* Errr... prev and next both donated to us, but insufficient to finish
* the folio.
*/
printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
rreq->debug_id, subreq->debug_index,
subreq->start, subreq->start + subreq->transferred - 1,
subreq->consumed, subreq->transferred, subreq->len);
printk("folio: %llx-%llx\n", fpos, fend - 1);
printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
printk("s=%llx av=%zx part=%zx\n", start, avail, part);
BUG();
}
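To make the donation rules concrete (sizes illustrative): suppose a 16 KiB folio is covered by two 8 KiB subrequests and the one covering the back half completes first. It starts mid-folio, so it cannot unlock anything itself; its bytes are recorded in the preceding subrequest's next_donated and it is removed from the list. When the front-half subrequest later completes it absorbs that donation, sees the folio fully available with fpos == start, unlocks it and marks the slot. Conversely, a subrequest that ends mid-folio passes its unconsumed tail forward through the next subrequest's prev_donated, or parks it in rreq->prev_donated if it is currently last in the list.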
/*
* Do page flushing and suchlike after DIO.
*/
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
unsigned int i;
/* Collect unbuffered reads and direct reads, adding up the transfer
* sizes until we find the first short or failed subrequest.
*/
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
rreq->transferred += subreq->transferred;
if (subreq->transferred < subreq->len ||
test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
rreq->error = subreq->error;
break;
}
}
if (rreq->origin == NETFS_DIO_READ) {
for (i = 0; i < rreq->direct_bv_count; i++) {
flush_dcache_page(rreq->direct_bv[i].bv_page);
// TODO: cifs marks pages in the destination buffer
// dirty under some circumstances after a read. Do we
// need to do that too?
set_page_dirty(rreq->direct_bv[i].bv_page);
}
}
if (rreq->iocb) {
rreq->iocb->ki_pos += rreq->transferred;
if (rreq->iocb->ki_complete)
rreq->iocb->ki_complete(
rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
}
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
if (rreq->origin == NETFS_DIO_READ)
inode_dio_end(rreq->inode);
}
/*
* Assess the state of a read request and decide what to do next.
*
* Note that we're in normal kernel thread context at this point, possibly
* running on a workqueue.
*/
static void netfs_rreq_assess(struct netfs_io_request *rreq)
{
trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
//netfs_rreq_is_still_valid(rreq);
if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) {
netfs_retry_reads(rreq);
return;
}
if (rreq->origin == NETFS_DIO_READ ||
rreq->origin == NETFS_READ_GAPS)
netfs_rreq_assess_dio(rreq);
task_io_account_read(rreq->transferred);
trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
trace_netfs_rreq(rreq, netfs_rreq_trace_done);
netfs_clear_subrequests(rreq, false);
netfs_unlock_abandoned_read_pages(rreq);
if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)))
netfs_pgpriv2_write_to_the_cache(rreq);
}
void netfs_read_termination_worker(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
netfs_see_request(rreq, netfs_rreq_trace_see_work);
netfs_rreq_assess(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete);
}
/*
* Handle the completion of all outstanding I/O operations on a read request.
* We inherit a ref from the caller.
*/
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async)
{
if (!was_async)
return netfs_rreq_assess(rreq);
if (!work_pending(&rreq->work)) {
netfs_get_request(rreq, netfs_rreq_trace_get_work);
if (!queue_work(system_unbound_wq, &rreq->work))
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq);
}
}
/**
* netfs_read_subreq_progress - Note progress of a read operation.
* @subreq: The read request that has made progress.
* @was_async: True if we're in an asynchronous context.
*
* This tells the read side of netfs lib that a contributory I/O operation has
* made some progress and that it may be possible to unlock some folios.
*
* Before calling, the filesystem should update subreq->transferred to track
* the amount of data copied into the output buffer.
*
* If @was_async is true, the caller might be running in softirq or interrupt
* context and we can't sleep.
*/
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
bool was_async)
{
struct netfs_io_request *rreq = subreq->rreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_progress);
if (subreq->transferred > subreq->consumed &&
(rreq->origin == NETFS_READAHEAD ||
rreq->origin == NETFS_READPAGE ||
rreq->origin == NETFS_READ_FOR_WRITE)) {
netfs_consume_read_data(subreq, was_async);
__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
}
}
EXPORT_SYMBOL(netfs_read_subreq_progress);
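A hypothetical receive-path snippet showing the intended calling convention (only the netfs_* identifiers are real):

static void myfs_data_received(struct netfs_io_subrequest *subreq,
			       size_t newly_copied)
{
	/* Account the bytes landed in the output buffer, then let netfs
	 * unlock any folios that are now completely read.
	 */
	subreq->transferred += newly_copied;
	netfs_read_subreq_progress(subreq, false /* not in softirq context */);
}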
/**
* netfs_read_subreq_terminated - Note the termination of an I/O operation.
* @subreq: The I/O request that has terminated.
* @error: Error code indicating type of completion.
* @was_async: The termination was asynchronous
*
* This tells the read helper that a contributory I/O operation has terminated,
* one way or another, and that it should integrate the results.
*
* The caller indicates the outcome of the operation through @error, supplying
* 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY
* is set) or a negative error code. The helper will look after reissuing I/O
* operations as appropriate and writing downloaded data to the cache.
*
* Before calling, the filesystem should update subreq->transferred to track
* the amount of data copied into the output buffer.
*
* If @was_async is true, the caller might be running in softirq or interrupt
* context and we can't sleep.
*/
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
int error, bool was_async)
{
struct netfs_io_request *rreq = subreq->rreq;
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
netfs_stat(&netfs_n_rh_read_done);
break;
case NETFS_DOWNLOAD_FROM_SERVER:
netfs_stat(&netfs_n_rh_download_done);
break;
default:
break;
}
if (rreq->origin != NETFS_DIO_READ) {
/* Collect buffered reads.
*
* If the read completed validly short, then we can clear the
* tail before going on to unlock the folios.
*/
if (error == 0 && subreq->transferred < subreq->len &&
(test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
netfs_clear_unread(subreq);
subreq->transferred = subreq->len;
trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
}
if (subreq->transferred > subreq->consumed &&
(rreq->origin == NETFS_READAHEAD ||
rreq->origin == NETFS_READPAGE ||
rreq->origin == NETFS_READ_FOR_WRITE)) {
netfs_consume_read_data(subreq, was_async);
__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
}
rreq->transferred += subreq->transferred;
}
/* Deal with retry requests, short reads and errors. If we retry
* but don't make progress, we abandon the attempt.
*/
if (!error && subreq->transferred < subreq->len) {
if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
} else {
trace_netfs_sreq(subreq, netfs_sreq_trace_short);
if (subreq->transferred > subreq->consumed) {
__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
} else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
} else {
__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
error = -ENODATA;
}
}
}
subreq->error = error;
trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
if (unlikely(error < 0)) {
trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
if (subreq->source == NETFS_READ_FROM_CACHE) {
netfs_stat(&netfs_n_rh_read_failed);
} else {
netfs_stat(&netfs_n_rh_download_failed);
set_bit(NETFS_RREQ_FAILED, &rreq->flags);
rreq->error = subreq->error;
}
}
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_terminated(rreq, was_async);
netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);
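And a matching, hypothetical completion path for ->issue_read() (again, only the netfs_* identifiers and flags are real):

static void myfs_read_complete(struct netfs_io_subrequest *subreq,
			       ssize_t transferred_or_error, bool hit_eof)
{
	int error = 0;

	if (transferred_or_error < 0) {
		error = transferred_or_error;
	} else {
		subreq->transferred += transferred_or_error;
		if (hit_eof)
			/* Validly short: the helper clears the tail. */
			__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
	}
	netfs_read_subreq_terminated(subreq, error, false);
}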

fs/netfs/read_pgpriv2.c (new file, 264 lines)

@@ -0,0 +1,264 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Read with PG_private_2 [DEPRECATED].
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
/*
* [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2. The
* third mark in the folio queue is used to indicate that this folio needs
* writing.
*/
void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq,
struct folio_queue *folioq,
int slot)
{
struct folio *folio = folioq_folio(folioq, slot);
trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_start_private_2(folio);
folioq_mark3(folioq, slot);
}
/*
* [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an
* unrecoverable error.
*/
static void netfs_pgpriv2_cancel(struct folio_queue *folioq)
{
struct folio *folio;
int slot;
while (folioq) {
if (!folioq->marks3) {
folioq = folioq->next;
continue;
}
slot = __ffs(folioq->marks3);
folio = folioq_folio(folioq, slot);
trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
folio_end_private_2(folio);
folioq_unmark3(folioq, slot);
}
}
/*
* [DEPRECATED] Copy a folio to the cache with PG_private_2 set.
*/
static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio *folio)
{
struct netfs_io_stream *cache = &wreq->io_streams[1];
size_t fsize = folio_size(folio), flen = fsize;
loff_t fpos = folio_pos(folio), i_size;
bool to_eof = false;
_enter("");
/* netfs_perform_write() may shift i_size around the page or from out
* of the page to beyond it, but cannot move i_size into or through the
* page since we have it locked.
*/
i_size = i_size_read(wreq->inode);
if (fpos >= i_size) {
/* mmap beyond eof. */
_debug("beyond eof");
folio_end_private_2(folio);
return 0;
}
if (fpos + fsize > wreq->i_size)
wreq->i_size = i_size;
if (flen > i_size - fpos) {
flen = i_size - fpos;
to_eof = true;
} else if (flen == i_size - fpos) {
to_eof = true;
}
_debug("folio %zx %zx", flen, fsize);
trace_netfs_folio(folio, netfs_folio_trace_store_copy);
/* Attach the folio to the rolling buffer. */
if (netfs_buffer_append_folio(wreq, folio, false) < 0)
return -ENOMEM;
cache->submit_extendable_to = fsize;
cache->submit_off = 0;
cache->submit_len = flen;
/* Attach the folio to one or more subrequests. For a big folio, we
* could end up with thousands of subrequests if the wsize is small -
* but we might need to wait during the creation of subrequests for
* network resources (eg. SMB credits).
*/
do {
ssize_t part;
wreq->io_iter.iov_offset = cache->submit_off;
atomic64_set(&wreq->issued_to, fpos + cache->submit_off);
cache->submit_extendable_to = fsize - cache->submit_off;
part = netfs_advance_write(wreq, cache, fpos + cache->submit_off,
cache->submit_len, to_eof);
cache->submit_off += part;
if (part > cache->submit_len)
cache->submit_len = 0;
else
cache->submit_len -= part;
} while (cache->submit_len > 0);
wreq->io_iter.iov_offset = 0;
iov_iter_advance(&wreq->io_iter, fsize);
atomic64_set(&wreq->issued_to, fpos + fsize);
if (flen < fsize)
netfs_issue_write(wreq, cache);
_leave(" = 0");
return 0;
}
/*
* [DEPRECATED] Go through the buffer and write any folios that are marked with
* the third mark to the cache.
*/
void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq)
{
struct netfs_io_request *wreq;
struct folio_queue *folioq;
struct folio *folio;
int error = 0;
int slot = 0;
_enter("");
if (!fscache_resources_valid(&rreq->cache_resources))
goto couldnt_start;
/* Need the first folio to be able to set up the op. */
for (folioq = rreq->buffer; folioq; folioq = folioq->next) {
if (folioq->marks3) {
slot = __ffs(folioq->marks3);
break;
}
}
if (!folioq)
return;
folio = folioq_folio(folioq, slot);
wreq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio),
NETFS_PGPRIV2_COPY_TO_CACHE);
if (IS_ERR(wreq)) {
kleave(" [create %ld]", PTR_ERR(wreq));
goto couldnt_start;
}
trace_netfs_write(wreq, netfs_write_trace_copy_to_cache);
netfs_stat(&netfs_n_wh_copy_to_cache);
for (;;) {
error = netfs_pgpriv2_copy_folio(wreq, folio);
if (error < 0)
break;
folioq_unmark3(folioq, slot);
if (!folioq->marks3) {
folioq = folioq->next;
if (!folioq)
break;
}
slot = __ffs(folioq->marks3);
folio = folioq_folio(folioq, slot);
}
netfs_issue_write(wreq, &wreq->io_streams[1]);
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
_leave(" = %d", error);
couldnt_start:
netfs_pgpriv2_cancel(rreq->buffer);
}
/*
* [DEPRECATED] Remove the PG_private_2 mark from any folios we've finished
* copying.
*/
bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq)
{
struct folio_queue *folioq = wreq->buffer;
unsigned long long collected_to = wreq->collected_to;
unsigned int slot = wreq->buffer_head_slot;
bool made_progress = false;
if (slot >= folioq_nr_slots(folioq)) {
folioq = netfs_delete_buffer_head(wreq);
slot = 0;
}
for (;;) {
struct folio *folio;
unsigned long long fpos, fend;
size_t fsize, flen;
folio = folioq_folio(folioq, slot);
if (WARN_ONCE(!folio_test_private_2(folio),
"R=%08x: folio %lx is not marked private_2\n",
wreq->debug_id, folio->index))
trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
fpos = folio_pos(folio);
fsize = folio_size(folio);
flen = fsize;
fend = min_t(unsigned long long, fpos + flen, wreq->i_size);
trace_netfs_collect_folio(wreq, folio, fend, collected_to);
/* Unlock any folio we've transferred all of. */
if (collected_to < fend)
break;
trace_netfs_folio(folio, netfs_folio_trace_end_copy);
folio_end_private_2(folio);
wreq->cleaned_to = fpos + fsize;
made_progress = true;
/* Clean up the head folioq. If we clear an entire folioq, then
* we can get rid of it provided it's not also the tail folioq
* being filled by the issuer.
*/
folioq_clear(folioq, slot);
slot++;
if (slot >= folioq_nr_slots(folioq)) {
if (READ_ONCE(wreq->buffer_tail) == folioq)
break;
folioq = netfs_delete_buffer_head(wreq);
slot = 0;
}
if (fpos + fsize >= collected_to)
break;
}
wreq->buffer = folioq;
wreq->buffer_head_slot = slot;
return made_progress;
}
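The slot-advance-and-free step in the loop above - move to the next slot and drop a fully consumed head folio_queue unless the issuer is still filling it - is shared with the main write collector. Condensed into one helper purely for illustration (same assumptions as the code above; returning NULL means "stop for now"):

static struct folio_queue *advance_buffer_head(struct netfs_io_request *wreq,
                                               struct folio_queue *folioq,
                                               unsigned int *slot)
{
        if (++*slot >= folioq_nr_slots(folioq)) {
                /* Never free the tail folioq: the issuer may still be
                 * appending folios to it. */
                if (READ_ONCE(wreq->buffer_tail) == folioq)
                        return NULL;
                folioq = netfs_delete_buffer_head(wreq);
                *slot = 0;
        }
        return folioq;
}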

fs/netfs/read_retry.c (new file)

@ -0,0 +1,256 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem read subrequest retrying.
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include "internal.h"
static void netfs_reissue_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
struct iov_iter *io_iter = &subreq->io_iter;
if (iov_iter_is_folioq(io_iter)) {
subreq->curr_folioq = (struct folio_queue *)io_iter->folioq;
subreq->curr_folioq_slot = io_iter->folioq_slot;
subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot];
}
atomic_inc(&rreq->nr_outstanding);
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
subreq->rreq->netfs_ops->issue_read(subreq);
}
/*
* Go through the list of failed/short reads, retrying all retryable ones. We
* need to switch failed cache reads to network downloads.
*/
static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream0 = &rreq->io_streams[0];
LIST_HEAD(sublist);
LIST_HEAD(queue);
_enter("R=%x", rreq->debug_id);
if (list_empty(&rreq->subrequests))
return;
if (rreq->netfs_ops->retry_request)
rreq->netfs_ops->retry_request(rreq, NULL);
/* If there's no renegotiation to do, just resend each retryable subreq
* up to the first permanently failed one.
*/
if (!rreq->netfs_ops->prepare_read &&
!test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) {
struct netfs_io_subrequest *subreq;
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
break;
if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
netfs_reset_iter(subreq);
netfs_reissue_read(rreq, subreq);
}
}
return;
}
/* Okay, we need to renegotiate all the download requests and flip any
* failed cache reads over to being download requests and negotiate
* those also. All fully successful subreqs have been removed from the
* list and any spare data from those has been donated.
*
* What we do is decant the list and rebuild it one subreq at a time so
* that we don't end up with donations jumping over a gap we're busy
* populating with smaller subrequests. In the event that the subreq
* we just launched finishes before we insert the next subreq, it'll
* fill in rreq->prev_donated instead.
* Note: Alternatively, we could split the tail subrequest right before
* we reissue it and fix up the donations under lock.
*/
list_splice_init(&rreq->subrequests, &queue);
do {
struct netfs_io_subrequest *from;
struct iov_iter source;
unsigned long long start, len;
size_t part, deferred_next_donated = 0;
bool boundary = false;
/* Go through the subreqs and find the next span of contiguous
* buffer that we then rejig (cifs, for example, needs the
* rsize renegotiating) and reissue.
*/
from = list_first_entry(&queue, struct netfs_io_subrequest, rreq_link);
list_move_tail(&from->rreq_link, &sublist);
start = from->start + from->transferred;
len = from->len - from->transferred;
_debug("from R=%08x[%x] s=%llx ctl=%zx/%zx/%zx",
rreq->debug_id, from->debug_index,
from->start, from->consumed, from->transferred, from->len);
if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
!test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
goto abandon;
deferred_next_donated = from->next_donated;
while ((subreq = list_first_entry_or_null(
&queue, struct netfs_io_subrequest, rreq_link))) {
if (subreq->start != start + len ||
subreq->transferred > 0 ||
!test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
break;
list_move_tail(&subreq->rreq_link, &sublist);
len += subreq->len;
deferred_next_donated = subreq->next_donated;
if (test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags))
break;
}
_debug(" - range: %llx-%llx %llx", start, start + len - 1, len);
/* Determine the set of buffers we're going to use. Each
* subreq gets a subset of a single overall contiguous buffer.
*/
netfs_reset_iter(from);
source = from->io_iter;
source.count = len;
/* Work through the sublist. */
while ((subreq = list_first_entry_or_null(
&sublist, struct netfs_io_subrequest, rreq_link))) {
list_del(&subreq->rreq_link);
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start - subreq->transferred;
subreq->len = len + subreq->transferred;
stream0->sreq_max_len = subreq->len;
__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
spin_lock_bh(&rreq->lock);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
subreq->prev_donated += rreq->prev_donated;
rreq->prev_donated = 0;
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
spin_unlock_bh(&rreq->lock);
BUG_ON(!len);
/* Renegotiate max_len (rsize) */
if (rreq->netfs_ops->prepare_read(subreq) < 0) {
trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed);
__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
}
part = umin(len, stream0->sreq_max_len);
if (unlikely(rreq->io_streams[0].sreq_max_segs))
part = netfs_limit_iter(&source, 0, part, stream0->sreq_max_segs);
subreq->len = subreq->transferred + part;
subreq->io_iter = source;
iov_iter_truncate(&subreq->io_iter, part);
iov_iter_advance(&source, part);
len -= part;
start += part;
if (!len) {
if (boundary)
__set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
subreq->next_donated = deferred_next_donated;
} else {
__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
subreq->next_donated = 0;
}
netfs_reissue_read(rreq, subreq);
if (!len)
break;
/* If we ran out of subrequests, allocate another. */
if (list_empty(&sublist)) {
subreq = netfs_alloc_subrequest(rreq);
if (!subreq)
goto abandon;
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start;
/* We get two refs, but need just one. */
netfs_put_subrequest(subreq, false, netfs_sreq_trace_new);
trace_netfs_sreq(subreq, netfs_sreq_trace_split);
list_add_tail(&subreq->rreq_link, &sublist);
}
}
/* If we managed to use fewer subreqs, we can discard the
* excess.
*/
while ((subreq = list_first_entry_or_null(
&sublist, struct netfs_io_subrequest, rreq_link))) {
trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
list_del(&subreq->rreq_link);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
}
} while (!list_empty(&queue));
return;
/* If we hit ENOMEM, fail all remaining subrequests */
abandon:
list_splice_init(&sublist, &queue);
list_for_each_entry(subreq, &queue, rreq_link) {
if (!subreq->error)
subreq->error = -ENOMEM;
__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
__clear_bit(NETFS_SREQ_RETRYING, &subreq->flags);
}
spin_lock_bh(&rreq->lock);
list_splice_tail_init(&queue, &rreq->subrequests);
spin_unlock_bh(&rreq->lock);
}
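The retry loop above repeatedly slices one contiguous source iterator into per-subrequest iterators. That carve step in isolation looks like this (a sketch, mirroring the subreq->io_iter handling above rather than reproducing it):

#include <linux/uio.h>

/* Give the next subrequest a view of the first "part" bytes of the source
 * and move the source on past them. */
static void carve_subreq_iter(struct iov_iter *source, struct iov_iter *dest,
                              size_t part)
{
        *dest = *source;                /* same underlying buffer          */
        iov_iter_truncate(dest, part);  /* this subreq sees only its slice */
        iov_iter_advance(source, part); /* the next subreq starts after it */
}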
/*
* Retry reads.
*/
void netfs_retry_reads(struct netfs_io_request *rreq)
{
trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
atomic_inc(&rreq->nr_outstanding);
netfs_retry_read_subrequests(rreq);
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_terminated(rreq, false);
}
/*
* Unlock any pages that haven't been unlocked yet due to abandoned
* subrequests.
*/
void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq)
{
struct folio_queue *p;
for (p = rreq->buffer; p; p = p->next) {
for (int slot = 0; slot < folioq_count(p); slot++) {
struct folio *folio = folioq_folio(p, slot);
if (folio && !folioq_is_marked2(p, slot)) {
trace_netfs_folio(folio, netfs_folio_trace_abandon);
folio_unlock(folio);
}
}
}
}


@ -32,6 +32,7 @@ atomic_t netfs_n_wh_buffered_write;
atomic_t netfs_n_wh_writethrough;
atomic_t netfs_n_wh_dio_write;
atomic_t netfs_n_wh_writepages;
atomic_t netfs_n_wh_copy_to_cache;
atomic_t netfs_n_wh_wstream_conflict;
atomic_t netfs_n_wh_upload;
atomic_t netfs_n_wh_upload_done;
@ -39,45 +40,53 @@ atomic_t netfs_n_wh_upload_failed;
atomic_t netfs_n_wh_write;
atomic_t netfs_n_wh_write_done;
atomic_t netfs_n_wh_write_failed;
atomic_t netfs_n_wb_lock_skip;
atomic_t netfs_n_wb_lock_wait;
atomic_t netfs_n_folioq;
int netfs_stats_show(struct seq_file *m, void *v)
{
seq_printf(m, "Netfs : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n",
seq_printf(m, "Reads : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n",
atomic_read(&netfs_n_rh_dio_read),
atomic_read(&netfs_n_rh_readahead),
atomic_read(&netfs_n_rh_read_folio),
atomic_read(&netfs_n_rh_write_begin),
atomic_read(&netfs_n_rh_write_zskip));
seq_printf(m, "Netfs : BW=%u WT=%u DW=%u WP=%u\n",
seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n",
atomic_read(&netfs_n_wh_buffered_write),
atomic_read(&netfs_n_wh_writethrough),
atomic_read(&netfs_n_wh_dio_write),
atomic_read(&netfs_n_wh_writepages));
seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_wh_writepages),
atomic_read(&netfs_n_wh_copy_to_cache));
seq_printf(m, "ZeroOps: ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
atomic_read(&netfs_n_rh_write_zskip));
seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n",
seq_printf(m, "DownOps: DL=%u ds=%u df=%u di=%u\n",
atomic_read(&netfs_n_rh_download),
atomic_read(&netfs_n_rh_download_done),
atomic_read(&netfs_n_rh_download_failed),
atomic_read(&netfs_n_rh_download_instead));
seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n",
seq_printf(m, "CaRdOps: RD=%u rs=%u rf=%u\n",
atomic_read(&netfs_n_rh_read),
atomic_read(&netfs_n_rh_read_done),
atomic_read(&netfs_n_rh_read_failed));
seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n",
seq_printf(m, "UpldOps: UL=%u us=%u uf=%u\n",
atomic_read(&netfs_n_wh_upload),
atomic_read(&netfs_n_wh_upload_done),
atomic_read(&netfs_n_wh_upload_failed));
seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n",
seq_printf(m, "CaWrOps: WR=%u ws=%u wf=%u\n",
atomic_read(&netfs_n_wh_write),
atomic_read(&netfs_n_wh_write_done),
atomic_read(&netfs_n_wh_write_failed));
seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n",
seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n",
atomic_read(&netfs_n_rh_rreq),
atomic_read(&netfs_n_rh_sreq),
atomic_read(&netfs_n_folioq),
atomic_read(&netfs_n_wh_wstream_conflict));
seq_printf(m, "WbLock : skip=%u wait=%u\n",
atomic_read(&netfs_n_wb_lock_skip),
atomic_read(&netfs_n_wb_lock_wait));
return fscache_stats_show(m);
}
EXPORT_SYMBOL(netfs_stats_show);


@ -15,15 +15,11 @@
/* Notes made in the collector */
#define HIT_PENDING 0x01 /* A front op was still pending */
#define SOME_EMPTY 0x02 /* One or more streams are empty */
#define ALL_EMPTY 0x04 /* All streams are empty */
#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */
#define NEED_REASSESS 0x10 /* Need to loop round and reassess */
#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */
#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */
#define BUFFERED 0x80 /* The pagecache needs cleaning up */
#define NEED_RETRY 0x100 /* A front op requests retrying */
#define SAW_FAILURE 0x200 /* At least one stream hit a permanent failure */
#define NEED_REASSESS 0x02 /* Need to loop round and reassess */
#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */
#define BUFFERED 0x08 /* The pagecache needs cleaning up */
#define NEED_RETRY 0x10 /* A front op requests retrying */
#define SAW_FAILURE 0x20 /* At least one stream hit a permanent failure */
/*
* Successful completion of write of a folio to the server and/or cache. Note
@ -81,56 +77,38 @@ end_wb:
return gcount;
}
/*
* Get hold of a folio we have under writeback. We don't want to get the
* refcount on it.
*/
static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos)
{
XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE);
struct folio *folio;
rcu_read_lock();
for (;;) {
xas_reset(&xas);
folio = xas_load(&xas);
if (xas_retry(&xas, folio))
continue;
if (!folio || xa_is_value(folio))
kdebug("R=%08x: folio %lx (%llx) not present",
wreq->debug_id, xas.xa_index, pos / PAGE_SIZE);
BUG_ON(!folio || xa_is_value(folio));
if (folio == xas_reload(&xas))
break;
}
rcu_read_unlock();
if (WARN_ONCE(!folio_test_writeback(folio),
"R=%08x: folio %lx is not under writeback\n",
wreq->debug_id, folio->index)) {
trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
}
return folio;
}
/*
* Unlock any folios we've finished with.
*/
static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
unsigned long long collected_to,
unsigned int *notes)
{
struct folio_queue *folioq = wreq->buffer;
unsigned long long collected_to = wreq->collected_to;
unsigned int slot = wreq->buffer_head_slot;
if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) {
if (netfs_pgpriv2_unlock_copied_folios(wreq))
*notes |= MADE_PROGRESS;
return;
}
if (slot >= folioq_nr_slots(folioq)) {
folioq = netfs_delete_buffer_head(wreq);
slot = 0;
}
for (;;) {
struct folio *folio;
struct netfs_folio *finfo;
unsigned long long fpos, fend;
size_t fsize, flen;
folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to);
folio = folioq_folio(folioq, slot);
if (WARN_ONCE(!folio_test_writeback(folio),
"R=%08x: folio %lx is not under writeback\n",
wreq->debug_id, folio->index))
trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
fpos = folio_pos(folio);
fsize = folio_size(folio);
@ -141,12 +119,6 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
trace_netfs_collect_folio(wreq, folio, fend, collected_to);
if (fpos + fsize > wreq->contiguity) {
trace_netfs_collect_contig(wreq, fpos + fsize,
netfs_contig_trace_unlock);
wreq->contiguity = fpos + fsize;
}
/* Unlock any folio we've transferred all of. */
if (collected_to < fend)
break;
@ -155,9 +127,25 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
wreq->cleaned_to = fpos + fsize;
*notes |= MADE_PROGRESS;
/* Clean up the head folioq. If we clear an entire folioq, then
* we can get rid of it provided it's not also the tail folioq
* being filled by the issuer.
*/
folioq_clear(folioq, slot);
slot++;
if (slot >= folioq_nr_slots(folioq)) {
if (READ_ONCE(wreq->buffer_tail) == folioq)
break;
folioq = netfs_delete_buffer_head(wreq);
slot = 0;
}
if (fpos + fsize >= collected_to)
break;
}
wreq->buffer = folioq;
wreq->buffer_head_slot = slot;
}
/*
@ -188,9 +176,12 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
break;
if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
struct iov_iter source = subreq->io_iter;
iov_iter_revert(&source, subreq->len - source.count);
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_write(stream, subreq);
netfs_reissue_write(stream, subreq, &source);
}
}
return;
@ -200,6 +191,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
do {
struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
struct iov_iter source;
unsigned long long start, len;
size_t part;
bool boundary = false;
@ -227,6 +219,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
len += to->len;
}
/* Determine the set of buffers we're going to use. Each
* subreq gets a subset of a single overall contiguous buffer.
*/
netfs_reset_iter(from);
source = from->io_iter;
source.count = len;
/* Work through the sublist. */
subreq = from;
list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
@ -238,7 +237,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
stream->prepare_write(subreq);
part = min(len, subreq->max_len);
part = min(len, stream->sreq_max_len);
subreq->len = part;
subreq->start = start;
subreq->transferred = 0;
@ -249,7 +248,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
boundary = true;
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_write(stream, subreq);
netfs_reissue_write(stream, subreq, &source);
if (subreq == to)
break;
}
@ -278,8 +277,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
subreq = netfs_alloc_subrequest(wreq);
subreq->source = to->source;
subreq->start = start;
subreq->max_len = len;
subreq->max_nr_segs = INT_MAX;
subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
subreq->stream_nr = to->stream_nr;
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
@ -293,10 +290,12 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
to = list_next_entry(to, rreq_link);
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
stream->sreq_max_len = len;
stream->sreq_max_segs = INT_MAX;
switch (stream->source) {
case NETFS_UPLOAD_TO_SERVER:
netfs_stat(&netfs_n_wh_upload);
subreq->max_len = min(len, wreq->wsize);
stream->sreq_max_len = umin(len, wreq->wsize);
break;
case NETFS_WRITE_TO_CACHE:
netfs_stat(&netfs_n_wh_write);
@ -307,7 +306,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
stream->prepare_write(subreq);
part = min(len, subreq->max_len);
part = umin(len, stream->sreq_max_len);
subreq->len = subreq->transferred + part;
len -= part;
start += part;
@ -316,7 +315,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
boundary = false;
}
netfs_reissue_write(stream, subreq);
netfs_reissue_write(stream, subreq, &source);
if (!len)
break;
@ -377,7 +376,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
{
struct netfs_io_subrequest *front, *remove;
struct netfs_io_stream *stream;
unsigned long long collected_to;
unsigned long long collected_to, issued_to;
unsigned int notes;
int s;
@ -386,28 +385,22 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
reassess_streams:
issued_to = atomic64_read(&wreq->issued_to);
smp_rmb();
collected_to = ULLONG_MAX;
if (wreq->origin == NETFS_WRITEBACK)
notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG;
else if (wreq->origin == NETFS_WRITETHROUGH)
notes = ALL_EMPTY | BUFFERED;
if (wreq->origin == NETFS_WRITEBACK ||
wreq->origin == NETFS_WRITETHROUGH ||
wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE)
notes = BUFFERED;
else
notes = ALL_EMPTY;
notes = 0;
/* Remove completed subrequests from the front of the streams and
* advance the completion point on each stream. We stop when we hit
* something that's in progress. The issuer thread may be adding stuff
* to the tail whilst we're doing this.
*
* We must not, however, merge in discontiguities that span whole
* folios that aren't under writeback. This is made more complicated
* by the folios in the gap being of unpredictable sizes - if they even
* exist - but we don't want to look them up.
*/
for (s = 0; s < NR_IO_STREAMS; s++) {
loff_t rstart, rend;
stream = &wreq->io_streams[s];
/* Read active flag before list pointers */
if (!smp_load_acquire(&stream->active))
@ -419,26 +412,10 @@ reassess_streams:
//_debug("sreq [%x] %llx %zx/%zx",
// front->debug_index, front->start, front->transferred, front->len);
/* Stall if there may be a discontinuity. */
rstart = round_down(front->start, PAGE_SIZE);
if (rstart > wreq->contiguity) {
if (wreq->contiguity > stream->collected_to) {
trace_netfs_collect_gap(wreq, stream,
wreq->contiguity, 'D');
stream->collected_to = wreq->contiguity;
if (stream->collected_to < front->start) {
trace_netfs_collect_gap(wreq, stream, issued_to, 'F');
stream->collected_to = front->start;
}
notes |= REASSESS_DISCONTIG;
break;
}
rend = round_up(front->start + front->len, PAGE_SIZE);
if (rend > wreq->contiguity) {
trace_netfs_collect_contig(wreq, rend,
netfs_contig_trace_collect);
wreq->contiguity = rend;
if (notes & REASSESS_DISCONTIG)
notes |= NEED_REASSESS;
}
notes &= ~MAYBE_DISCONTIG;
/* Stall if the front is still undergoing I/O. */
if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
@ -473,33 +450,27 @@ reassess_streams:
cancel:
/* Remove if completely consumed. */
spin_lock(&wreq->lock);
spin_lock_bh(&wreq->lock);
remove = front;
list_del_init(&front->rreq_link);
front = list_first_entry_or_null(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
stream->front = front;
if (!front) {
unsigned long long jump_to = atomic64_read(&wreq->issued_to);
if (stream->collected_to < jump_to) {
trace_netfs_collect_gap(wreq, stream, jump_to, 'A');
stream->collected_to = jump_to;
}
}
spin_unlock(&wreq->lock);
spin_unlock_bh(&wreq->lock);
netfs_put_subrequest(remove, false,
notes & SAW_FAILURE ?
netfs_sreq_trace_put_cancel :
netfs_sreq_trace_put_done);
}
if (front)
notes &= ~ALL_EMPTY;
else
notes |= SOME_EMPTY;
/* If we have an empty stream, we need to jump it forward
* otherwise the collection point will never advance.
*/
if (!front && issued_to > stream->collected_to) {
trace_netfs_collect_gap(wreq, stream, issued_to, 'E');
stream->collected_to = issued_to;
}
if (stream->collected_to < collected_to)
collected_to = stream->collected_to;
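As the comment above describes, the collector pops completed subrequests off the front of each stream while the issuer may still be appending at the tail, and the lock is now taken with bottom halves disabled because cachefiles completions can run in softirq context. A condensed sketch of that front-of-stream removal (illustrative, not the exact collector code):

/* Sketch: detach the front subrequest of a stream if it has completed,
 * under the bh-disabling request lock. */
static struct netfs_io_subrequest *pop_completed_front(struct netfs_io_request *wreq,
                                                       struct netfs_io_stream *stream)
{
        struct netfs_io_subrequest *front;

        spin_lock_bh(&wreq->lock);
        front = list_first_entry_or_null(&stream->subrequests,
                                         struct netfs_io_subrequest, rreq_link);
        if (front && !test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags))
                list_del_init(&front->rreq_link);
        else
                front = NULL;
        spin_unlock_bh(&wreq->lock);
        return front;
}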
@ -508,36 +479,6 @@ reassess_streams:
if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
wreq->collected_to = collected_to;
/* If we have an empty stream, we need to jump it forward over any gap
* otherwise the collection point will never advance.
*
* Note that the issuer always adds to the stream with the lowest
* so-far submitted start, so if we see two consecutive subreqs in one
stream with nothing between them in another stream, then the second
* stream has a gap that can be jumped.
*/
if (notes & SOME_EMPTY) {
unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted);
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (stream->active &&
stream->front &&
stream->front->start < jump_to)
jump_to = stream->front->start;
}
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (stream->active &&
!stream->front &&
stream->collected_to < jump_to) {
trace_netfs_collect_gap(wreq, stream, jump_to, 'B');
stream->collected_to = jump_to;
}
}
}
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (stream->active)
@ -548,43 +489,14 @@ reassess_streams:
/* Unlock any folios that we have now finished with. */
if (notes & BUFFERED) {
unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity);
if (wreq->cleaned_to < clean_to)
netfs_writeback_unlock_folios(wreq, clean_to, &notes);
if (wreq->cleaned_to < wreq->collected_to)
netfs_writeback_unlock_folios(wreq, &notes);
} else {
wreq->cleaned_to = wreq->collected_to;
}
// TODO: Discard encryption buffers
/* If all streams are discontiguous with the last folio we cleared, we
* may need to skip a set of folios.
*/
if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) {
unsigned long long jump_to = ULLONG_MAX;
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (stream->active && stream->front &&
stream->front->start < jump_to)
jump_to = stream->front->start;
}
trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump);
wreq->contiguity = jump_to;
wreq->cleaned_to = jump_to;
wreq->collected_to = jump_to;
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (stream->collected_to < jump_to)
stream->collected_to = jump_to;
}
//cond_resched();
notes |= MADE_PROGRESS;
goto reassess_streams;
}
if (notes & NEED_RETRY)
goto need_retry;
if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {


@ -95,7 +95,8 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
struct netfs_io_request *wreq;
struct netfs_inode *ictx;
bool is_buffered = (origin == NETFS_WRITEBACK ||
origin == NETFS_WRITETHROUGH);
origin == NETFS_WRITETHROUGH ||
origin == NETFS_PGPRIV2_COPY_TO_CACHE);
wreq = netfs_alloc_request(mapping, file, start, 0, origin);
if (IS_ERR(wreq))
@ -107,9 +108,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
if (is_buffered && netfs_is_cache_enabled(ictx))
fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
wreq->contiguity = wreq->start;
wreq->cleaned_to = wreq->start;
INIT_WORK(&wreq->work, netfs_write_collection_worker);
wreq->io_streams[0].stream_nr = 0;
wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER;
@ -158,22 +157,19 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
subreq = netfs_alloc_subrequest(wreq);
subreq->source = stream->source;
subreq->start = start;
subreq->max_len = ULONG_MAX;
subreq->max_nr_segs = INT_MAX;
subreq->stream_nr = stream->stream_nr;
subreq->io_iter = wreq->io_iter;
_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
stream->sreq_max_len = UINT_MAX;
stream->sreq_max_segs = INT_MAX;
switch (stream->source) {
case NETFS_UPLOAD_TO_SERVER:
netfs_stat(&netfs_n_wh_upload);
subreq->max_len = wreq->wsize;
stream->sreq_max_len = wreq->wsize;
break;
case NETFS_WRITE_TO_CACHE:
netfs_stat(&netfs_n_wh_write);
@ -192,7 +188,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
* the list. The collector only goes nextwards and uses the lock to
* remove entries off of the front.
*/
spin_lock(&wreq->lock);
spin_lock_bh(&wreq->lock);
list_add_tail(&subreq->rreq_link, &stream->subrequests);
if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
stream->front = subreq;
@ -203,7 +199,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
}
}
spin_unlock(&wreq->lock);
spin_unlock_bh(&wreq->lock);
stream->construct = subreq;
}
@ -223,31 +219,26 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream,
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
return netfs_write_subrequest_terminated(subreq, subreq->error, false);
// TODO: Use encrypted buffer
if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) {
subreq->io_iter = wreq->io_iter;
iov_iter_advance(&subreq->io_iter,
subreq->start + subreq->transferred - wreq->start);
iov_iter_truncate(&subreq->io_iter,
subreq->len - subreq->transferred);
} else {
iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages,
subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
}
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
stream->issue_write(subreq);
}
void netfs_reissue_write(struct netfs_io_stream *stream,
struct netfs_io_subrequest *subreq)
struct netfs_io_subrequest *subreq,
struct iov_iter *source)
{
size_t size = subreq->len - subreq->transferred;
// TODO: Use encrypted buffer
subreq->io_iter = *source;
iov_iter_advance(source, size);
iov_iter_truncate(&subreq->io_iter, size);
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
netfs_do_issue_write(stream, subreq);
}
static void netfs_issue_write(struct netfs_io_request *wreq,
void netfs_issue_write(struct netfs_io_request *wreq,
struct netfs_io_stream *stream)
{
struct netfs_io_subrequest *subreq = stream->construct;
@ -255,9 +246,7 @@ static void netfs_issue_write(struct netfs_io_request *wreq,
if (!subreq)
return;
stream->construct = NULL;
if (subreq->start + subreq->len > wreq->start + wreq->submitted)
WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start);
subreq->io_iter.count = subreq->len;
netfs_do_issue_write(stream, subreq);
}
@ -290,13 +279,14 @@ int netfs_advance_write(struct netfs_io_request *wreq,
netfs_prepare_write(wreq, stream, start);
subreq = stream->construct;
part = min(subreq->max_len - subreq->len, len);
_debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
part = umin(stream->sreq_max_len - subreq->len, len);
_debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len);
subreq->len += part;
subreq->nr_segs++;
stream->submit_extendable_to -= part;
if (subreq->len >= subreq->max_len ||
subreq->nr_segs >= subreq->max_nr_segs ||
if (subreq->len >= stream->sreq_max_len ||
subreq->nr_segs >= stream->sreq_max_segs ||
to_eof) {
netfs_issue_write(wreq, stream);
subreq = NULL;
@ -410,19 +400,26 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
folio_unlock(folio);
if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
if (!fscache_resources_valid(&wreq->cache_resources)) {
if (!cache->avail) {
trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
netfs_issue_write(wreq, upload);
netfs_folio_written_back(folio);
return 0;
}
trace_netfs_folio(folio, netfs_folio_trace_store_copy);
} else if (!upload->avail && !cache->avail) {
trace_netfs_folio(folio, netfs_folio_trace_cancel_store);
netfs_folio_written_back(folio);
return 0;
} else if (!upload->construct) {
trace_netfs_folio(folio, netfs_folio_trace_store);
} else {
trace_netfs_folio(folio, netfs_folio_trace_store_plus);
}
/* Attach the folio to the rolling buffer. */
netfs_buffer_append_folio(wreq, folio, false);
/* Move the submission point forward to allow for write-streaming data
* not starting at the front of the page. We don't do write-streaming
* with the cache as the cache requires DIO alignment.
@ -432,7 +429,6 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
*/
for (int s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
stream->submit_max_len = fsize;
stream->submit_off = foff;
stream->submit_len = flen;
if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
@ -440,7 +436,6 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
stream->submit_off = UINT_MAX;
stream->submit_len = 0;
stream->submit_max_len = 0;
}
}
@ -467,12 +462,13 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (choose_s < 0)
break;
stream = &wreq->io_streams[choose_s];
wreq->io_iter.iov_offset = stream->submit_off;
atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
stream->submit_extendable_to = fsize - stream->submit_off;
part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
stream->submit_len, to_eof);
atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
stream->submit_off += part;
stream->submit_max_len -= part;
if (part > stream->submit_len)
stream->submit_len = 0;
else
@ -481,6 +477,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
debug = true;
}
wreq->io_iter.iov_offset = 0;
iov_iter_advance(&wreq->io_iter, fsize);
atomic64_set(&wreq->issued_to, fpos + fsize);
if (!debug)
@ -505,10 +503,14 @@ int netfs_writepages(struct address_space *mapping,
struct folio *folio;
int error = 0;
if (wbc->sync_mode == WB_SYNC_ALL)
mutex_lock(&ictx->wb_lock);
else if (!mutex_trylock(&ictx->wb_lock))
if (!mutex_trylock(&ictx->wb_lock)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
netfs_stat(&netfs_n_wb_lock_skip);
return 0;
}
netfs_stat(&netfs_n_wb_lock_wait);
mutex_lock(&ictx->wb_lock);
}
/* Need the first folio to be able to set up the op. */
folio = writeback_iter(mapping, wbc, NULL, &error);
@ -525,10 +527,10 @@ int netfs_writepages(struct address_space *mapping,
netfs_stat(&netfs_n_wh_writepages);
do {
_debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
_debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to));
/* It appears we don't have to handle cyclic writeback wrapping. */
WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);
WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to));
if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
@ -672,6 +674,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
part = netfs_advance_write(wreq, upload, start, len, false);
start += part;
len -= part;
iov_iter_advance(&wreq->io_iter, part);
if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);


@ -267,6 +267,7 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi
rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
/* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
__set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
rreq->io_streams[0].sreq_max_len = NFS_SB(rreq->inode->i_sb)->rsize;
return 0;
}
@ -288,14 +289,6 @@ static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sre
return netfs;
}
static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq)
{
size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize;
sreq->len = min(sreq->len, rsize);
return true;
}
static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
{
struct nfs_netfs_io_data *netfs;
@ -304,17 +297,18 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
struct page *page;
unsigned long idx;
pgoff_t start, last;
int err;
pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
pgoff_t last = ((sreq->start + sreq->len -
sreq->transferred - 1) >> PAGE_SHIFT);
start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
last = ((sreq->start + sreq->len - sreq->transferred - 1) >> PAGE_SHIFT);
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
netfs = nfs_netfs_alloc(sreq);
if (!netfs)
return netfs_subreq_terminated(sreq, -ENOMEM, false);
return netfs_read_subreq_terminated(sreq, -ENOMEM, false);
pgio.pg_netfs = netfs; /* used in completion */
@ -380,5 +374,4 @@ const struct netfs_request_ops nfs_netfs_ops = {
.init_request = nfs_netfs_init_request,
.free_request = nfs_netfs_free_request,
.issue_read = nfs_netfs_issue_read,
.clamp_length = nfs_netfs_clamp_length
};


@ -60,8 +60,6 @@ static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs)
static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
{
ssize_t final_len;
/* Only the last RPC completion should call netfs_subreq_terminated() */
if (!refcount_dec_and_test(&netfs->refcount))
return;
@ -74,8 +72,9 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
* Correct the final length here to be no larger than the netfs subrequest
* length, and thus avoid netfs's "Subreq overread" warning message.
*/
final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred));
netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false);
netfs->sreq->transferred = min_t(s64, netfs->sreq->len,
atomic64_read(&netfs->transferred));
netfs_read_subreq_terminated(netfs->sreq, netfs->error, false);
kfree(netfs);
}
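The hunk above is one instance of the new read-completion convention: the filesystem accounts the bytes it actually received in subreq->transferred and hands netfs_read_subreq_terminated() only an error code. In general terms (a sketch of the calling convention, not code from the patch):

/* Sketch: report a finished read subrequest - transferred carries the byte
 * count, the second argument is purely an error code (0 on success). */
static void example_report_read_done(struct netfs_io_subrequest *subreq,
                                     ssize_t got_or_error, bool was_async)
{
        if (got_or_error < 0) {
                netfs_read_subreq_terminated(subreq, got_or_error, was_async);
                return;
        }
        subreq->transferred += got_or_error;
        netfs_read_subreq_terminated(subreq, 0, was_async);
}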
static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)


@ -21,127 +21,21 @@
#include <linux/random.h>
#include <linux/highmem.h>
#include <linux/fips.h>
#include <linux/iov_iter.h>
#include "../common/arc4.h"
#include <crypto/aead.h>
/*
* Hash data from a BVEC-type iterator.
*/
static int cifs_shash_bvec(const struct iov_iter *iter, ssize_t maxsize,
struct shash_desc *shash)
static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len,
void *priv, void *priv2)
{
const struct bio_vec *bv = iter->bvec;
unsigned long start = iter->iov_offset;
unsigned int i;
void *p;
int ret;
struct shash_desc *shash = priv;
int ret, *pret = priv2;
for (i = 0; i < iter->nr_segs; i++) {
size_t off, len;
len = bv[i].bv_len;
if (start >= len) {
start -= len;
continue;
ret = crypto_shash_update(shash, iter_base, len);
if (ret < 0) {
*pret = ret;
return len;
}
len = min_t(size_t, maxsize, len - start);
off = bv[i].bv_offset + start;
p = kmap_local_page(bv[i].bv_page);
ret = crypto_shash_update(shash, p + off, len);
kunmap_local(p);
if (ret < 0)
return ret;
maxsize -= len;
if (maxsize <= 0)
break;
start = 0;
}
return 0;
}
/*
* Hash data from a KVEC-type iterator.
*/
static int cifs_shash_kvec(const struct iov_iter *iter, ssize_t maxsize,
struct shash_desc *shash)
{
const struct kvec *kv = iter->kvec;
unsigned long start = iter->iov_offset;
unsigned int i;
int ret;
for (i = 0; i < iter->nr_segs; i++) {
size_t len;
len = kv[i].iov_len;
if (start >= len) {
start -= len;
continue;
}
len = min_t(size_t, maxsize, len - start);
ret = crypto_shash_update(shash, kv[i].iov_base + start, len);
if (ret < 0)
return ret;
maxsize -= len;
if (maxsize <= 0)
break;
start = 0;
}
return 0;
}
/*
* Hash data from an XARRAY-type iterator.
*/
static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize,
struct shash_desc *shash)
{
struct folio *folios[16], *folio;
unsigned int nr, i, j, npages;
loff_t start = iter->xarray_start + iter->iov_offset;
pgoff_t last, index = start / PAGE_SIZE;
ssize_t ret = 0;
size_t len, offset, foffset;
void *p;
if (maxsize == 0)
return 0;
last = (start + maxsize - 1) / PAGE_SIZE;
do {
nr = xa_extract(iter->xarray, (void **)folios, index, last,
ARRAY_SIZE(folios), XA_PRESENT);
if (nr == 0)
return -EIO;
for (i = 0; i < nr; i++) {
folio = folios[i];
npages = folio_nr_pages(folio);
foffset = start - folio_pos(folio);
offset = foffset % PAGE_SIZE;
for (j = foffset / PAGE_SIZE; j < npages; j++) {
len = min_t(size_t, maxsize, PAGE_SIZE - offset);
p = kmap_local_page(folio_page(folio, j));
ret = crypto_shash_update(shash, p + offset, len);
kunmap_local(p);
if (ret < 0)
return ret;
maxsize -= len;
if (maxsize <= 0)
return 0;
start += len;
offset = 0;
index++;
}
}
} while (nr == ARRAY_SIZE(folios));
return 0;
}
@ -151,21 +45,13 @@ static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize,
static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize,
struct shash_desc *shash)
{
if (maxsize == 0)
return 0;
struct iov_iter tmp_iter = *iter;
int err = -EIO;
switch (iov_iter_type(iter)) {
case ITER_BVEC:
return cifs_shash_bvec(iter, maxsize, shash);
case ITER_KVEC:
return cifs_shash_kvec(iter, maxsize, shash);
case ITER_XARRAY:
return cifs_shash_xarray(iter, maxsize, shash);
default:
pr_err("cifs_shash_iter(%u) unsupported\n", iov_iter_type(iter));
WARN_ON_ONCE(1);
return -EIO;
}
if (iterate_and_advance_kernel(&tmp_iter, maxsize, shash, &err,
cifs_shash_step) != maxsize)
return err;
return 0;
}
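cifs_shash_iter() now leans on iterate_and_advance_kernel() instead of per-iterator-type loops, handing each mapped segment to a step callback. The general shape of such a step function, shown with a trivial byte sum in place of the crypto hash (illustrative only):

/* Sketch: a step callback for iterate_and_advance_kernel(). priv carries an
 * accumulator; returning 0 consumes the whole segment, while returning a
 * byte count (as cifs_shash_step() does on error) stops the walk short so
 * the caller sees fewer bytes processed than it asked for. */
static size_t example_sum_step(void *iter_base, size_t progress, size_t len,
                               void *priv, void *priv2)
{
        u32 *sum = priv;
        const u8 *p = iter_base;
        size_t i;

        for (i = 0; i < len; i++)
                *sum += p[i];
        return 0;
}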
int __cifs_calc_signature(struct smb_rqst *rqst,


@ -255,7 +255,7 @@ struct smb_rqst {
struct kvec *rq_iov; /* array of kvecs */
unsigned int rq_nvec; /* number of kvecs in array */
struct iov_iter rq_iter; /* Data iterator */
struct xarray rq_buffer; /* Page buffer for encryption */
struct folio_queue *rq_buffer; /* Buffer for encryption */
};
struct mid_q_entry;
@ -1485,7 +1485,6 @@ struct cifs_io_subrequest {
struct cifs_io_request *req;
};
ssize_t got_bytes;
size_t actual_len;
unsigned int xid;
int result;
bool have_xid;
@ -1550,7 +1549,6 @@ struct cifsInodeInfo {
#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */
#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */
#define CIFS_INO_LOCK (5) /* lock bit for synchronization */
#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */
#define CIFS_INO_CLOSE_ON_LOCK (7) /* Not to defer the close when lock is set */
unsigned long flags;
spinlock_t writers_lock;


@ -1266,9 +1266,7 @@ static void cifs_readv_worker(struct work_struct *work)
struct cifs_io_subrequest *rdata =
container_of(work, struct cifs_io_subrequest, subreq.work);
netfs_subreq_terminated(&rdata->subreq,
(rdata->result == 0 || rdata->result == -EAGAIN) ?
rdata->got_bytes : rdata->result, true);
netfs_read_subreq_terminated(&rdata->subreq, rdata->result, true);
}
static void
@ -1327,15 +1325,16 @@ cifs_readv_callback(struct mid_q_entry *mid)
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
rdata->result = 0;
} else {
if (rdata->got_bytes < rdata->actual_len &&
rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes ==
ictx->remote_i_size) {
size_t trans = rdata->subreq.transferred + rdata->got_bytes;
if (trans < rdata->subreq.len &&
rdata->subreq.start + trans == ictx->remote_i_size) {
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
rdata->result = 0;
}
}
rdata->credits.value = 0;
rdata->subreq.transferred += rdata->got_bytes;
INIT_WORK(&rdata->subreq.work, cifs_readv_worker);
queue_work(cifsiod_wq, &rdata->subreq.work);
release_mid(mid);


@ -49,6 +49,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq)
struct cifs_io_subrequest *wdata =
container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = wdata->req;
struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr];
struct TCP_Server_Info *server;
struct cifsFileInfo *open_file = req->cfile;
size_t wsize = req->rreq.wsize;
@ -73,7 +74,7 @@ retry:
}
}
rc = server->ops->wait_mtu_credits(server, wsize, &wdata->subreq.max_len,
rc = server->ops->wait_mtu_credits(server, wsize, &stream->sreq_max_len,
&wdata->credits);
if (rc < 0) {
subreq->error = rc;
@ -92,7 +93,7 @@ retry:
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->smbd_conn)
subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
stream->sreq_max_segs = server->smbd_conn->max_frmr_depth;
#endif
}
@ -111,7 +112,6 @@ static void cifs_issue_write(struct netfs_io_subrequest *subreq)
goto fail;
}
wdata->actual_len = wdata->subreq.len;
rc = adjust_credits(wdata->server, wdata, cifs_trace_rw_credits_issue_write_adjust);
if (rc)
goto fail;
@ -140,25 +140,22 @@ static void cifs_netfs_invalidate_cache(struct netfs_io_request *wreq)
}
/*
* Split the read up according to how many credits we can get for each piece.
* It's okay to sleep here if we need to wait for more credit to become
* available.
*
* We also choose the server and allocate an operation ID to be cleaned up
* later.
* Negotiate the size of a read operation on behalf of the netfs library.
*/
static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
struct TCP_Server_Info *server = req->server;
struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
size_t rsize;
int rc;
size_t size;
int rc = 0;
if (!rdata->have_xid) {
rdata->xid = get_xid();
rdata->have_xid = true;
}
rdata->server = server;
if (cifs_sb->ctx->rsize == 0)
@ -166,13 +163,12 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink),
cifs_sb->ctx);
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&rsize, &rdata->credits);
if (rc) {
subreq->error = rc;
return false;
}
&size, &rdata->credits);
if (rc)
return rc;
rreq->io_streams[0].sreq_max_len = size;
rdata->credits.in_flight_check = 1;
rdata->credits.rreq_debug_id = rreq->debug_id;
@ -184,14 +180,11 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
server->credits, server->in_flight, 0,
cifs_trace_rw_credits_read_submit);
subreq->len = umin(subreq->len, rsize);
rdata->actual_len = subreq->len;
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->smbd_conn)
subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth;
#endif
return true;
return 0;
}
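cifs_prepare_read() shows the replacement for ->clamp_length(): negotiated limits are published through the request's read stream instead of clamping subreq->len directly. A minimal sketch of the same contract for a hypothetical filesystem (my_negotiate_rsize() is invented for illustration):

/* Sketch: a ->prepare_read() that negotiates a transfer size and exposes it
 * via the stream limits; a non-zero return fails the subrequest. */
static int example_prepare_read(struct netfs_io_subrequest *subreq)
{
        struct netfs_io_request *rreq = subreq->rreq;
        size_t max_len;
        int ret;

        ret = my_negotiate_rsize(rreq, &max_len);       /* hypothetical */
        if (ret < 0)
                return ret;

        rreq->io_streams[0].sreq_max_len = max_len;
        rreq->io_streams[0].sreq_max_segs = 0;          /* 0: no segment cap */
        return 0;
}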
/*
@ -200,59 +193,41 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
* to only read a portion of that, but as long as we read something, the netfs
* helper will call us again so that we can issue another read.
*/
static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
static void cifs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
struct TCP_Server_Info *server = req->server;
struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
int rc = 0;
cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n",
__func__, rreq->debug_id, subreq->debug_index, rreq->mapping,
subreq->transferred, subreq->len);
if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) {
/*
* As we're issuing a retry, we need to negotiate some new
* credits otherwise the server may reject the op with
* INVALID_PARAMETER. Note, however, we may get back less
* credit than we need to complete the op, in which case, we
* shorten the op and rely on additional rounds of retry.
*/
size_t rsize = umin(subreq->len - subreq->transferred,
cifs_sb->ctx->rsize);
rc = server->ops->wait_mtu_credits(server, rsize, &rdata->actual_len,
&rdata->credits);
rc = adjust_credits(server, rdata, cifs_trace_rw_credits_issue_read_adjust);
if (rc)
goto out;
rdata->credits.in_flight_check = 1;
trace_smb3_rw_credits(rdata->rreq->debug_id,
rdata->subreq.debug_index,
rdata->credits.value,
server->credits, server->in_flight, 0,
cifs_trace_rw_credits_read_resubmit);
}
goto failed;
if (req->cfile->invalidHandle) {
do {
rc = cifs_reopen_file(req->cfile, true);
} while (rc == -EAGAIN);
if (rc)
goto out;
goto failed;
}
if (subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
rc = rdata->server->ops->async_readv(rdata);
out:
if (rc)
netfs_subreq_terminated(subreq, rc, false);
goto failed;
return;
failed:
netfs_read_subreq_terminated(subreq, rc, false);
}
/*
@ -316,12 +291,6 @@ static void cifs_rreq_done(struct netfs_io_request *rreq)
inode_set_atime_to_ts(inode, inode_get_mtime(inode));
}
static void cifs_post_modify(struct inode *inode)
{
/* Indication to update ctime and mtime as close is deferred */
set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
}
static void cifs_free_request(struct netfs_io_request *rreq)
{
struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq);
@ -369,10 +338,9 @@ const struct netfs_request_ops cifs_req_ops = {
.init_request = cifs_init_request,
.free_request = cifs_free_request,
.free_subrequest = cifs_free_subrequest,
.clamp_length = cifs_clamp_length,
.issue_read = cifs_req_issue_read,
.prepare_read = cifs_prepare_read,
.issue_read = cifs_issue_read,
.done = cifs_rreq_done,
.post_modify = cifs_post_modify,
.begin_writeback = cifs_begin_writeback,
.prepare_write = cifs_prepare_write,
.issue_write = cifs_issue_write,
@ -1396,7 +1364,7 @@ int cifs_close(struct inode *inode, struct file *file)
dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
if ((cfile->status_file_deleted == false) &&
(smb2_can_defer_close(inode, dclose))) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
if (test_and_clear_bit(NETFS_ICTX_MODIFIED_ATTR, &cinode->netfs.flags)) {
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
}


@ -13,6 +13,7 @@
#include <linux/sort.h>
#include <crypto/aead.h>
#include <linux/fiemap.h>
#include <linux/folio_queue.h>
#include <uapi/linux/magic.h>
#include "cifsfs.h"
#include "cifsglob.h"
@ -301,7 +302,8 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
unsigned int /*enum smb3_rw_credits_trace*/ trace)
{
struct cifs_credits *credits = &subreq->credits;
int new_val = DIV_ROUND_UP(subreq->actual_len, SMB2_MAX_BUFFER_SIZE);
int new_val = DIV_ROUND_UP(subreq->subreq.len - subreq->subreq.transferred,
SMB2_MAX_BUFFER_SIZE);
int scredits, in_flight;
if (!credits->value || credits->value == new_val)
@ -4392,30 +4394,86 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
}
/*
* Clear a read buffer, discarding the folios which have XA_MARK_0 set.
* Clear a read buffer, discarding the folios which have the 1st mark set.
*/
static void cifs_clear_xarray_buffer(struct xarray *buffer)
static void cifs_clear_folioq_buffer(struct folio_queue *buffer)
{
struct folio *folio;
struct folio_queue *folioq;
XA_STATE(xas, buffer, 0);
rcu_read_lock();
xas_for_each_marked(&xas, folio, ULONG_MAX, XA_MARK_0) {
folio_put(folio);
while ((folioq = buffer)) {
for (int s = 0; s < folioq_count(folioq); s++)
if (folioq_is_marked(folioq, s))
folio_put(folioq_folio(folioq, s));
buffer = folioq->next;
kfree(folioq);
}
rcu_read_unlock();
xa_destroy(buffer);
}
/*
* Allocate buffer space into a folio queue.
*/
static struct folio_queue *cifs_alloc_folioq_buffer(ssize_t size)
{
struct folio_queue *buffer = NULL, *tail = NULL, *p;
struct folio *folio;
unsigned int slot;
do {
if (!tail || folioq_full(tail)) {
p = kmalloc(sizeof(*p), GFP_NOFS);
if (!p)
goto nomem;
folioq_init(p);
if (tail) {
tail->next = p;
p->prev = tail;
} else {
buffer = p;
}
tail = p;
}
folio = folio_alloc(GFP_KERNEL|__GFP_HIGHMEM, 0);
if (!folio)
goto nomem;
slot = folioq_append_mark(tail, folio);
size -= folioq_folio_size(tail, slot);
} while (size > 0);
return buffer;
nomem:
cifs_clear_folioq_buffer(buffer);
return NULL;
}
/*
* Copy data from an iterator to the folios in a folio queue buffer.
*/
static bool cifs_copy_iter_to_folioq(struct iov_iter *iter, size_t size,
struct folio_queue *buffer)
{
for (; buffer; buffer = buffer->next) {
for (int s = 0; s < folioq_count(buffer); s++) {
struct folio *folio = folioq_folio(buffer, s);
size_t part = folioq_folio_size(buffer, s);
part = umin(part, size);
if (copy_folio_from_iter(folio, 0, part, iter) != part)
return false;
size -= part;
}
}
return true;
}
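Taken together, the three helpers above replace the old xarray bounce buffer. An illustrative (not in-tree) combination of them:

/* Illustrative only: size a folio_queue buffer, fill it from a source
 * iterator, then describe it with an ITER_FOLIOQ iterator for the crypto or
 * transport code to consume. */
static int example_folioq_bounce(struct iov_iter *data, size_t size,
                                 struct iov_iter *out)
{
        struct folio_queue *buf;

        buf = cifs_alloc_folioq_buffer(size);
        if (!buf)
                return -ENOMEM;

        if (!cifs_copy_iter_to_folioq(data, size, buf)) {
                cifs_clear_folioq_buffer(buf);
                return -EIO;
        }

        iov_iter_folio_queue(out, ITER_SOURCE, buf, 0, 0, size);
        /* ... use *out, then cifs_clear_folioq_buffer(buf) when finished ... */
        return 0;
}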
void
smb3_free_compound_rqst(int num_rqst, struct smb_rqst *rqst)
{
int i;
for (i = 0; i < num_rqst; i++)
if (!xa_empty(&rqst[i].rq_buffer))
cifs_clear_xarray_buffer(&rqst[i].rq_buffer);
for (int i = 0; i < num_rqst; i++)
cifs_clear_folioq_buffer(rqst[i].rq_buffer);
}
/*
@ -4436,52 +4494,32 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
struct smb_rqst *new_rq, struct smb_rqst *old_rq)
{
struct smb2_transform_hdr *tr_hdr = new_rq[0].rq_iov[0].iov_base;
struct page *page;
unsigned int orig_len = 0;
int i, j;
int rc = -ENOMEM;
for (i = 1; i < num_rqst; i++) {
for (int i = 1; i < num_rqst; i++) {
struct smb_rqst *old = &old_rq[i - 1];
struct smb_rqst *new = &new_rq[i];
struct xarray *buffer = &new->rq_buffer;
size_t size = iov_iter_count(&old->rq_iter), seg, copied = 0;
struct folio_queue *buffer;
size_t size = iov_iter_count(&old->rq_iter);
orig_len += smb_rqst_len(server, old);
new->rq_iov = old->rq_iov;
new->rq_nvec = old->rq_nvec;
xa_init(buffer);
if (size > 0) {
unsigned int npages = DIV_ROUND_UP(size, PAGE_SIZE);
for (j = 0; j < npages; j++) {
void *o;
rc = -ENOMEM;
page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
if (!page)
buffer = cifs_alloc_folioq_buffer(size);
if (!buffer)
goto err_free;
page->index = j;
o = xa_store(buffer, j, page, GFP_KERNEL);
if (xa_is_err(o)) {
rc = xa_err(o);
put_page(page);
new->rq_buffer = buffer;
iov_iter_folio_queue(&new->rq_iter, ITER_SOURCE,
buffer, 0, 0, size);
if (!cifs_copy_iter_to_folioq(&old->rq_iter, size, buffer)) {
rc = -EIO;
goto err_free;
}
xa_set_mark(buffer, j, XA_MARK_0);
seg = min_t(size_t, size - copied, PAGE_SIZE);
if (copy_page_from_iter(page, 0, seg, &old->rq_iter) != seg) {
rc = -EFAULT;
goto err_free;
}
copied += seg;
}
iov_iter_xarray(&new->rq_iter, ITER_SOURCE,
buffer, 0, size);
}
}
@ -4545,16 +4583,16 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
}
static int
cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size,
unsigned int skip, struct iov_iter *iter)
cifs_copy_folioq_to_iter(struct folio_queue *folioq, size_t data_size,
size_t skip, struct iov_iter *iter)
{
struct page *page;
unsigned long index;
for (; folioq; folioq = folioq->next) {
for (int s = 0; s < folioq_count(folioq); s++) {
struct folio *folio = folioq_folio(folioq, s);
size_t fsize = folio_size(folio);
size_t n, len = umin(fsize - skip, data_size);
xa_for_each(pages, index, page) {
size_t n, len = min_t(unsigned int, PAGE_SIZE - skip, data_size);
n = copy_page_to_iter(page, skip, len, iter);
n = copy_folio_to_iter(folio, skip, len, iter);
if (n != len) {
cifs_dbg(VFS, "%s: something went wrong\n", __func__);
return -EIO;
@ -4562,14 +4600,15 @@ cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size,
data_size -= n;
skip = 0;
}
}
return 0;
}
static int
handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
char *buf, unsigned int buf_len, struct xarray *pages,
unsigned int pages_len, bool is_offloaded)
char *buf, unsigned int buf_len, struct folio_queue *buffer,
unsigned int buffer_len, bool is_offloaded)
{
unsigned int data_offset;
unsigned int data_len;
@ -4666,7 +4705,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
if (data_len > pages_len - pad_len) {
if (data_len > buffer_len - pad_len) {
/* data_len is corrupt -- discard frame */
rdata->result = -EIO;
if (is_offloaded)
@ -4677,7 +4716,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
}
/* Copy the data to the output I/O iterator. */
rdata->result = cifs_copy_pages_to_iter(pages, pages_len,
rdata->result = cifs_copy_folioq_to_iter(buffer, buffer_len,
cur_off, &rdata->subreq.io_iter);
if (rdata->result != 0) {
if (is_offloaded)
@ -4686,12 +4725,11 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
dequeue_mid(mid, rdata->result);
return 0;
}
rdata->got_bytes = pages_len;
rdata->got_bytes = buffer_len;
} else if (buf_len >= data_offset + data_len) {
/* read response payload is in buf */
WARN_ONCE(pages && !xa_empty(pages),
"read data can be either in buf or in pages");
WARN_ONCE(buffer, "read data can be either in buf or in buffer");
length = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter);
if (length < 0)
return length;
@ -4717,7 +4755,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
struct smb2_decrypt_work {
struct work_struct decrypt;
struct TCP_Server_Info *server;
struct xarray buffer;
struct folio_queue *buffer;
char *buf;
unsigned int len;
};
@ -4731,7 +4769,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
struct mid_q_entry *mid;
struct iov_iter iter;
iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, dw->len);
iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, dw->len);
rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size,
&iter, true);
if (rc) {
@ -4747,7 +4785,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
mid->decrypted = true;
rc = handle_read_data(dw->server, mid, dw->buf,
dw->server->vals->read_rsp_size,
&dw->buffer, dw->len,
dw->buffer, dw->len,
true);
if (rc >= 0) {
#ifdef CONFIG_CIFS_STATS2
@ -4780,7 +4818,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
}
free_pages:
cifs_clear_xarray_buffer(&dw->buffer);
cifs_clear_folioq_buffer(dw->buffer);
cifs_small_buf_release(dw->buf);
kfree(dw);
}
@ -4790,20 +4828,17 @@ static int
receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
int *num_mids)
{
struct page *page;
char *buf = server->smallbuf;
struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
struct iov_iter iter;
unsigned int len, npages;
unsigned int len;
unsigned int buflen = server->pdu_size;
int rc;
int i = 0;
struct smb2_decrypt_work *dw;
dw = kzalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL);
if (!dw)
return -ENOMEM;
xa_init(&dw->buffer);
INIT_WORK(&dw->decrypt, smb2_decrypt_offload);
dw->server = server;
@ -4819,26 +4854,14 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
len = le32_to_cpu(tr_hdr->OriginalMessageSize) -
server->vals->read_rsp_size;
dw->len = len;
npages = DIV_ROUND_UP(len, PAGE_SIZE);
len = round_up(dw->len, PAGE_SIZE);
rc = -ENOMEM;
for (; i < npages; i++) {
void *old;
page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
if (!page)
dw->buffer = cifs_alloc_folioq_buffer(len);
if (!dw->buffer)
goto discard_data;
page->index = i;
old = xa_store(&dw->buffer, i, page, GFP_KERNEL);
if (xa_is_err(old)) {
rc = xa_err(old);
put_page(page);
goto discard_data;
}
xa_set_mark(&dw->buffer, i, XA_MARK_0);
}
iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, npages * PAGE_SIZE);
iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, len);
/* Read the data into the buffer and clear excess bufferage. */
rc = cifs_read_iter_from_socket(server, &iter, dw->len);
@ -4846,9 +4869,9 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
goto discard_data;
server->total_read += rc;
if (rc < npages * PAGE_SIZE)
iov_iter_zero(npages * PAGE_SIZE - rc, &iter);
iov_iter_revert(&iter, npages * PAGE_SIZE);
if (rc < len)
iov_iter_zero(len - rc, &iter);
iov_iter_revert(&iter, len);
iov_iter_truncate(&iter, dw->len);
rc = cifs_discard_remaining_data(server);
@ -4883,7 +4906,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
(*mid)->decrypted = true;
rc = handle_read_data(server, *mid, buf,
server->vals->read_rsp_size,
&dw->buffer, dw->len, false);
dw->buffer, dw->len, false);
if (rc >= 0) {
if (server->ops->is_network_name_deleted) {
server->ops->is_network_name_deleted(buf,
@ -4893,7 +4916,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
}
free_pages:
cifs_clear_xarray_buffer(&dw->buffer);
cifs_clear_folioq_buffer(dw->buffer);
free_dw:
kfree(dw);
return rc;


@ -4498,9 +4498,7 @@ static void smb2_readv_worker(struct work_struct *work)
struct cifs_io_subrequest *rdata =
container_of(work, struct cifs_io_subrequest, subreq.work);
netfs_subreq_terminated(&rdata->subreq,
(rdata->result == 0 || rdata->result == -EAGAIN) ?
rdata->got_bytes : rdata->result, true);
netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false);
}
static void
@ -4532,7 +4530,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu/%zu\n",
__func__, mid->mid, mid->mid_state, rdata->result,
rdata->actual_len, rdata->subreq.len - rdata->subreq.transferred);
rdata->got_bytes, rdata->subreq.len - rdata->subreq.transferred);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@ -4554,6 +4552,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
__set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags);
rdata->result = -EAGAIN;
if (server->sign && rdata->got_bytes)
/* reset bytes number since we can not check a sign */
@ -4588,7 +4587,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
rdata->req->cfile->fid.persistent_fid,
tcon->tid, tcon->ses->Suid,
rdata->subreq.start + rdata->subreq.transferred,
rdata->actual_len,
rdata->subreq.len - rdata->subreq.transferred,
rdata->result);
} else
trace_smb3_read_done(rdata->rreq->debug_id,
@ -4603,9 +4602,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
rdata->result = 0;
} else {
if (rdata->got_bytes < rdata->actual_len &&
rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes ==
ictx->remote_i_size) {
size_t trans = rdata->subreq.transferred + rdata->got_bytes;
if (trans < rdata->subreq.len &&
rdata->subreq.start + trans == ictx->remote_i_size) {
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
rdata->result = 0;
}
@ -4614,6 +4613,10 @@ smb2_readv_callback(struct mid_q_entry *mid)
server->credits, server->in_flight,
0, cifs_trace_rw_credits_read_response_clear);
rdata->credits.value = 0;
rdata->subreq.transferred += rdata->got_bytes;
if (rdata->subreq.start + rdata->subreq.transferred >= rdata->subreq.rreq->i_size)
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress);
INIT_WORK(&rdata->subreq.work, smb2_readv_worker);
queue_work(cifsiod_wq, &rdata->subreq.work);
release_mid(mid);
@ -4648,7 +4651,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink);
io_parms.server = server = rdata->server;
io_parms.offset = subreq->start + subreq->transferred;
io_parms.length = rdata->actual_len;
io_parms.length = subreq->len - subreq->transferred;
io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid;
io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid;
io_parms.pid = rdata->req->pid;
@ -4669,7 +4672,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
shdr = (struct smb2_hdr *)buf;
if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->actual_len,
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(io_parms.length,
SMB2_MAX_BUFFER_SIZE));
credit_request = le16_to_cpu(shdr->CreditCharge) + 8;
if (server->credits >= server->max_credits)
@ -4697,7 +4700,8 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
rdata->xid, io_parms.persistent_fid,
io_parms.tcon->tid,
io_parms.tcon->ses->Suid,
io_parms.offset, rdata->actual_len, rc);
io_parms.offset,
subreq->len - subreq->transferred, rc);
}
async_readv_out:
@ -4880,6 +4884,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
server->credits, server->in_flight,
0, cifs_trace_rw_credits_write_response_clear);
wdata->credits.value = 0;
trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress);
cifs_write_subrequest_terminated(wdata, result ?: written, true);
release_mid(mid);
trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,


@ -6,6 +6,7 @@
*/
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/folio_queue.h>
#include "smbdirect.h"
#include "cifs_debug.h"
#include "cifsproto.h"
@ -2463,6 +2464,8 @@ static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter,
start = 0;
}
if (ret > 0)
iov_iter_advance(iter, ret);
return ret;
}
@ -2519,50 +2522,65 @@ static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter,
start = 0;
}
if (ret > 0)
iov_iter_advance(iter, ret);
return ret;
}
/*
* Extract folio fragments from an XARRAY-class iterator and add them to an
* RDMA list. The folios are not pinned.
* Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA
* list. The folios are not pinned.
*/
static ssize_t smb_extract_xarray_to_rdma(struct iov_iter *iter,
static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
struct smb_extract_to_rdma *rdma,
ssize_t maxsize)
{
struct xarray *xa = iter->xarray;
struct folio *folio;
loff_t start = iter->xarray_start + iter->iov_offset;
pgoff_t index = start / PAGE_SIZE;
const struct folio_queue *folioq = iter->folioq;
unsigned int slot = iter->folioq_slot;
ssize_t ret = 0;
size_t off, len;
XA_STATE(xas, xa, index);
size_t offset = iter->iov_offset;
rcu_read_lock();
BUG_ON(!folioq);
xas_for_each(&xas, folio, ULONG_MAX) {
if (xas_retry(&xas, folio))
continue;
if (WARN_ON(xa_is_value(folio)))
break;
if (WARN_ON(folio_test_hugetlb(folio)))
break;
off = offset_in_folio(folio, start);
len = min_t(size_t, maxsize, folio_size(folio) - off);
if (!smb_set_sge(rdma, folio_page(folio, 0), off, len)) {
rcu_read_unlock();
if (slot >= folioq_nr_slots(folioq)) {
folioq = folioq->next;
if (WARN_ON_ONCE(!folioq))
return -EIO;
slot = 0;
}
maxsize -= len;
ret += len;
if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
do {
struct folio *folio = folioq_folio(folioq, slot);
size_t fsize = folioq_folio_size(folioq, slot);
if (offset < fsize) {
size_t part = umin(maxsize - ret, fsize - offset);
if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
return -EIO;
offset += part;
ret += part;
}
if (offset >= fsize) {
offset = 0;
slot++;
if (slot >= folioq_nr_slots(folioq)) {
if (!folioq->next) {
WARN_ON_ONCE(ret < iter->count);
break;
}
folioq = folioq->next;
slot = 0;
}
}
} while (rdma->nr_sge < rdma->max_sge || maxsize > 0);
rcu_read_unlock();
iter->folioq = folioq;
iter->folioq_slot = slot;
iter->iov_offset = offset;
iter->count -= ret;
return ret;
}
@ -2590,17 +2608,15 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
case ITER_KVEC:
ret = smb_extract_kvec_to_rdma(iter, rdma, len);
break;
case ITER_XARRAY:
ret = smb_extract_xarray_to_rdma(iter, rdma, len);
case ITER_FOLIOQ:
ret = smb_extract_folioq_to_rdma(iter, rdma, len);
break;
default:
WARN_ON_ONCE(1);
return -EIO;
}
if (ret > 0) {
iov_iter_advance(iter, ret);
} else if (ret < 0) {
if (ret < 0) {
while (rdma->nr_sge > before) {
struct ib_sge *sge = &rdma->sge[rdma->nr_sge--];

include/linux/folio_queue.h (new file, 156 lines)

@ -0,0 +1,156 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Queue of folios definitions
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#ifndef _LINUX_FOLIO_QUEUE_H
#define _LINUX_FOLIO_QUEUE_H
#include <linux/pagevec.h>
/*
* Segment in a queue of running buffers. Each segment can hold a number of
* folios and a portion of the queue can be referenced with the ITER_FOLIOQ
* iterator. The possibility exists of inserting non-folio elements into the
* queue (such as gaps).
*
* Explicit prev and next pointers are used instead of a list_head to make it
* easier to add segments to tail and remove them from the head without the
* need for a lock.
*/
struct folio_queue {
struct folio_batch vec; /* Folios in the queue segment */
u8 orders[PAGEVEC_SIZE]; /* Order of each folio */
struct folio_queue *next; /* Next queue segment or NULL */
struct folio_queue *prev; /* Previous queue segment or NULL */
unsigned long marks; /* 1-bit mark per folio */
unsigned long marks2; /* Second 1-bit mark per folio */
unsigned long marks3; /* Third 1-bit mark per folio */
#if PAGEVEC_SIZE > BITS_PER_LONG
#error marks is not big enough
#endif
};
static inline void folioq_init(struct folio_queue *folioq)
{
folio_batch_init(&folioq->vec);
folioq->next = NULL;
folioq->prev = NULL;
folioq->marks = 0;
folioq->marks2 = 0;
folioq->marks3 = 0;
}
static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq)
{
return PAGEVEC_SIZE;
}
static inline unsigned int folioq_count(struct folio_queue *folioq)
{
return folio_batch_count(&folioq->vec);
}
static inline bool folioq_full(struct folio_queue *folioq)
{
//return !folio_batch_space(&folioq->vec);
return folioq_count(folioq) >= folioq_nr_slots(folioq);
}
static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot)
{
return test_bit(slot, &folioq->marks);
}
static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot)
{
set_bit(slot, &folioq->marks);
}
static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot)
{
clear_bit(slot, &folioq->marks);
}
static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot)
{
return test_bit(slot, &folioq->marks2);
}
static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot)
{
set_bit(slot, &folioq->marks2);
}
static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot)
{
clear_bit(slot, &folioq->marks2);
}
static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot)
{
return test_bit(slot, &folioq->marks3);
}
static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot)
{
set_bit(slot, &folioq->marks3);
}
static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot)
{
clear_bit(slot, &folioq->marks3);
}
static inline unsigned int __folio_order(struct folio *folio)
{
if (!folio_test_large(folio))
return 0;
return folio->_flags_1 & 0xff;
}
static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio)
{
unsigned int slot = folioq->vec.nr++;
folioq->vec.folios[slot] = folio;
folioq->orders[slot] = __folio_order(folio);
return slot;
}
static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio)
{
unsigned int slot = folioq->vec.nr++;
folioq->vec.folios[slot] = folio;
folioq->orders[slot] = __folio_order(folio);
folioq_mark(folioq, slot);
return slot;
}
static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot)
{
return folioq->vec.folios[slot];
}
static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot)
{
return folioq->orders[slot];
}
static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot)
{
return PAGE_SIZE << folioq_folio_order(folioq, slot);
}
static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot)
{
folioq->vec.folios[slot] = NULL;
folioq_unmark(folioq, slot);
folioq_unmark2(folioq, slot);
folioq_unmark3(folioq, slot);
}
#endif /* _LINUX_FOLIO_QUEUE_H */
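The header above is the complete API for the structure, so a short illustration may help tie the helpers together. The fragment below is not part of the patch; it is a minimal sketch showing how a caller might chain segments, append freshly allocated folios with the first mark set, and tear the chain down again. Only the folioq_*() helpers come from the header; build_folio_chain() and destroy_folio_chain() are invented names.

#include <linux/folio_queue.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/slab.h>

/* Illustrative sketch: chain folio_queue segments and fill them. */
static struct folio_queue *build_folio_chain(unsigned int nr_folios)
{
        struct folio_queue *head, *p;
        unsigned int i;

        head = kzalloc(sizeof(*head), GFP_KERNEL);
        if (!head)
                return NULL;
        folioq_init(head);

        p = head;
        for (i = 0; i < nr_folios; i++) {
                struct folio *folio = folio_alloc(GFP_KERNEL, 0);

                if (!folio)
                        break;
                if (folioq_full(p)) {
                        struct folio_queue *next = kzalloc(sizeof(*next), GFP_KERNEL);

                        if (!next) {
                                folio_put(folio);
                                break;
                        }
                        folioq_init(next);
                        next->prev = p;
                        p->next = next;
                        p = next;
                }
                /* Use the first mark to say "this folio needs putting later". */
                folioq_append_mark(p, folio);
        }
        return head;
}

/* Put every marked folio, then free the segments from the head onwards. */
static void destroy_folio_chain(struct folio_queue *folioq)
{
        while (folioq) {
                struct folio_queue *next = folioq->next;
                unsigned int slot;

                for (slot = 0; slot < folioq_count(folioq); slot++)
                        if (folioq_is_marked(folioq, slot))
                                folio_put(folioq_folio(folioq, slot));
                kfree(folioq);
                folioq = next;
        }
}

Adding at the tail and consuming from the head in this way is exactly why the structure uses explicit prev/next pointers rather than a list_head, as the comment at the top of the header notes.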


@ -10,6 +10,7 @@
#include <linux/uio.h>
#include <linux/bvec.h>
#include <linux/folio_queue.h>
typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
void *priv, void *priv2);
@ -140,6 +141,60 @@ size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
return progress;
}
/*
* Handle ITER_FOLIOQ.
*/
static __always_inline
size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2,
iov_step_f step)
{
const struct folio_queue *folioq = iter->folioq;
unsigned int slot = iter->folioq_slot;
size_t progress = 0, skip = iter->iov_offset;
if (slot == folioq_nr_slots(folioq)) {
/* The iterator may have been extended. */
folioq = folioq->next;
slot = 0;
}
do {
struct folio *folio = folioq_folio(folioq, slot);
size_t part, remain, consumed;
size_t fsize;
void *base;
if (!folio)
break;
fsize = folioq_folio_size(folioq, slot);
base = kmap_local_folio(folio, skip);
part = umin(len, PAGE_SIZE - skip % PAGE_SIZE);
remain = step(base, progress, part, priv, priv2);
kunmap_local(base);
consumed = part - remain;
len -= consumed;
progress += consumed;
skip += consumed;
if (skip >= fsize) {
skip = 0;
slot++;
if (slot == folioq_nr_slots(folioq) && folioq->next) {
folioq = folioq->next;
slot = 0;
}
}
if (remain)
break;
} while (len);
iter->folioq_slot = slot;
iter->folioq = folioq;
iter->iov_offset = skip;
iter->count -= progress;
return progress;
}
/*
* Handle ITER_XARRAY.
*/
@ -249,6 +304,8 @@ size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
return iterate_bvec(iter, len, priv, priv2, step);
if (iov_iter_is_kvec(iter))
return iterate_kvec(iter, len, priv, priv2, step);
if (iov_iter_is_folioq(iter))
return iterate_folioq(iter, len, priv, priv2, step);
if (iov_iter_is_xarray(iter))
return iterate_xarray(iter, len, priv, priv2, step);
return iterate_discard(iter, len, priv, priv2, step);
@ -271,4 +328,51 @@ size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
}
/**
* iterate_and_advance_kernel - Iterate over a kernel-internal iterator
* @iter: The iterator to iterate over.
* @len: The amount to iterate over.
* @priv: Data for the step functions.
* @priv2: More data for the step functions.
* @step: Function for other iterators; given kernel addresses.
*
* Iterate over the next part of an iterator, up to the specified length. The
* buffer is presented in segments, which for kernel iteration are broken up by
* physical pages and mapped, with the mapped address being presented.
*
* [!] Note: This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type
* iterators; it will not handle UBUF or IOVEC-type iterators.
*
* A step function, @step, must be provided. It is given mapped kernel
* addresses only; no user addresses are handled and no pinning is performed.
*
* The step function is passed the address and length of the segment, @priv,
* @priv2 and the amount of data so far iterated over (which can, for example,
* be added to @priv to point to the right part of a second buffer). It should
* return the amount of the segment it didn't process (ie. 0 indicates
* complete processing).
*
* This function returns the amount of data processed (ie. 0 means nothing was
* processed and a return of @len means it was processed to completion).
*/
static __always_inline
size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv,
void *priv2, iov_step_f step)
{
if (unlikely(iter->count < len))
len = iter->count;
if (unlikely(!len))
return 0;
if (iov_iter_is_bvec(iter))
return iterate_bvec(iter, len, priv, priv2, step);
if (iov_iter_is_kvec(iter))
return iterate_kvec(iter, len, priv, priv2, step);
if (iov_iter_is_folioq(iter))
return iterate_folioq(iter, len, priv, priv2, step);
if (iov_iter_is_xarray(iter))
return iterate_xarray(iter, len, priv, priv2, step);
return iterate_discard(iter, len, priv, priv2, step);
}
#endif /* _LINUX_IOV_ITER_H */
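To make the step-function contract above concrete, here is an illustrative fragment that is not part of the patch. It assumes the caller already holds a kernel-internal iterator (BVEC, KVEC, FOLIOQ or XARRAY, per the note in the kerneldoc) and a destination buffer of at least @len bytes; copy_step() and flatten_iter() are invented names.

#include <linux/iov_iter.h>
#include <linux/string.h>
#include <linux/uio.h>

/*
 * Sketch of a step function: @iter_base is the mapped kernel address of the
 * current segment, @progress is how much has already been handled, @len is
 * the segment length and @priv points at the destination buffer.  Returning
 * 0 tells the iterator the whole segment was consumed.
 */
static size_t copy_step(void *iter_base, size_t progress, size_t len,
                        void *priv, void *priv2)
{
        memcpy(priv + progress, iter_base, len);
        return 0;
}

/* Flatten up to @len bytes from a kernel iterator into @buf. */
static size_t flatten_iter(struct iov_iter *iter, void *buf, size_t len)
{
        return iterate_and_advance_kernel(iter, len, buf, NULL, copy_step);
}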


@ -38,11 +38,8 @@ static inline void folio_start_private_2(struct folio *folio)
folio_set_private_2(folio);
}
/* Marks used on xarray-based buffers */
#define NETFS_BUF_PUT_MARK XA_MARK_0 /* - Page needs putting */
#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */
enum netfs_io_source {
NETFS_SOURCE_UNKNOWN,
NETFS_FILL_WITH_ZEROES,
NETFS_DOWNLOAD_FROM_SERVER,
NETFS_READ_FROM_CACHE,
@ -73,6 +70,7 @@ struct netfs_inode {
#define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */
#define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */
#define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */
#define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */
};
/*
@ -133,9 +131,11 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio)
struct netfs_io_stream {
/* Submission tracking */
struct netfs_io_subrequest *construct; /* Op being constructed */
size_t sreq_max_len; /* Maximum size of a subrequest */
unsigned int sreq_max_segs; /* 0 or max number of segments in an iterator */
unsigned int submit_off; /* Folio offset we're submitting from */
unsigned int submit_len; /* Amount of data left to submit */
unsigned int submit_max_len; /* Amount I/O can be rounded up to */
unsigned int submit_extendable_to; /* Amount I/O can be rounded up to */
void (*prepare_write)(struct netfs_io_subrequest *subreq);
void (*issue_write)(struct netfs_io_subrequest *subreq);
/* Collection tracking */
@ -176,41 +176,45 @@ struct netfs_io_subrequest {
struct list_head rreq_link; /* Link in rreq->subrequests */
struct iov_iter io_iter; /* Iterator for this subrequest */
unsigned long long start; /* Where to start the I/O */
size_t max_len; /* Maximum size of the I/O */
size_t len; /* Size of the I/O */
size_t transferred; /* Amount of data transferred */
size_t consumed; /* Amount of read data consumed */
size_t prev_donated; /* Amount of data donated from previous subreq */
size_t next_donated; /* Amount of data donated from next subreq */
refcount_t ref;
short error; /* 0 or error that occurred */
unsigned short debug_index; /* Index in list (for debugging output) */
unsigned int nr_segs; /* Number of segs in io_iter */
unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */
enum netfs_io_source source; /* Where to read from/write to */
unsigned char stream_nr; /* I/O stream this belongs to */
unsigned char curr_folioq_slot; /* Folio currently being read */
unsigned char curr_folio_order; /* Order of folio */
struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */
unsigned long flags;
#define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */
#define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */
#define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */
#define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */
#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */
#define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */
#define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */
#define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */
#define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */
#define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */
#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */
#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */
#define NETFS_SREQ_HIT_EOF 12 /* Set if we hit the EOF */
};
enum netfs_io_origin {
NETFS_READAHEAD, /* This read was triggered by readahead */
NETFS_READPAGE, /* This read is a synchronous read */
NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */
NETFS_READ_FOR_WRITE, /* This read is to prepare a write */
NETFS_COPY_TO_CACHE, /* This write is to copy a read to the cache */
NETFS_DIO_READ, /* This is a direct I/O read */
NETFS_WRITEBACK, /* This write was triggered by writepages */
NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */
NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */
NETFS_DIO_READ, /* This is a direct I/O read */
NETFS_DIO_WRITE, /* This is a direct I/O write */
NETFS_PGPRIV2_COPY_TO_CACHE, /* [DEPRECATED] This is writing read data to the cache */
nr__netfs_io_origin
} __mode(byte);
@ -227,11 +231,14 @@ struct netfs_io_request {
struct address_space *mapping; /* The mapping being accessed */
struct kiocb *iocb; /* AIO completion vector */
struct netfs_cache_resources cache_resources;
struct readahead_control *ractl; /* Readahead descriptor */
struct list_head proc_link; /* Link in netfs_iorequests */
struct list_head subrequests; /* Contributory I/O operations */
struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */
#define NR_IO_STREAMS 2 //wreq->nr_io_streams
struct netfs_group *group; /* Writeback group being written back */
struct folio_queue *buffer; /* Head of I/O buffer */
struct folio_queue *buffer_tail; /* Tail of I/O buffer */
struct iov_iter iter; /* Unencrypted-side iterator */
struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */
void *netfs_priv; /* Private data for the netfs */
@ -245,24 +252,23 @@ struct netfs_io_request {
unsigned int nr_group_rel; /* Number of refs to release on ->group */
spinlock_t lock; /* Lock for queuing subreqs */
atomic_t nr_outstanding; /* Number of ops in progress */
atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */
size_t upper_len; /* Length can be extended to here */
unsigned long long submitted; /* Amount submitted for I/O so far */
unsigned long long len; /* Length of the request */
size_t transferred; /* Amount to be indicated as transferred */
short error; /* 0 or error that occurred */
long error; /* 0 or error that occurred */
enum netfs_io_origin origin; /* Origin of the request */
bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */
u8 buffer_head_slot; /* First slot in ->buffer */
u8 buffer_tail_slot; /* Next slot in ->buffer_tail */
unsigned long long i_size; /* Size of the file */
unsigned long long start; /* Start position */
atomic64_t issued_to; /* Write issuer folio cursor */
unsigned long long contiguity; /* Tracking for gaps in the writeback sequence */
unsigned long long collected_to; /* Point we've collected to */
unsigned long long cleaned_to; /* Position we've cleaned folios to */
pgoff_t no_unlock_folio; /* Don't unlock this folio after read */
size_t prev_donated; /* Fallback for subreq->prev_donated */
refcount_t ref;
unsigned long flags;
#define NETFS_RREQ_INCOMPLETE_IO 0 /* Some ioreqs terminated short or with error */
#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */
#define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */
#define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */
@ -274,6 +280,7 @@ struct netfs_io_request {
#define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */
#define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */
#define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */
#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */
#define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark
* write to cache on read */
const struct netfs_request_ops *netfs_ops;
@ -292,7 +299,7 @@ struct netfs_request_ops {
/* Read request handling */
void (*expand_readahead)(struct netfs_io_request *rreq);
bool (*clamp_length)(struct netfs_io_subrequest *subreq);
int (*prepare_read)(struct netfs_io_subrequest *subreq);
void (*issue_read)(struct netfs_io_subrequest *subreq);
bool (*is_still_valid)(struct netfs_io_request *rreq);
int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
@ -422,7 +429,10 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp);
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
/* (Sub)request management API. */
void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
bool was_async);
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
int error, bool was_async);
void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
enum netfs_sreq_ref_trace what);
void netfs_put_subrequest(struct netfs_io_subrequest *subreq,


@ -11,6 +11,7 @@
#include <uapi/linux/uio.h>
struct page;
struct folio_queue;
typedef unsigned int __bitwise iov_iter_extraction_t;
@ -25,6 +26,7 @@ enum iter_type {
ITER_IOVEC,
ITER_BVEC,
ITER_KVEC,
ITER_FOLIOQ,
ITER_XARRAY,
ITER_DISCARD,
};
@ -66,6 +68,7 @@ struct iov_iter {
const struct iovec *__iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
const struct folio_queue *folioq;
struct xarray *xarray;
void __user *ubuf;
};
@ -74,6 +77,7 @@ struct iov_iter {
};
union {
unsigned long nr_segs;
u8 folioq_slot;
loff_t xarray_start;
};
};
@ -126,6 +130,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i)
return iov_iter_type(i) == ITER_DISCARD;
}
static inline bool iov_iter_is_folioq(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_FOLIOQ;
}
static inline bool iov_iter_is_xarray(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_XARRAY;
@ -180,6 +189,12 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
return copy_page_to_iter(&folio->page, offset, bytes, i);
}
static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset,
size_t bytes, struct iov_iter *i)
{
return copy_page_from_iter(&folio->page, offset, bytes, i);
}
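/*
 * Not part of the patch: an illustrative use of the new wrapper above.  Given
 * a folio and a source iterator (for instance a UBUF iterator over a user
 * buffer), copy data into the start of the folio; a short return on a
 * user-backed iterator indicates a fault.  fill_folio_from_iter() is an
 * invented name.
 */
static inline size_t fill_folio_from_iter(struct folio *folio,
                                          struct iov_iter *from, size_t len)
{
        len = umin(len, folio_size(folio));
        return copy_folio_from_iter(folio, 0, len, from);
}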
static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
size_t offset, size_t bytes, struct iov_iter *i)
{
@ -273,6 +288,9 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
unsigned long nr_segs, size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
const struct folio_queue *folioq,
unsigned int first_slot, unsigned int offset, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
loff_t start, size_t count);
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,


@ -20,6 +20,7 @@
EM(netfs_read_trace_expanded, "EXPANDED ") \
EM(netfs_read_trace_readahead, "READAHEAD") \
EM(netfs_read_trace_readpage, "READPAGE ") \
EM(netfs_read_trace_read_gaps, "READ-GAPS") \
EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \
E_(netfs_read_trace_write_begin, "WRITEBEGN")
@ -33,13 +34,14 @@
#define netfs_rreq_origins \
EM(NETFS_READAHEAD, "RA") \
EM(NETFS_READPAGE, "RP") \
EM(NETFS_READ_GAPS, "RG") \
EM(NETFS_READ_FOR_WRITE, "RW") \
EM(NETFS_COPY_TO_CACHE, "CC") \
EM(NETFS_DIO_READ, "DR") \
EM(NETFS_WRITEBACK, "WB") \
EM(NETFS_WRITETHROUGH, "WT") \
EM(NETFS_UNBUFFERED_WRITE, "UW") \
EM(NETFS_DIO_READ, "DR") \
E_(NETFS_DIO_WRITE, "DW")
EM(NETFS_DIO_WRITE, "DW") \
E_(NETFS_PGPRIV2_COPY_TO_CACHE, "2C")
#define netfs_rreq_traces \
EM(netfs_rreq_trace_assess, "ASSESS ") \
@ -60,6 +62,7 @@
E_(netfs_rreq_trace_write_done, "WR-DONE")
#define netfs_sreq_sources \
EM(NETFS_SOURCE_UNKNOWN, "----") \
EM(NETFS_FILL_WITH_ZEROES, "ZERO") \
EM(NETFS_DOWNLOAD_FROM_SERVER, "DOWN") \
EM(NETFS_READ_FROM_CACHE, "READ") \
@ -69,15 +72,25 @@
E_(NETFS_INVALID_WRITE, "INVL")
#define netfs_sreq_traces \
EM(netfs_sreq_trace_add_donations, "+DON ") \
EM(netfs_sreq_trace_added, "ADD ") \
EM(netfs_sreq_trace_clear, "CLEAR") \
EM(netfs_sreq_trace_discard, "DSCRD") \
EM(netfs_sreq_trace_donate_to_prev, "DON-P") \
EM(netfs_sreq_trace_donate_to_next, "DON-N") \
EM(netfs_sreq_trace_download_instead, "RDOWN") \
EM(netfs_sreq_trace_fail, "FAIL ") \
EM(netfs_sreq_trace_free, "FREE ") \
EM(netfs_sreq_trace_hit_eof, "EOF ") \
EM(netfs_sreq_trace_io_progress, "IO ") \
EM(netfs_sreq_trace_limited, "LIMIT") \
EM(netfs_sreq_trace_prepare, "PREP ") \
EM(netfs_sreq_trace_prep_failed, "PRPFL") \
EM(netfs_sreq_trace_resubmit_short, "SHORT") \
EM(netfs_sreq_trace_progress, "PRGRS") \
EM(netfs_sreq_trace_reprep_failed, "REPFL") \
EM(netfs_sreq_trace_retry, "RETRY") \
EM(netfs_sreq_trace_short, "SHORT") \
EM(netfs_sreq_trace_split, "SPLIT") \
EM(netfs_sreq_trace_submit, "SUBMT") \
EM(netfs_sreq_trace_terminated, "TERM ") \
EM(netfs_sreq_trace_write, "WRITE") \
@ -118,7 +131,7 @@
EM(netfs_sreq_trace_new, "NEW ") \
EM(netfs_sreq_trace_put_cancel, "PUT CANCEL ") \
EM(netfs_sreq_trace_put_clear, "PUT CLEAR ") \
EM(netfs_sreq_trace_put_discard, "PUT DISCARD") \
EM(netfs_sreq_trace_put_consumed, "PUT CONSUME") \
EM(netfs_sreq_trace_put_done, "PUT DONE ") \
EM(netfs_sreq_trace_put_failed, "PUT FAILED ") \
EM(netfs_sreq_trace_put_merged, "PUT MERGED ") \
@ -129,7 +142,6 @@
E_(netfs_sreq_trace_put_terminated, "PUT TERM ")
#define netfs_folio_traces \
/* The first few correspond to enum netfs_how_to_modify */ \
EM(netfs_folio_is_uptodate, "mod-uptodate") \
EM(netfs_just_prefetch, "mod-prefetch") \
EM(netfs_whole_folio_modify, "mod-whole-f") \
@ -139,8 +151,9 @@
EM(netfs_flush_content, "flush") \
EM(netfs_streaming_filled_page, "mod-streamw-f") \
EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \
/* The rest are for writeback */ \
EM(netfs_folio_trace_abandon, "abandon") \
EM(netfs_folio_trace_cancel_copy, "cancel-copy") \
EM(netfs_folio_trace_cancel_store, "cancel-store") \
EM(netfs_folio_trace_clear, "clear") \
EM(netfs_folio_trace_clear_cc, "clear-cc") \
EM(netfs_folio_trace_clear_g, "clear-g") \
@ -155,7 +168,12 @@
EM(netfs_folio_trace_mkwrite, "mkwrite") \
EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \
EM(netfs_folio_trace_not_under_wback, "!wback") \
EM(netfs_folio_trace_put, "put") \
EM(netfs_folio_trace_read, "read") \
EM(netfs_folio_trace_read_done, "read-done") \
EM(netfs_folio_trace_read_gaps, "read-gaps") \
EM(netfs_folio_trace_read_put, "read-put") \
EM(netfs_folio_trace_read_unlock, "read-unlock") \
EM(netfs_folio_trace_redirtied, "redirtied") \
EM(netfs_folio_trace_store, "store") \
EM(netfs_folio_trace_store_copy, "store-copy") \
@ -168,6 +186,12 @@
EM(netfs_contig_trace_jump, "-->JUMP-->") \
E_(netfs_contig_trace_unlock, "Unlock")
#define netfs_donate_traces \
EM(netfs_trace_donate_tail_to_prev, "tail-to-prev") \
EM(netfs_trace_donate_to_prev, "to-prev") \
EM(netfs_trace_donate_to_next, "to-next") \
E_(netfs_trace_donate_to_deferred_next, "defer-next")
#ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
#define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
@ -185,6 +209,7 @@ enum netfs_rreq_ref_trace { netfs_rreq_ref_traces } __mode(byte);
enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte);
enum netfs_folio_trace { netfs_folio_traces } __mode(byte);
enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte);
enum netfs_donate_trace { netfs_donate_traces } __mode(byte);
#endif
@ -207,6 +232,7 @@ netfs_rreq_ref_traces;
netfs_sreq_ref_traces;
netfs_folio_traces;
netfs_collect_contig_traces;
netfs_donate_traces;
/*
* Now redefine the EM() and E_() macros to map the enums to the strings that
@ -227,6 +253,7 @@ TRACE_EVENT(netfs_read,
TP_STRUCT__entry(
__field(unsigned int, rreq )
__field(unsigned int, cookie )
__field(loff_t, i_size )
__field(loff_t, start )
__field(size_t, len )
__field(enum netfs_read_trace, what )
@ -236,18 +263,19 @@ TRACE_EVENT(netfs_read,
TP_fast_assign(
__entry->rreq = rreq->debug_id;
__entry->cookie = rreq->cache_resources.debug_id;
__entry->i_size = rreq->i_size;
__entry->start = start;
__entry->len = len;
__entry->what = what;
__entry->netfs_inode = rreq->inode->i_ino;
),
TP_printk("R=%08x %s c=%08x ni=%x s=%llx %zx",
TP_printk("R=%08x %s c=%08x ni=%x s=%llx l=%zx sz=%llx",
__entry->rreq,
__print_symbolic(__entry->what, netfs_read_traces),
__entry->cookie,
__entry->netfs_inode,
__entry->start, __entry->len)
__entry->start, __entry->len, __entry->i_size)
);
TRACE_EVENT(netfs_rreq,
@ -513,33 +541,6 @@ TRACE_EVENT(netfs_collect,
__entry->start + __entry->len)
);
TRACE_EVENT(netfs_collect_contig,
TP_PROTO(const struct netfs_io_request *wreq, unsigned long long to,
enum netfs_collect_contig_trace type),
TP_ARGS(wreq, to, type),
TP_STRUCT__entry(
__field(unsigned int, wreq)
__field(enum netfs_collect_contig_trace, type)
__field(unsigned long long, contiguity)
__field(unsigned long long, to)
),
TP_fast_assign(
__entry->wreq = wreq->debug_id;
__entry->type = type;
__entry->contiguity = wreq->contiguity;
__entry->to = to;
),
TP_printk("R=%08x %llx -> %llx %s",
__entry->wreq,
__entry->contiguity,
__entry->to,
__print_symbolic(__entry->type, netfs_collect_contig_traces))
);
TRACE_EVENT(netfs_collect_sreq,
TP_PROTO(const struct netfs_io_request *wreq,
const struct netfs_io_subrequest *subreq),
@ -611,7 +612,6 @@ TRACE_EVENT(netfs_collect_state,
__field(unsigned int, notes )
__field(unsigned long long, collected_to )
__field(unsigned long long, cleaned_to )
__field(unsigned long long, contiguity )
),
TP_fast_assign(
@ -619,12 +619,11 @@ TRACE_EVENT(netfs_collect_state,
__entry->notes = notes;
__entry->collected_to = collected_to;
__entry->cleaned_to = wreq->cleaned_to;
__entry->contiguity = wreq->contiguity;
),
TP_printk("R=%08x cto=%llx fto=%llx ctg=%llx n=%x",
TP_printk("R=%08x col=%llx cln=%llx n=%x",
__entry->wreq, __entry->collected_to,
__entry->cleaned_to, __entry->contiguity,
__entry->cleaned_to,
__entry->notes)
);
@ -681,6 +680,71 @@ TRACE_EVENT(netfs_collect_stream,
__entry->collected_to, __entry->front)
);
TRACE_EVENT(netfs_progress,
TP_PROTO(const struct netfs_io_subrequest *subreq,
unsigned long long start, size_t avail, size_t part),
TP_ARGS(subreq, start, avail, part),
TP_STRUCT__entry(
__field(unsigned int, rreq)
__field(unsigned int, subreq)
__field(unsigned int, consumed)
__field(unsigned int, transferred)
__field(unsigned long long, f_start)
__field(unsigned int, f_avail)
__field(unsigned int, f_part)
__field(unsigned char, slot)
),
TP_fast_assign(
__entry->rreq = subreq->rreq->debug_id;
__entry->subreq = subreq->debug_index;
__entry->consumed = subreq->consumed;
__entry->transferred = subreq->transferred;
__entry->f_start = start;
__entry->f_avail = avail;
__entry->f_part = part;
__entry->slot = subreq->curr_folioq_slot;
),
TP_printk("R=%08x[%02x] s=%llx ct=%x/%x pa=%x/%x sl=%x",
__entry->rreq, __entry->subreq, __entry->f_start,
__entry->consumed, __entry->transferred,
__entry->f_part, __entry->f_avail, __entry->slot)
);
TRACE_EVENT(netfs_donate,
TP_PROTO(const struct netfs_io_request *rreq,
const struct netfs_io_subrequest *from,
const struct netfs_io_subrequest *to,
size_t amount,
enum netfs_donate_trace trace),
TP_ARGS(rreq, from, to, amount, trace),
TP_STRUCT__entry(
__field(unsigned int, rreq)
__field(unsigned int, from)
__field(unsigned int, to)
__field(unsigned int, amount)
__field(enum netfs_donate_trace, trace)
),
TP_fast_assign(
__entry->rreq = rreq->debug_id;
__entry->from = from->debug_index;
__entry->to = to ? to->debug_index : -1;
__entry->amount = amount;
__entry->trace = trace;
),
TP_printk("R=%08x[%02x] -> [%02x] %s am=%x",
__entry->rreq, __entry->from, __entry->to,
__print_symbolic(__entry->trace, netfs_donate_traces),
__entry->amount)
);
#undef EM
#undef E_
#endif /* _TRACE_NETFS_H */


@ -527,6 +527,39 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
i->__iov = iov;
}
static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
{
const struct folio_queue *folioq = i->folioq;
unsigned int slot = i->folioq_slot;
if (!i->count)
return;
i->count -= size;
if (slot >= folioq_nr_slots(folioq)) {
folioq = folioq->next;
slot = 0;
}
size += i->iov_offset; /* From beginning of current segment. */
do {
size_t fsize = folioq_folio_size(folioq, slot);
if (likely(size < fsize))
break;
size -= fsize;
slot++;
if (slot >= folioq_nr_slots(folioq) && folioq->next) {
folioq = folioq->next;
slot = 0;
}
} while (size);
i->iov_offset = size;
i->folioq_slot = slot;
i->folioq = folioq;
}
void iov_iter_advance(struct iov_iter *i, size_t size)
{
if (unlikely(i->count < size))
@ -539,12 +572,40 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
iov_iter_iovec_advance(i, size);
} else if (iov_iter_is_bvec(i)) {
iov_iter_bvec_advance(i, size);
} else if (iov_iter_is_folioq(i)) {
iov_iter_folioq_advance(i, size);
} else if (iov_iter_is_discard(i)) {
i->count -= size;
}
}
EXPORT_SYMBOL(iov_iter_advance);
static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll)
{
const struct folio_queue *folioq = i->folioq;
unsigned int slot = i->folioq_slot;
for (;;) {
size_t fsize;
if (slot == 0) {
folioq = folioq->prev;
slot = folioq_nr_slots(folioq);
}
slot--;
fsize = folioq_folio_size(folioq, slot);
if (unroll <= fsize) {
i->iov_offset = fsize - unroll;
break;
}
unroll -= fsize;
}
i->folioq_slot = slot;
i->folioq = folioq;
}
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
if (!unroll)
@ -576,6 +637,9 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
}
unroll -= n;
}
} else if (iov_iter_is_folioq(i)) {
i->iov_offset = 0;
iov_iter_folioq_revert(i, unroll);
} else { /* same logics for iovec and kvec */
const struct iovec *iov = iter_iov(i);
while (1) {
@ -603,6 +667,9 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
if (iov_iter_is_bvec(i))
return min(i->count, i->bvec->bv_len - i->iov_offset);
}
if (unlikely(iov_iter_is_folioq(i)))
return !i->count ? 0 :
umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count);
return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);
@ -639,6 +706,36 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
}
EXPORT_SYMBOL(iov_iter_bvec);
/**
* iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue
* @i: The iterator to initialise.
* @direction: The direction of the transfer.
* @folioq: The starting point in the folio queue.
* @first_slot: The first slot in the folio queue to use
* @offset: The offset into the folio in the first slot to start at
* @count: The size of the I/O buffer in bytes.
*
* Set up an I/O iterator to either draw data out of the folios in the queue
* or to inject data into them. The folios *must* be prevented from
* disappearing while the iterator is in use, either by the caller taking a
* ref on them or by locking them.
*/
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
const struct folio_queue *folioq, unsigned int first_slot,
unsigned int offset, size_t count)
{
BUG_ON(direction & ~1);
*i = (struct iov_iter) {
.iter_type = ITER_FOLIOQ,
.data_source = direction,
.folioq = folioq,
.folioq_slot = first_slot,
.count = count,
.iov_offset = offset,
};
}
EXPORT_SYMBOL(iov_iter_folio_queue);
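/*
 * Not part of the patch: an illustrative caller of the setter above.  It
 * assumes a populated folio_queue chain already exists (see the folio_queue
 * sketch earlier) and drains the first @len bytes of it into a kernel buffer.
 * ITER_SOURCE marks the queue as the data source, just as the cifs decryption
 * path above uses ITER_DEST when filling its buffer.  drain_folioq_to_buf()
 * is an invented name.
 */
static size_t drain_folioq_to_buf(const struct folio_queue *folioq,
                                  void *buf, size_t len)
{
        struct iov_iter iter;

        iov_iter_folio_queue(&iter, ITER_SOURCE, folioq, 0, 0, len);

        /* On return, iter.folioq, iter.folioq_slot and iter.iov_offset have
         * advanced past whatever was copied. */
        return copy_from_iter(buf, len, &iter);
}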
/**
* iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
* @i: The iterator to initialise.
@ -765,12 +862,19 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
if (iov_iter_is_bvec(i))
return iov_iter_aligned_bvec(i, addr_mask, len_mask);
/* With both xarray and folioq types, we're dealing with whole folios. */
if (iov_iter_is_xarray(i)) {
if (i->count & len_mask)
return false;
if ((i->xarray_start + i->iov_offset) & addr_mask)
return false;
}
if (iov_iter_is_folioq(i)) {
if (i->count & len_mask)
return false;
if (i->iov_offset & addr_mask)
return false;
}
return true;
}
@ -835,6 +939,9 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
if (iov_iter_is_bvec(i))
return iov_iter_alignment_bvec(i);
/* With both xarray and folioq types, we're dealing with whole folios. */
if (iov_iter_is_folioq(i))
return i->iov_offset | i->count;
if (iov_iter_is_xarray(i))
return (i->xarray_start + i->iov_offset) | i->count;
@ -887,6 +994,62 @@ static int want_pages_array(struct page ***res, size_t size,
return count;
}
static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
struct page ***ppages, size_t maxsize,
unsigned maxpages, size_t *_start_offset)
{
const struct folio_queue *folioq = iter->folioq;
struct page **pages;
unsigned int slot = iter->folioq_slot;
size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset;
if (slot >= folioq_nr_slots(folioq)) {
folioq = folioq->next;
slot = 0;
if (WARN_ON(iov_offset != 0))
return -EIO;
}
maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages);
if (!maxpages)
return -ENOMEM;
*_start_offset = iov_offset & ~PAGE_MASK;
pages = *ppages;
for (;;) {
struct folio *folio = folioq_folio(folioq, slot);
size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot);
size_t part = PAGE_SIZE - offset % PAGE_SIZE;
part = umin(part, umin(maxsize - extracted, fsize - offset));
count -= part;
iov_offset += part;
extracted += part;
*pages = folio_page(folio, offset / PAGE_SIZE);
get_page(*pages);
pages++;
maxpages--;
if (maxpages == 0 || extracted >= maxsize)
break;
if (offset >= fsize) {
iov_offset = 0;
slot++;
if (slot == folioq_nr_slots(folioq) && folioq->next) {
folioq = folioq->next;
slot = 0;
}
}
}
iter->count = count;
iter->iov_offset = iov_offset;
iter->folioq = folioq;
iter->folioq_slot = slot;
return extracted;
}
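/*
 * Not part of the patch: a sketch of how the public entry point that reaches
 * the helper above might be used.  iov_iter_get_pages2() dispatches to
 * iter_folioq_get_pages() for ITER_FOLIOQ iterators; every page it hands back
 * has had get_page() called on it (see above), so the caller must drop those
 * references.  grab_folioq_pages() is an invented name.
 */
static ssize_t grab_folioq_pages(struct iov_iter *iter)
{
        struct page *pages[16];
        size_t offset;
        ssize_t bytes;
        int i, npages;

        bytes = iov_iter_get_pages2(iter, pages, 16 * PAGE_SIZE, 16, &offset);
        if (bytes <= 0)
                return bytes;

        npages = DIV_ROUND_UP(offset + bytes, PAGE_SIZE);

        /* ... use the pages: the data starts @offset into pages[0] ... */

        for (i = 0; i < npages; i++)
                put_page(pages[i]);
        return bytes;
}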
static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
pgoff_t index, unsigned int nr_pages)
{
@ -1034,6 +1197,8 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
}
return maxsize;
}
if (iov_iter_is_folioq(i))
return iter_folioq_get_pages(i, pages, maxsize, maxpages, start);
if (iov_iter_is_xarray(i))
return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
return -EFAULT;
@ -1118,6 +1283,11 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
return iov_npages(i, maxpages);
if (iov_iter_is_bvec(i))
return bvec_npages(i, maxpages);
if (iov_iter_is_folioq(i)) {
unsigned offset = i->iov_offset % PAGE_SIZE;
int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
return min(npages, maxpages);
}
if (iov_iter_is_xarray(i)) {
unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
@ -1398,6 +1568,68 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
i->nr_segs = state->nr_segs;
}
/*
* Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does
* not get references on the pages, nor does it get a pin on them.
*/
static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i,
struct page ***pages, size_t maxsize,
unsigned int maxpages,
iov_iter_extraction_t extraction_flags,
size_t *offset0)
{
const struct folio_queue *folioq = i->folioq;
struct page **p;
unsigned int nr = 0;
size_t extracted = 0, offset, slot = i->folioq_slot;
if (slot >= folioq_nr_slots(folioq)) {
folioq = folioq->next;
slot = 0;
if (WARN_ON(i->iov_offset != 0))
return -EIO;
}
offset = i->iov_offset & ~PAGE_MASK;
*offset0 = offset;
maxpages = want_pages_array(pages, maxsize, offset, maxpages);
if (!maxpages)
return -ENOMEM;
p = *pages;
for (;;) {
struct folio *folio = folioq_folio(folioq, slot);
size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot);
size_t part = PAGE_SIZE - offset % PAGE_SIZE;
if (offset < fsize) {
part = umin(part, umin(maxsize - extracted, fsize - offset));
i->count -= part;
i->iov_offset += part;
extracted += part;
p[nr++] = folio_page(folio, offset / PAGE_SIZE);
}
if (nr >= maxpages || extracted >= maxsize)
break;
if (i->iov_offset >= fsize) {
i->iov_offset = 0;
slot++;
if (slot == folioq_nr_slots(folioq) && folioq->next) {
folioq = folioq->next;
slot = 0;
}
}
}
i->folioq = folioq;
i->folioq_slot = slot;
return extracted;
}
/*
* Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not
* get references on the pages, nor does it get a pin on them.
@ -1618,8 +1850,8 @@ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
* added to the pages, but refs will not be taken.
* iov_iter_extract_will_pin() will return true.
*
* (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are
* merely listed; no extra refs or pins are obtained.
* (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the
* pages are merely listed; no extra refs or pins are obtained.
* iov_iter_extract_will_pin() will return 0.
*
* Note also:
@ -1654,6 +1886,10 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
return iov_iter_extract_bvec_pages(i, pages, maxsize,
maxpages, extraction_flags,
offset0);
if (iov_iter_is_folioq(i))
return iov_iter_extract_folioq_pages(i, pages, maxsize,
maxpages, extraction_flags,
offset0);
if (iov_iter_is_xarray(i))
return iov_iter_extract_xarray_pages(i, pages, maxsize,
maxpages, extraction_flags,


@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/uio.h>
#include <linux/bvec.h>
#include <linux/folio_queue.h>
#include <kunit/test.h>
MODULE_DESCRIPTION("iov_iter testing");
@ -62,6 +63,9 @@ static void *__init iov_kunit_create_buffer(struct kunit *test,
KUNIT_ASSERT_EQ(test, got, npages);
}
for (int i = 0; i < npages; i++)
pages[i]->index = i;
buffer = vmap(pages, npages, VM_MAP | VM_MAP_PUT_PAGES, PAGE_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buffer);
@ -362,6 +366,179 @@ stop:
KUNIT_SUCCEED(test);
}
static void iov_kunit_destroy_folioq(void *data)
{
struct folio_queue *folioq, *next;
for (folioq = data; folioq; folioq = next) {
next = folioq->next;
for (int i = 0; i < folioq_nr_slots(folioq); i++)
if (folioq_folio(folioq, i))
folio_put(folioq_folio(folioq, i));
kfree(folioq);
}
}
static void __init iov_kunit_load_folioq(struct kunit *test,
struct iov_iter *iter, int dir,
struct folio_queue *folioq,
struct page **pages, size_t npages)
{
struct folio_queue *p = folioq;
size_t size = 0;
int i;
for (i = 0; i < npages; i++) {
if (folioq_full(p)) {
p->next = kzalloc(sizeof(struct folio_queue), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p->next);
folioq_init(p->next);
p->next->prev = p;
p = p->next;
}
folioq_append(p, page_folio(pages[i]));
size += PAGE_SIZE;
}
iov_iter_folio_queue(iter, dir, folioq, 0, 0, size);
}
static struct folio_queue *iov_kunit_create_folioq(struct kunit *test)
{
struct folio_queue *folioq;
folioq = kzalloc(sizeof(struct folio_queue), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, folioq);
kunit_add_action_or_reset(test, iov_kunit_destroy_folioq, folioq);
folioq_init(folioq);
return folioq;
}
/*
* Test copying to an ITER_FOLIOQ-type iterator.
*/
static void __init iov_kunit_copy_to_folioq(struct kunit *test)
{
const struct kvec_test_range *pr;
struct iov_iter iter;
struct folio_queue *folioq;
struct page **spages, **bpages;
u8 *scratch, *buffer;
size_t bufsize, npages, size, copied;
int i, patt;
bufsize = 0x100000;
npages = bufsize / PAGE_SIZE;
folioq = iov_kunit_create_folioq(test);
scratch = iov_kunit_create_buffer(test, &spages, npages);
for (i = 0; i < bufsize; i++)
scratch[i] = pattern(i);
buffer = iov_kunit_create_buffer(test, &bpages, npages);
memset(buffer, 0, bufsize);
iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages);
i = 0;
for (pr = kvec_test_ranges; pr->from >= 0; pr++) {
size = pr->to - pr->from;
KUNIT_ASSERT_LE(test, pr->to, bufsize);
iov_iter_folio_queue(&iter, READ, folioq, 0, 0, pr->to);
iov_iter_advance(&iter, pr->from);
copied = copy_to_iter(scratch + i, size, &iter);
KUNIT_EXPECT_EQ(test, copied, size);
KUNIT_EXPECT_EQ(test, iter.count, 0);
KUNIT_EXPECT_EQ(test, iter.iov_offset, pr->to % PAGE_SIZE);
i += size;
if (test->status == KUNIT_FAILURE)
goto stop;
}
/* Build the expected image in the scratch buffer. */
patt = 0;
memset(scratch, 0, bufsize);
for (pr = kvec_test_ranges; pr->from >= 0; pr++)
for (i = pr->from; i < pr->to; i++)
scratch[i] = pattern(patt++);
/* Compare the images */
for (i = 0; i < bufsize; i++) {
KUNIT_EXPECT_EQ_MSG(test, buffer[i], scratch[i], "at i=%x", i);
if (buffer[i] != scratch[i])
return;
}
stop:
KUNIT_SUCCEED(test);
}
/*
* Test copying from an ITER_FOLIOQ-type iterator.
*/
static void __init iov_kunit_copy_from_folioq(struct kunit *test)
{
const struct kvec_test_range *pr;
struct iov_iter iter;
struct folio_queue *folioq;
struct page **spages, **bpages;
u8 *scratch, *buffer;
size_t bufsize, npages, size, copied;
int i, j;
bufsize = 0x100000;
npages = bufsize / PAGE_SIZE;
folioq = iov_kunit_create_folioq(test);
buffer = iov_kunit_create_buffer(test, &bpages, npages);
for (i = 0; i < bufsize; i++)
buffer[i] = pattern(i);
scratch = iov_kunit_create_buffer(test, &spages, npages);
memset(scratch, 0, bufsize);
iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages);
i = 0;
for (pr = kvec_test_ranges; pr->from >= 0; pr++) {
size = pr->to - pr->from;
KUNIT_ASSERT_LE(test, pr->to, bufsize);
iov_iter_folio_queue(&iter, WRITE, folioq, 0, 0, pr->to);
iov_iter_advance(&iter, pr->from);
copied = copy_from_iter(scratch + i, size, &iter);
KUNIT_EXPECT_EQ(test, copied, size);
KUNIT_EXPECT_EQ(test, iter.count, 0);
KUNIT_EXPECT_EQ(test, iter.iov_offset, pr->to % PAGE_SIZE);
i += size;
}
/* Build the expected image in the main buffer. */
i = 0;
memset(buffer, 0, bufsize);
for (pr = kvec_test_ranges; pr->from >= 0; pr++) {
for (j = pr->from; j < pr->to; j++) {
buffer[i++] = pattern(j);
if (i >= bufsize)
goto stop;
}
}
stop:
/* Compare the images */
for (i = 0; i < bufsize; i++) {
KUNIT_EXPECT_EQ_MSG(test, scratch[i], buffer[i], "at i=%x", i);
if (scratch[i] != buffer[i])
return;
}
KUNIT_SUCCEED(test);
}
static void iov_kunit_destroy_xarray(void *data)
{
struct xarray *xarray = data;
@ -677,6 +854,85 @@ stop:
KUNIT_SUCCEED(test);
}
/*
* Test the extraction of ITER_FOLIOQ-type iterators.
*/
static void __init iov_kunit_extract_pages_folioq(struct kunit *test)
{
const struct kvec_test_range *pr;
struct folio_queue *folioq;
struct iov_iter iter;
struct page **bpages, *pagelist[8], **pages = pagelist;
ssize_t len;
size_t bufsize, size = 0, npages;
int i, from;
bufsize = 0x100000;
npages = bufsize / PAGE_SIZE;
folioq = iov_kunit_create_folioq(test);
iov_kunit_create_buffer(test, &bpages, npages);
iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages);
for (pr = kvec_test_ranges; pr->from >= 0; pr++) {
from = pr->from;
size = pr->to - from;
KUNIT_ASSERT_LE(test, pr->to, bufsize);
iov_iter_folio_queue(&iter, WRITE, folioq, 0, 0, pr->to);
iov_iter_advance(&iter, from);
do {
size_t offset0 = LONG_MAX;
for (i = 0; i < ARRAY_SIZE(pagelist); i++)
pagelist[i] = (void *)(unsigned long)0xaa55aa55aa55aa55ULL;
len = iov_iter_extract_pages(&iter, &pages, 100 * 1024,
ARRAY_SIZE(pagelist), 0, &offset0);
KUNIT_EXPECT_GE(test, len, 0);
if (len < 0)
break;
KUNIT_EXPECT_LE(test, len, size);
KUNIT_EXPECT_EQ(test, iter.count, size - len);
if (len == 0)
break;
size -= len;
KUNIT_EXPECT_GE(test, (ssize_t)offset0, 0);
KUNIT_EXPECT_LT(test, offset0, PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(pagelist); i++) {
struct page *p;
ssize_t part = min_t(ssize_t, len, PAGE_SIZE - offset0);
int ix;
KUNIT_ASSERT_GE(test, part, 0);
ix = from / PAGE_SIZE;
KUNIT_ASSERT_LT(test, ix, npages);
p = bpages[ix];
KUNIT_EXPECT_PTR_EQ(test, pagelist[i], p);
KUNIT_EXPECT_EQ(test, offset0, from % PAGE_SIZE);
from += part;
len -= part;
KUNIT_ASSERT_GE(test, len, 0);
if (len == 0)
break;
offset0 = 0;
}
if (test->status == KUNIT_FAILURE)
goto stop;
} while (iov_iter_count(&iter) > 0);
KUNIT_EXPECT_EQ(test, size, 0);
KUNIT_EXPECT_EQ(test, iter.count, 0);
}
stop:
KUNIT_SUCCEED(test);
}
/*
* Test the extraction of ITER_XARRAY-type iterators.
*/
@ -761,10 +1017,13 @@ static struct kunit_case __refdata iov_kunit_cases[] = {
KUNIT_CASE(iov_kunit_copy_from_kvec),
KUNIT_CASE(iov_kunit_copy_to_bvec),
KUNIT_CASE(iov_kunit_copy_from_bvec),
KUNIT_CASE(iov_kunit_copy_to_folioq),
KUNIT_CASE(iov_kunit_copy_from_folioq),
KUNIT_CASE(iov_kunit_copy_to_xarray),
KUNIT_CASE(iov_kunit_copy_from_xarray),
KUNIT_CASE(iov_kunit_extract_pages_kvec),
KUNIT_CASE(iov_kunit_extract_pages_bvec),
KUNIT_CASE(iov_kunit_extract_pages_folioq),
KUNIT_CASE(iov_kunit_extract_pages_xarray),
{}
};


@ -11,6 +11,7 @@
#include <linux/kmemleak.h>
#include <linux/bvec.h>
#include <linux/uio.h>
#include <linux/folio_queue.h>
/**
* sg_next - return the next scatterlist entry in a list
@ -1261,6 +1262,67 @@ static ssize_t extract_kvec_to_sg(struct iov_iter *iter,
return ret;
}
/*
* Extract up to sg_max folios from a FOLIOQ-type iterator and add them to
* the scatterlist. The pages are not pinned.
*/
static ssize_t extract_folioq_to_sg(struct iov_iter *iter,
ssize_t maxsize,
struct sg_table *sgtable,
unsigned int sg_max,
iov_iter_extraction_t extraction_flags)
{
const struct folio_queue *folioq = iter->folioq;
struct scatterlist *sg = sgtable->sgl + sgtable->nents;
unsigned int slot = iter->folioq_slot;
ssize_t ret = 0;
size_t offset = iter->iov_offset;
BUG_ON(!folioq);
if (slot >= folioq_nr_slots(folioq)) {
folioq = folioq->next;
if (WARN_ON_ONCE(!folioq))
return 0;
slot = 0;
}
do {
struct folio *folio = folioq_folio(folioq, slot);
size_t fsize = folioq_folio_size(folioq, slot);
if (offset < fsize) {
size_t part = umin(maxsize - ret, fsize - offset);
sg_set_page(sg, folio_page(folio, 0), part, offset);
sgtable->nents++;
sg++;
sg_max--;
offset += part;
ret += part;
}
if (offset >= fsize) {
offset = 0;
slot++;
if (slot >= folioq_nr_slots(folioq)) {
if (!folioq->next) {
WARN_ON_ONCE(ret < iter->count);
break;
}
folioq = folioq->next;
slot = 0;
}
}
} while (sg_max > 0 && ret < maxsize);
iter->folioq = folioq;
iter->folioq_slot = slot;
iter->iov_offset = offset;
iter->count -= ret;
return ret;
}
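/*
 * Not part of the patch: an illustrative caller, using the public
 * extract_iter_to_sg() entry point that dispatches to the helper above for
 * ITER_FOLIOQ iterators.  As the kerneldoc below notes, no end mark is placed
 * on the scatterlist, so the caller adds one itself.  The size and the name
 * folioq_iter_to_sg() are invented for the example.
 */
#define EXAMPLE_MAX_SGS 16

static ssize_t folioq_iter_to_sg(struct iov_iter *iter, size_t len,
                                 struct scatterlist *sgl /* [EXAMPLE_MAX_SGS] */)
{
        struct sg_table sgtable = { .sgl = sgl };
        ssize_t n;

        sg_init_table(sgl, EXAMPLE_MAX_SGS);
        n = extract_iter_to_sg(iter, len, &sgtable, EXAMPLE_MAX_SGS, 0);
        if (n > 0)
                /* End-marking is left to the caller. */
                sg_mark_end(&sgl[sgtable.nents - 1]);
        return n;
}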
/*
* Extract up to sg_max folios from an XARRAY-type iterator and add them to
* the scatterlist. The pages are not pinned.
@ -1323,8 +1385,8 @@ static ssize_t extract_xarray_to_sg(struct iov_iter *iter,
* addition of @sg_max elements.
*
* The pages referred to by UBUF- and IOVEC-type iterators are extracted and
* pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE-
* and DISCARD-type are not supported.
* pinned; BVEC-, KVEC-, FOLIOQ- and XARRAY-type are extracted but aren't
* pinned; DISCARD-type is not supported.
*
* No end mark is placed on the scatterlist; that's left to the caller.
*
@ -1356,6 +1418,9 @@ ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t maxsize,
case ITER_KVEC:
return extract_kvec_to_sg(iter, maxsize, sgtable, sg_max,
extraction_flags);
case ITER_FOLIOQ:
return extract_folioq_to_sg(iter, maxsize, sgtable, sg_max,
extraction_flags);
case ITER_XARRAY:
return extract_xarray_to_sg(iter, maxsize, sgtable, sg_max,
extraction_flags);