/* * linux/fs/nfs/pagelist.c * * A set of helper functions for managing NFS read and write requests. * The main purpose of these routines is to provide support for the * coalescing of several requests into a single RPC call. * * Copyright 2000, 2001 (c) Trond Myklebust * */ #include #include #include #include #include #include #include #include #include #include #include #include "internal.h" #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE static struct kmem_cache *nfs_page_cachep; static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) { p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); if (!p->pagevec) p->npages = 0; } return p->pagevec != NULL; } void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)) { hdr->req = nfs_list_entry(desc->pg_list.next); hdr->inode = desc->pg_inode; hdr->cred = hdr->req->wb_context->cred; hdr->io_start = req_offset(hdr->req); hdr->good_bytes = desc->pg_count; hdr->dreq = desc->pg_dreq; hdr->layout_private = desc->pg_layout_private; hdr->release = release; hdr->completion_ops = desc->pg_completion_ops; if (hdr->completion_ops->init_hdr) hdr->completion_ops->init_hdr(hdr); } EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { spin_lock(&hdr->lock); if (pos < hdr->io_start + hdr->good_bytes) { set_bit(NFS_IOHDR_ERROR, &hdr->flags); clear_bit(NFS_IOHDR_EOF, &hdr->flags); hdr->good_bytes = pos - hdr->io_start; hdr->error = error; } spin_unlock(&hdr->lock); } static inline struct nfs_page * nfs_page_alloc(void) { struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO); if (p) INIT_LIST_HEAD(&p->wb_list); return p; } static inline void nfs_page_free(struct nfs_page *p) { kmem_cache_free(nfs_page_cachep, p); } static void nfs_iocounter_inc(struct nfs_io_counter *c) { atomic_inc(&c->io_count); } static void nfs_iocounter_dec(struct nfs_io_counter *c) { if (atomic_dec_and_test(&c->io_count)) { clear_bit(NFS_IO_INPROGRESS, &c->flags); smp_mb__after_clear_bit(); wake_up_bit(&c->flags, NFS_IO_INPROGRESS); } } static int __nfs_iocounter_wait(struct nfs_io_counter *c) { wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); int ret = 0; do { prepare_to_wait(wq, &q.wait, TASK_KILLABLE); set_bit(NFS_IO_INPROGRESS, &c->flags); if (atomic_read(&c->io_count) == 0) break; ret = nfs_wait_bit_killable(&c->flags); } while (atomic_read(&c->io_count) != 0); finish_wait(wq, &q.wait); return ret; } /** * nfs_iocounter_wait - wait for i/o to complete * @c: nfs_io_counter to use * * returns -ERESTARTSYS if interrupted by a fatal signal. * Otherwise returns 0 once the io_count hits 0. */ int nfs_iocounter_wait(struct nfs_io_counter *c) { if (atomic_read(&c->io_count) == 0) return 0; return __nfs_iocounter_wait(c); } /** * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use * @inode: inode to which the request is attached * @page: page to write * @offset: starting offset within the page for the write * @count: number of bytes to read/write * * The page must be locked by the caller. This makes sure we never * create two different requests for the same page. * User should ensure it is safe to sleep in this function. */ struct nfs_page * nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, struct page *page, unsigned int offset, unsigned int count) { struct nfs_page *req; struct nfs_lock_context *l_ctx; if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) return ERR_PTR(-EBADF); /* try to allocate the request struct */ req = nfs_page_alloc(); if (req == NULL) return ERR_PTR(-ENOMEM); /* get lock context early so we can deal with alloc failures */ l_ctx = nfs_get_lock_context(ctx); if (IS_ERR(l_ctx)) { nfs_page_free(req); return ERR_CAST(l_ctx); } req->wb_lock_context = l_ctx; nfs_iocounter_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ req->wb_page = page; req->wb_index = page_file_index(page); page_cache_get(page); req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); kref_init(&req->wb_kref); return req; } /** * nfs_unlock_request - Unlock request and wake up sleepers. * @req: */ void nfs_unlock_request(struct nfs_page *req) { if (!NFS_WBACK_BUSY(req)) { printk(KERN_ERR "NFS: Invalid unlock attempted\n"); BUG(); } smp_mb__before_clear_bit(); clear_bit(PG_BUSY, &req->wb_flags); smp_mb__after_clear_bit(); wake_up_bit(&req->wb_flags, PG_BUSY); } /** * nfs_unlock_and_release_request - Unlock request and release the nfs_page * @req: */ void nfs_unlock_and_release_request(struct nfs_page *req) { nfs_unlock_request(req); nfs_release_request(req); } /* * nfs_clear_request - Free up all resources allocated to the request * @req: * * Release page and open context resources associated with a read/write * request after it has completed. */ static void nfs_clear_request(struct nfs_page *req) { struct page *page = req->wb_page; struct nfs_open_context *ctx = req->wb_context; struct nfs_lock_context *l_ctx = req->wb_lock_context; if (page != NULL) { page_cache_release(page); req->wb_page = NULL; } if (l_ctx != NULL) { nfs_iocounter_dec(&l_ctx->io_count); nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } if (ctx != NULL) { put_nfs_open_context(ctx); req->wb_context = NULL; } } /** * nfs_release_request - Release the count on an NFS read/write request * @req: request to release * * Note: Should never be called with the spinlock held! */ static void nfs_free_request(struct kref *kref) { struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); /* Release struct file and open context */ nfs_clear_request(req); nfs_page_free(req); } void nfs_release_request(struct nfs_page *req) { kref_put(&req->wb_kref, nfs_free_request); } static int nfs_wait_bit_uninterruptible(void *word) { io_schedule(); return 0; } /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. * * Interruptible by fatal signals only. * The user is responsible for holding a count on the request. */ int nfs_wait_on_request(struct nfs_page *req) { return wait_on_bit(&req->wb_flags, PG_BUSY, nfs_wait_bit_uninterruptible, TASK_UNINTERRUPTIBLE); } bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) { /* * FIXME: ideally we should be able to coalesce all requests * that are not block boundary aligned, but currently this * is problematic for the case of bsize < PAGE_CACHE_SIZE, * since nfs_flush_multi and nfs_pagein_multi assume you * can have only one struct nfs_page. */ if (desc->pg_bsize < PAGE_SIZE) return 0; return desc->pg_count + req->wb_bytes <= desc->pg_bsize; } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) { return container_of(hdr, struct nfs_rw_header, header); } /** * nfs_rw_header_alloc - Allocate a header for a read or write * @ops: Read or write function vector */ struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) { struct nfs_rw_header *header = ops->rw_alloc_header(); if (header) { struct nfs_pgio_header *hdr = &header->header; INIT_LIST_HEAD(&hdr->pages); INIT_LIST_HEAD(&hdr->rpc_list); spin_lock_init(&hdr->lock); atomic_set(&hdr->refcnt, 0); hdr->rw_ops = ops; } return header; } EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); /* * nfs_rw_header_free - Free a read or write header * @hdr: The header to free */ void nfs_rw_header_free(struct nfs_pgio_header *hdr) { hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); } EXPORT_SYMBOL_GPL(nfs_rw_header_free); /** * nfs_pgio_data_alloc - Allocate pageio data * @hdr: The header making a request * @pagecount: Number of pages to create */ struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, unsigned int pagecount) { struct nfs_pgio_data *data, *prealloc; prealloc = &NFS_RW_HEADER(hdr)->rpc_data; if (prealloc->header == NULL) data = prealloc; else data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; if (nfs_pgarray_set(&data->pages, pagecount)) { data->header = hdr; atomic_inc(&hdr->refcnt); } else { if (data != prealloc) kfree(data); data = NULL; } out: return data; } /** * nfs_pgio_data_release - Properly free pageio data * @data: The data to release */ void nfs_pgio_data_release(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); put_nfs_open_context(data->args.context); if (data->pages.pagevec != data->pages.page_array) kfree(data->pages.pagevec); if (data == &pageio_header->rpc_data) { data->header = NULL; data = NULL; } if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); /* Note: we only free the rpc_task after callbacks are done. * See the comment in rpc_free_task() for why */ kfree(data); } EXPORT_SYMBOL_GPL(nfs_pgio_data_release); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call * @data: The pageio data * @count: Number of bytes to read * @offset: Initial offset * @how: How to commit data (writes only) * @cinfo: Commit information for the call (writes only) */ void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, unsigned int count, unsigned int offset, int how, struct nfs_commit_info *cinfo) { struct nfs_page *req = data->header->req; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ data->args.fh = NFS_FH(data->header->inode); data->args.offset = req_offset(req) + offset; /* pnfs_set_layoutcommit needs this */ data->mds_offset = data->args.offset; data->args.pgbase = req->wb_pgbase + offset; data->args.pages = data->pages.pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { case 0: break; case FLUSH_COND_STABLE: if (nfs_reqs_to_commit(cinfo)) break; default: data->args.stable = NFS_FILE_SYNC; } data->res.fattr = &data->fattr; data->res.count = count; data->res.eof = 0; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); } /** * nfs_pgio_prepare - Prepare pageio data to go over the wire * @task: The current task * @calldata: pageio data to prepare */ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) { struct nfs_pgio_data *data = calldata; int err; err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); if (err) rpc_exit(task, err); } /** * nfs_pgio_release - Release pageio data * @calldata: The pageio data to release */ static void nfs_pgio_release(void *calldata) { struct nfs_pgio_data *data = calldata; if (data->header->rw_ops->rw_release) data->header->rw_ops->rw_release(data); nfs_pgio_data_release(data); } /** * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor * @inode: pointer to inode * @doio: pointer to io function * @bsize: io block size * @io_flags: extra parameters for the io function */ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, const struct nfs_pageio_ops *pg_ops, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, int io_flags) { INIT_LIST_HEAD(&desc->pg_list); desc->pg_bytes_written = 0; desc->pg_count = 0; desc->pg_bsize = bsize; desc->pg_base = 0; desc->pg_moreio = 0; desc->pg_recoalesce = 0; desc->pg_inode = inode; desc->pg_ops = pg_ops; desc->pg_completion_ops = compl_ops; desc->pg_rw_ops = rw_ops; desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; desc->pg_dreq = NULL; desc->pg_layout_private = NULL; } EXPORT_SYMBOL_GPL(nfs_pageio_init); /** * nfs_pgio_result - Basic pageio error handling * @task: The task that ran * @calldata: Pageio data to check */ static void nfs_pgio_result(struct rpc_task *task, void *calldata) { struct nfs_pgio_data *data = calldata; struct inode *inode = data->header->inode; dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, task->tk_status); if (data->header->rw_ops->rw_done(task, data, inode) != 0) return; if (task->tk_status < 0) nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); else data->header->rw_ops->rw_result(task, data); } static bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; } static bool nfs_match_lock_context(const struct nfs_lock_context *l1, const struct nfs_lock_context *l2) { return l1->lockowner.l_owner == l2->lockowner.l_owner && l1->lockowner.l_pid == l2->lockowner.l_pid; } /** * nfs_can_coalesce_requests - test two requests for compatibility * @prev: pointer to nfs_page * @req: pointer to nfs_page * * The nfs_page structures 'prev' and 'req' are compared to ensure that the * page data area they describe is contiguous, and that their RPC * credentials, NFSv4 open state, and lockowners are the same. * * Return 'true' if this is the case, else return 'false'. */ static bool nfs_can_coalesce_requests(struct nfs_page *prev, struct nfs_page *req, struct nfs_pageio_descriptor *pgio) { if (!nfs_match_open_context(req->wb_context, prev->wb_context)) return false; if (req->wb_context->dentry->d_inode->i_flock != NULL && !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) return false; if (req->wb_pgbase != 0) return false; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) return false; if (req_offset(req) != req_offset(prev) + prev->wb_bytes) return false; return pgio->pg_ops->pg_test(pgio, prev, req); } /** * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list. * @desc: destination io descriptor * @req: request * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. */ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { if (desc->pg_count != 0) { struct nfs_page *prev; prev = nfs_list_entry(desc->pg_list.prev); if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); desc->pg_base = req->wb_pgbase; } nfs_list_remove_request(req); nfs_list_add_request(req, &desc->pg_list); desc->pg_count += req->wb_bytes; return 1; } /* * Helper for nfs_pageio_add_request and nfs_pageio_complete */ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { if (!list_empty(&desc->pg_list)) { int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; else desc->pg_bytes_written += desc->pg_count; } if (list_empty(&desc->pg_list)) { desc->pg_count = 0; desc->pg_base = 0; } } /** * nfs_pageio_add_request - Attempt to coalesce a request into a page list. * @desc: destination io descriptor * @req: request * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. */ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { while (!nfs_pageio_do_add_request(desc, req)) { desc->pg_moreio = 1; nfs_pageio_doio(desc); if (desc->pg_error < 0) return 0; desc->pg_moreio = 0; if (desc->pg_recoalesce) return 0; } return 1; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) { LIST_HEAD(head); do { list_splice_init(&desc->pg_list, &head); desc->pg_bytes_written -= desc->pg_count; desc->pg_count = 0; desc->pg_base = 0; desc->pg_recoalesce = 0; while (!list_empty(&head)) { struct nfs_page *req; req = list_first_entry(&head, struct nfs_page, wb_list); nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; if (desc->pg_error < 0) return 0; break; } } while (desc->pg_recoalesce); return 1; } int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { int ret; do { ret = __nfs_pageio_add_request(desc, req); if (ret) break; if (desc->pg_error < 0) break; ret = nfs_do_recoalesce(desc); } while (ret); return ret; } EXPORT_SYMBOL_GPL(nfs_pageio_add_request); /** * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor * @desc: pointer to io descriptor */ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) { for (;;) { nfs_pageio_doio(desc); if (!desc->pg_recoalesce) break; if (!nfs_do_recoalesce(desc)) break; } } EXPORT_SYMBOL_GPL(nfs_pageio_complete); /** * nfs_pageio_cond_complete - Conditional I/O completion * @desc: pointer to io descriptor * @index: page index * * It is important to ensure that processes don't try to take locks * on non-contiguous ranges of pages as that might deadlock. This * function should be called before attempting to wait on a locked * nfs_page. It will complete the I/O if the page index 'index' * is not contiguous with the existing list of pages in 'desc'. */ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) { if (!list_empty(&desc->pg_list)) { struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); if (index != prev->wb_index + 1) nfs_pageio_complete(desc); } } int __init nfs_init_nfspagecache(void) { nfs_page_cachep = kmem_cache_create("nfs_page", sizeof(struct nfs_page), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_page_cachep == NULL) return -ENOMEM; return 0; } void nfs_destroy_nfspagecache(void) { kmem_cache_destroy(nfs_page_cachep); } const struct rpc_call_ops nfs_pgio_common_ops = { .rpc_call_prepare = nfs_pgio_prepare, .rpc_call_done = nfs_pgio_result, .rpc_release = nfs_pgio_release, };