A big pile of assorted fixes and improvements for the filesystem with
nothing in particular standing out, except perhaps that the fact that
the MDS never really maintained atime was made official and thus it's
no longer updated on the client either.
 
 We also have a MAINTAINERS update: Jeff is transitioning his filesystem
 maintainership duties to Xiubo.
 -----BEGIN PGP SIGNATURE-----
 
 iQFHBAABCAAxFiEEydHwtzie9C7TfviiSn/eOAIR84sFAmKYs1wTHGlkcnlvbW92
 QGdtYWlsLmNvbQAKCRBKf944AhHzi+PvCACIj47W4FapO672xcIkQ4920ZT1Jw/o
 2BfKXUtNyVLpGgBlweJWSTd1tfXp0tl9MFg00t/zbVarHH0SGAgF1z6e/tM7rjA/
 vyCkFQXJDuwB0kCbCtZ9xt5XIQkkvPPJOmyLSKYl7RqImch7pTRd5IwxgKGWqXDx
 FraVXqFqvr8L+szV/JCopdxdMVTFixWRD48z5pPlOReaOXiGjtTMoFIBIPp7GqVL
 UB7wyOtDmyzcGnUsRNqMQFrkUBsBW1IEDKf/yVtQNDjUxmr3uXm8vugeISpMOGBO
 cCkZACDeO0lpgHrXSo4UCf46bg3/HujxZu0nTc9HqPDiFdOmKmf58N4n
 =MAi2
 -----END PGP SIGNATURE-----

Merge tag 'ceph-for-5.19-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "A big pile of assorted fixes and improvements for the filesystem with
  nothing in particular standing out, except perhaps that the fact that
  the MDS never really maintained atime was made official and thus it's
  no longer updated on the client either.

  We also have a MAINTAINERS update: Jeff is transitioning his
  filesystem maintainership duties to Xiubo"

* tag 'ceph-for-5.19-rc1' of https://github.com/ceph/ceph-client: (23 commits)
  MAINTAINERS: move myself from ceph "Maintainer" to "Reviewer"
  ceph: fix decoding of client session messages flags
  ceph: switch TASK_INTERRUPTIBLE to TASK_KILLABLE
  ceph: remove redundant variable ino
  ceph: try to queue a writeback if revoking fails
  ceph: fix statfs for subdir mounts
  ceph: fix possible deadlock when holding Fwb to get inline_data
  ceph: redirty the page for writepage on failure
  ceph: try to choose the auth MDS if possible for getattr
  ceph: disable updating the atime since cephfs won't maintain it
  ceph: flush the mdlog for filesystem sync
  ceph: rename unsafe_request_wait()
  libceph: use swap() macro instead of taking tmp variable
  ceph: fix statx AT_STATX_DONT_SYNC vs AT_STATX_FORCE_SYNC check
  ceph: no need to invalidate the fscache twice
  ceph: replace usage of found with dedicated list iterator variable
  ceph: use dedicated list iterator variable
  ceph: update the dlease for the hashed dentry when removing
  ceph: stop retrying the request when exceeding 256 times
  ceph: stop forwarding the request when exceeding 256 times
  ...
This commit is contained in:
Linus Torvalds 2022-06-02 08:59:39 -07:00
commit 17d8e3d90b
12 changed files with 257 additions and 107 deletions

View File

@ -4566,8 +4566,8 @@ F: drivers/power/supply/cw2015_battery.c
CEPH COMMON CODE (LIBCEPH) CEPH COMMON CODE (LIBCEPH)
M: Ilya Dryomov <idryomov@gmail.com> M: Ilya Dryomov <idryomov@gmail.com>
M: Jeff Layton <jlayton@kernel.org>
M: Xiubo Li <xiubli@redhat.com> M: Xiubo Li <xiubli@redhat.com>
R: Jeff Layton <jlayton@kernel.org>
L: ceph-devel@vger.kernel.org L: ceph-devel@vger.kernel.org
S: Supported S: Supported
W: http://ceph.com/ W: http://ceph.com/
@ -4577,9 +4577,9 @@ F: include/linux/crush/
F: net/ceph/ F: net/ceph/
CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH)
M: Jeff Layton <jlayton@kernel.org>
M: Xiubo Li <xiubli@redhat.com> M: Xiubo Li <xiubli@redhat.com>
M: Ilya Dryomov <idryomov@gmail.com> M: Ilya Dryomov <idryomov@gmail.com>
R: Jeff Layton <jlayton@kernel.org>
L: ceph-devel@vger.kernel.org L: ceph-devel@vger.kernel.org
S: Supported S: Supported
W: http://ceph.com/ W: http://ceph.com/

View File

@ -756,24 +756,23 @@ static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
*/ */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{ {
struct rbd_client *client_node; struct rbd_client *rbdc = NULL, *iter;
bool found = false;
if (ceph_opts->flags & CEPH_OPT_NOSHARE) if (ceph_opts->flags & CEPH_OPT_NOSHARE)
return NULL; return NULL;
spin_lock(&rbd_client_list_lock); spin_lock(&rbd_client_list_lock);
list_for_each_entry(client_node, &rbd_client_list, node) { list_for_each_entry(iter, &rbd_client_list, node) {
if (!ceph_compare_options(ceph_opts, client_node->client)) { if (!ceph_compare_options(ceph_opts, iter->client)) {
__rbd_get_client(client_node); __rbd_get_client(iter);
found = true; rbdc = iter;
break; break;
} }
} }
spin_unlock(&rbd_client_list_lock); spin_unlock(&rbd_client_list_lock);
return found ? client_node : NULL; return rbdc;
} }
/* /*

View File

@ -256,6 +256,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
struct iov_iter iter; struct iov_iter iter;
ssize_t err = 0; ssize_t err = 0;
size_t len; size_t len;
int mode;
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
@ -264,7 +265,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
goto out; goto out;
/* We need to fetch the inline data. */ /* We need to fetch the inline data. */
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req)) { if (IS_ERR(req)) {
err = PTR_ERR(req); err = PTR_ERR(req);
goto out; goto out;
@ -604,8 +606,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
ceph_wbc.truncate_seq, ceph_wbc.truncate_size, ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
true); true);
if (IS_ERR(req)) if (IS_ERR(req)) {
redirty_page_for_writepage(wbc, page);
return PTR_ERR(req); return PTR_ERR(req);
}
set_page_writeback(page); set_page_writeback(page);
if (caching) if (caching)
@ -1644,7 +1648,7 @@ int ceph_uninline_data(struct file *file)
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req; struct ceph_osd_request *req = NULL;
struct ceph_cap_flush *prealloc_cf; struct ceph_cap_flush *prealloc_cf;
struct folio *folio = NULL; struct folio *folio = NULL;
u64 inline_version = CEPH_INLINE_NONE; u64 inline_version = CEPH_INLINE_NONE;
@ -1652,18 +1656,6 @@ int ceph_uninline_data(struct file *file)
int err = 0; int err = 0;
u64 len; u64 len;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
folio = read_mapping_folio(inode->i_mapping, 0, file);
if (IS_ERR(folio)) {
err = PTR_ERR(folio);
goto out;
}
folio_lock(folio);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
inline_version = ci->i_inline_version; inline_version = ci->i_inline_version;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
@ -1671,9 +1663,23 @@ int ceph_uninline_data(struct file *file)
dout("uninline_data %p %llx.%llx inline_version %llu\n", dout("uninline_data %p %llx.%llx inline_version %llu\n",
inode, ceph_vinop(inode), inline_version); inode, ceph_vinop(inode), inline_version);
if (inline_version == 1 || /* initial version, no data */ if (inline_version == CEPH_INLINE_NONE)
inline_version == CEPH_INLINE_NONE) return 0;
goto out_unlock;
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
if (inline_version == 1) /* initial version, no data */
goto out_uninline;
folio = read_mapping_folio(inode->i_mapping, 0, file);
if (IS_ERR(folio)) {
err = PTR_ERR(folio);
goto out;
}
folio_lock(folio);
len = i_size_read(inode); len = i_size_read(inode);
if (len > folio_size(folio)) if (len > folio_size(folio))
@ -1739,6 +1745,7 @@ int ceph_uninline_data(struct file *file)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err); req->r_end_latency, len, err);
out_uninline:
if (!err) { if (!err) {
int dirty; int dirty;
@ -1757,8 +1764,10 @@ out_put_req:
if (err == -ECANCELED) if (err == -ECANCELED)
err = 0; err = 0;
out_unlock: out_unlock:
folio_unlock(folio); if (folio) {
folio_put(folio); folio_unlock(folio);
folio_put(folio);
}
out: out:
ceph_free_cap_flush(prealloc_cf); ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
@ -1777,7 +1786,6 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
if (!mapping->a_ops->read_folio) if (!mapping->a_ops->read_folio)
return -ENOEXEC; return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &ceph_vmops; vma->vm_ops = &ceph_vmops;
return 0; return 0;
} }

View File

@ -1577,7 +1577,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
while (first_tid <= last_tid) { while (first_tid <= last_tid) {
struct ceph_cap *cap = ci->i_auth_cap; struct ceph_cap *cap = ci->i_auth_cap;
struct ceph_cap_flush *cf; struct ceph_cap_flush *cf = NULL, *iter;
int ret; int ret;
if (!(cap && cap->session == session)) { if (!(cap && cap->session == session)) {
@ -1587,8 +1587,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
} }
ret = -ENOENT; ret = -ENOENT;
list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
if (cf->tid >= first_tid) { if (iter->tid >= first_tid) {
cf = iter;
ret = 0; ret = 0;
break; break;
} }
@ -1910,6 +1911,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct rb_node *p; struct rb_node *p;
bool queue_invalidate = false; bool queue_invalidate = false;
bool tried_invalidate = false; bool tried_invalidate = false;
bool queue_writeback = false;
if (session) if (session)
ceph_get_mds_session(session); ceph_get_mds_session(session);
@ -2062,10 +2064,27 @@ retry:
} }
/* completed revocation? going down and there are no caps? */ /* completed revocation? going down and there are no caps? */
if (revoking && (revoking & cap_used) == 0) { if (revoking) {
dout("completed revocation of %s\n", if ((revoking & cap_used) == 0) {
ceph_cap_string(cap->implemented & ~cap->issued)); dout("completed revocation of %s\n",
goto ack; ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
}
/*
* If the "i_wrbuffer_ref" was increased by mmap or generic
* cache write just before the ceph_check_caps() is called,
* the Fb capability revoking will fail this time. Then we
* must wait for the BDI's delayed work to flush the dirty
* pages and to release the "i_wrbuffer_ref", which will cost
* at most 5 seconds. That means the MDS needs to wait at
most 5 seconds to finish the Fb capability's revocation.
*
* Let's queue a writeback for it.
*/
if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
(revoking & CEPH_CAP_FILE_BUFFER))
queue_writeback = true;
} }
/* want more caps from mds? */ /* want more caps from mds? */
@ -2135,6 +2154,8 @@ ack:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
ceph_put_mds_session(session); ceph_put_mds_session(session);
if (queue_writeback)
ceph_queue_writeback(inode);
if (queue_invalidate) if (queue_invalidate)
ceph_queue_invalidate(inode); ceph_queue_invalidate(inode);
} }
@ -2218,9 +2239,9 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
} }
/* /*
* wait for any unsafe requests to complete. * flush the mdlog and wait for any unsafe requests to complete.
*/ */
static int unsafe_request_wait(struct inode *inode) static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{ {
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
@ -2336,7 +2357,7 @@ retry:
kfree(sessions); kfree(sessions);
} }
dout("unsafe_request_wait %p wait on tid %llu %llu\n", dout("%s %p wait on tid %llu %llu\n", __func__,
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) { if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion, ret = !wait_for_completion_timeout(&req1->r_safe_completion,
@ -2380,7 +2401,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dirty = try_flush_caps(inode, &flush_tid); dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
err = unsafe_request_wait(inode); err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
/* /*
* only wait on non-file metadata writeback (the mds * only wait on non-file metadata writeback (the mds
@ -3182,10 +3203,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc) struct ceph_snap_context *snapc)
{ {
struct inode *inode = &ci->vfs_inode; struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap = NULL; struct ceph_cap_snap *capsnap = NULL, *iter;
int put = 0; int put = 0;
bool last = false; bool last = false;
bool found = false;
bool flush_snaps = false; bool flush_snaps = false;
bool complete_capsnap = false; bool complete_capsnap = false;
@ -3212,14 +3232,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
last ? " LAST" : ""); last ? " LAST" : "");
} else { } else {
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
if (capsnap->context == snapc) { if (iter->context == snapc) {
found = true; capsnap = iter;
break; break;
} }
} }
if (!found) { if (!capsnap) {
/* /*
* The capsnap should already be removed when removing * The capsnap should already be removed when removing
* auth cap in the case of a forced unmount. * auth cap in the case of a forced unmount.
@ -3769,8 +3789,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows); u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap; struct ceph_cap_snap *capsnap = NULL, *iter;
bool flushed = false;
bool wake_ci = false; bool wake_ci = false;
bool wake_mdsc = false; bool wake_mdsc = false;
@ -3778,26 +3797,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
inode, ci, session->s_mds, follows); inode, ci, session->s_mds, follows);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
if (capsnap->follows == follows) { if (iter->follows == follows) {
if (capsnap->cap_flush.tid != flush_tid) { if (iter->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !=" dout(" cap_snap %p follows %lld tid %lld !="
" %lld\n", capsnap, follows, " %lld\n", iter, follows,
flush_tid, capsnap->cap_flush.tid); flush_tid, iter->cap_flush.tid);
break; break;
} }
flushed = true; capsnap = iter;
break; break;
} else { } else {
dout(" skipping cap_snap %p follows %lld\n", dout(" skipping cap_snap %p follows %lld\n",
capsnap, capsnap->follows); iter, iter->follows);
} }
} }
if (flushed) if (capsnap)
ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (flushed) { if (capsnap) {
ceph_put_snap_context(capsnap->context); ceph_put_snap_context(capsnap->context);
ceph_put_cap_snap(capsnap); ceph_put_cap_snap(capsnap);
if (wake_ci) if (wake_ci)

View File

@ -578,7 +578,7 @@ void ceph_evict_inode(struct inode *inode)
__ceph_remove_caps(ci); __ceph_remove_caps(ci);
if (__ceph_has_any_quota(ci)) if (__ceph_has_quota(ci, QUOTA_GET_ANY))
ceph_adjust_quota_realms_count(inode, false); ceph_adjust_quota_realms_count(inode, false);
/* /*
@ -1466,10 +1466,12 @@ retry_lookup:
} else if (have_lease) { } else if (have_lease) {
if (d_unhashed(dn)) if (d_unhashed(dn))
d_add(dn, NULL); d_add(dn, NULL);
}
if (!d_unhashed(dn) && have_lease)
update_dentry_lease(dir, dn, update_dentry_lease(dir, dn,
rinfo->dlease, session, rinfo->dlease, session,
req->r_request_started); req->r_request_started);
}
goto done; goto done;
} }
@ -1884,7 +1886,6 @@ static void ceph_do_invalidate_pages(struct inode *inode)
orig_gen = ci->i_rdcache_gen; orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
ceph_fscache_invalidate(inode, false);
if (invalidate_inode_pages2(inode->i_mapping) < 0) { if (invalidate_inode_pages2(inode->i_mapping) < 0) {
pr_err("invalidate_inode_pages2 %llx.%llx failed\n", pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
ceph_vinop(inode)); ceph_vinop(inode));
@ -2258,6 +2259,30 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err; return err;
} }
int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
{
int issued = ceph_caps_issued(ceph_inode(inode));
/*
* If any 'x' caps is issued we can just choose the auth MDS
* instead of the random replica MDSes. Because only when the
* Locker is in LOCK_EXEC state will the loner client could
* get the 'x' caps. And if we send the getattr requests to
* any replica MDS it must auth pin and tries to rdlock from
* the auth MDS, and then the auth MDS need to do the Locker
* state transition to LOCK_SYNC. And after that the lock state
* will change back.
*
* This cost much when doing the Locker state transition and
* usually will need to revoke caps from clients.
*/
if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
|| (mask & CEPH_STAT_RSTAT))
return USE_AUTH_MDS;
else
return USE_ANY_MDS;
}
/* /*
* Verify that we have a lease on the given mask. If not, * Verify that we have a lease on the given mask. If not,
* do a getattr against an mds. * do a getattr against an mds.
@ -2281,7 +2306,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
return 0; return 0;
mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; mode = ceph_try_to_choose_auth_mds(inode, mask);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req)) if (IS_ERR(req))
return PTR_ERR(req); return PTR_ERR(req);
@ -2423,7 +2448,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
return -ESTALE; return -ESTALE;
/* Skip the getattr altogether if we're asked not to sync */ /* Skip the getattr altogether if we're asked not to sync */
if (!(flags & AT_STATX_DONT_SYNC)) { if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
err = ceph_do_getattr(inode, err = ceph_do_getattr(inode,
statx_to_caps(request_mask, inode->i_mode), statx_to_caps(request_mask, inode->i_mode),
flags & AT_STATX_FORCE_SYNC); flags & AT_STATX_FORCE_SYNC);

View File

@ -437,7 +437,7 @@ static int ceph_parse_deleg_inos(void **p, void *end,
ceph_decode_32_safe(p, end, sets, bad); ceph_decode_32_safe(p, end, sets, bad);
dout("got %u sets of delegated inodes\n", sets); dout("got %u sets of delegated inodes\n", sets);
while (sets--) { while (sets--) {
u64 start, len, ino; u64 start, len;
ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, start, bad);
ceph_decode_64_safe(p, end, len, bad); ceph_decode_64_safe(p, end, len, bad);
@ -449,7 +449,7 @@ static int ceph_parse_deleg_inos(void **p, void *end,
continue; continue;
} }
while (len--) { while (len--) {
int err = xa_insert(&s->s_delegated_inos, ino = start++, int err = xa_insert(&s->s_delegated_inos, start++,
DELEGATED_INO_AVAILABLE, DELEGATED_INO_AVAILABLE,
GFP_KERNEL); GFP_KERNEL);
if (!err) { if (!err) {
@ -2651,7 +2651,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_mds_request_head_old *rhead; struct ceph_mds_request_head_old *rhead;
struct ceph_msg *msg; struct ceph_msg *msg;
int flags = 0; int flags = 0, max_retry;
/*
* The type of 'r_attempts' in kernel 'ceph_mds_request'
* is 'int', while in 'ceph_mds_request_head' the type of
* 'num_retry' is '__u8'. So in case the request retries
exceeding 256 times, the MDS will receive an incorrect
* retry seq.
*
* In this case it's usually a bug in the MDS and continuing to
* retry the request makes no sense.
*
* In future this could be fixed in ceph code, so avoid
* using the hardcode here.
*/
max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
max_retry = 1 << (max_retry * BITS_PER_BYTE);
if (req->r_attempts >= max_retry) {
pr_warn_ratelimited("%s request tid %llu seq overflow\n",
__func__, req->r_tid);
return -EMULTIHOP;
}
req->r_attempts++; req->r_attempts++;
if (req->r_inode) { if (req->r_inode) {
@ -2663,7 +2684,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
else else
req->r_sent_on_mseq = -1; req->r_sent_on_mseq = -1;
} }
dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
@ -3265,6 +3286,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
int err = -EINVAL; int err = -EINVAL;
void *p = msg->front.iov_base; void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len; void *end = p + msg->front.iov_len;
bool aborted = false;
ceph_decode_need(&p, end, 2*sizeof(u32), bad); ceph_decode_need(&p, end, 2*sizeof(u32), bad);
next_mds = ceph_decode_32(&p); next_mds = ceph_decode_32(&p);
@ -3273,16 +3295,41 @@ static void handle_forward(struct ceph_mds_client *mdsc,
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
req = lookup_get_request(mdsc, tid); req = lookup_get_request(mdsc, tid);
if (!req) { if (!req) {
mutex_unlock(&mdsc->mutex);
dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
goto out; /* dup reply? */ return; /* dup reply? */
} }
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
dout("forward tid %llu aborted, unregistering\n", tid); dout("forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req); __unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd) { } else if (fwd_seq <= req->r_num_fwd) {
dout("forward tid %llu to mds%d - old seq %d <= %d\n", /*
tid, next_mds, req->r_num_fwd, fwd_seq); * The type of 'num_fwd' in ceph 'MClientRequestForward'
* is 'int32_t', while in 'ceph_mds_request_head' the
* type is '__u8'. So in case the request bounces between
* MDSes exceeding 256 times, the client will get stuck.
*
* In this case it's usually a bug in the MDS and continuing to
* bounce the request makes no sense.
*
* In future this could be fixed in ceph code, so avoid
* using the hardcode here.
*/
int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
max = 1 << (max * BITS_PER_BYTE);
if (req->r_num_fwd >= max) {
mutex_lock(&req->r_fill_mutex);
req->r_err = -EMULTIHOP;
set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
mutex_unlock(&req->r_fill_mutex);
aborted = true;
pr_warn_ratelimited("forward tid %llu seq overflow\n",
tid);
} else {
dout("forward tid %llu to mds%d - old seq %d <= %d\n",
tid, next_mds, req->r_num_fwd, fwd_seq);
}
} else { } else {
/* resend. forward race not possible; mds would drop */ /* resend. forward race not possible; mds would drop */
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
@ -3294,9 +3341,12 @@ static void handle_forward(struct ceph_mds_client *mdsc,
put_request_session(req); put_request_session(req);
__do_request(mdsc, req); __do_request(mdsc, req);
} }
ceph_mdsc_put_request(req);
out:
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
/* kick calling process */
if (aborted)
complete_request(mdsc, req);
ceph_mdsc_put_request(req);
return; return;
bad: bad:
@ -3375,13 +3425,17 @@ static void handle_session(struct ceph_mds_session *session,
} }
if (msg_version >= 5) { if (msg_version >= 5) {
u32 flags; u32 flags, len;
/* version >= 4, struct_v, struct_cv, len, metric_spec */
ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad); /* version >= 4 */
ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
ceph_decode_32_safe(&p, end, len, bad); /* len */
ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
/* version >= 5, flags */ /* version >= 5, flags */
ceph_decode_32_safe(&p, end, flags, bad); ceph_decode_32_safe(&p, end, flags, bad);
if (flags & CEPH_SESSION_BLOCKLISTED) { if (flags & CEPH_SESSION_BLOCKLISTED) {
pr_warn("mds%d session blocklisted\n", session->s_mds); pr_warn("mds%d session blocklisted\n", session->s_mds);
blocklisted = true; blocklisted = true;
} }
} }
@ -4396,12 +4450,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
memcpy((void *)(lease + 1) + 4, memcpy((void *)(lease + 1) + 4,
dentry->d_name.name, dentry->d_name.len); dentry->d_name.name, dentry->d_name.len);
spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_lock);
/*
* if this is a preemptive lease RELEASE, no need to
* flush request stream, since the actual request will
* soon follow.
*/
msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
ceph_con_send(&session->s_con, msg); ceph_con_send(&session->s_con, msg);
} }
@ -4696,15 +4744,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
} }
/* /*
* wait for all write mds requests to flush. * flush the mdlog and wait for all write mds requests to flush.
*/ */
static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
u64 want_tid)
{ {
struct ceph_mds_request *req = NULL, *nextreq; struct ceph_mds_request *req = NULL, *nextreq;
struct ceph_mds_session *last_session = NULL;
struct rb_node *n; struct rb_node *n;
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
dout("wait_unsafe_requests want %lld\n", want_tid); dout("%s want %lld\n", __func__, want_tid);
restart: restart:
req = __get_oldest_req(mdsc); req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) { while (req && req->r_tid <= want_tid) {
@ -4716,14 +4766,32 @@ restart:
nextreq = NULL; nextreq = NULL;
if (req->r_op != CEPH_MDS_OP_SETFILELOCK && if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
(req->r_op & CEPH_MDS_OP_WRITE)) { (req->r_op & CEPH_MDS_OP_WRITE)) {
struct ceph_mds_session *s = req->r_session;
if (!s) {
req = nextreq;
continue;
}
/* write op */ /* write op */
ceph_mdsc_get_request(req); ceph_mdsc_get_request(req);
if (nextreq) if (nextreq)
ceph_mdsc_get_request(nextreq); ceph_mdsc_get_request(nextreq);
s = ceph_get_mds_session(s);
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
dout("wait_unsafe_requests wait on %llu (want %llu)\n",
/* send flush mdlog request to MDS */
if (last_session != s) {
send_flush_mdlog(s);
ceph_put_mds_session(last_session);
last_session = s;
} else {
ceph_put_mds_session(s);
}
dout("%s wait on %llu (want %llu)\n", __func__,
req->r_tid, want_tid); req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion); wait_for_completion(&req->r_safe_completion);
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
if (!nextreq) if (!nextreq)
@ -4738,7 +4806,8 @@ restart:
req = nextreq; req = nextreq;
} }
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
dout("wait_unsafe_requests done\n"); ceph_put_mds_session(last_session);
dout("%s done\n", __func__);
} }
void ceph_mdsc_sync(struct ceph_mds_client *mdsc) void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
@ -4767,7 +4836,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
dout("sync want tid %lld flush_seq %lld\n", dout("sync want tid %lld flush_seq %lld\n",
want_tid, want_flush); want_tid, want_flush);
wait_unsafe_requests(mdsc, want_tid); flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
wait_caps_flush(mdsc, want_flush); wait_caps_flush(mdsc, want_flush);
} }

View File

@ -579,7 +579,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
TASK_INTERRUPTIBLE); TASK_KILLABLE);
} }
extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);

View File

@ -195,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
/* /*
* This function walks through the snaprealm for an inode and returns the * This function walks through the snaprealm for an inode and returns the
* ceph_snap_realm for the first snaprealm that has quotas set (either max_files * ceph_snap_realm for the first snaprealm that has quotas set (max_files,
* or max_bytes). If the root is reached, return the root ceph_snap_realm * max_bytes, or any, depending on the 'which_quota' argument). If the root is
* instead. * reached, return the root ceph_snap_realm instead.
* *
* Note that the caller is responsible for calling ceph_put_snap_realm() on the * Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm. * returned realm.
@ -209,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
* will be restarted. * will be restarted.
*/ */
static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
struct inode *inode, bool retry) struct inode *inode,
enum quota_get_realm which_quota,
bool retry)
{ {
struct ceph_inode_info *ci = NULL; struct ceph_inode_info *ci = NULL;
struct ceph_snap_realm *realm, *next; struct ceph_snap_realm *realm, *next;
@ -248,7 +250,7 @@ restart:
} }
ci = ceph_inode(in); ci = ceph_inode(in);
has_quota = __ceph_has_any_quota(ci); has_quota = __ceph_has_quota(ci, which_quota);
iput(in); iput(in);
next = realm->parent; next = realm->parent;
@ -279,8 +281,8 @@ restart:
* dropped and we can then restart the whole operation. * dropped and we can then restart the whole operation.
*/ */
down_read(&mdsc->snap_rwsem); down_read(&mdsc->snap_rwsem);
old_realm = get_quota_realm(mdsc, old, true); old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true);
new_realm = get_quota_realm(mdsc, new, false); new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false);
if (PTR_ERR(new_realm) == -EAGAIN) { if (PTR_ERR(new_realm) == -EAGAIN) {
up_read(&mdsc->snap_rwsem); up_read(&mdsc->snap_rwsem);
if (old_realm) if (old_realm)
@ -483,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false; bool is_updated = false;
down_read(&mdsc->snap_rwsem); down_read(&mdsc->snap_rwsem);
realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root),
QUOTA_GET_MAX_BYTES, true);
up_read(&mdsc->snap_rwsem); up_read(&mdsc->snap_rwsem);
if (!realm) if (!realm)
return false; return false;

View File

@ -1119,6 +1119,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
s->s_time_gran = 1; s->s_time_gran = 1;
s->s_time_min = 0; s->s_time_min = 0;
s->s_time_max = U32_MAX; s->s_time_max = U32_MAX;
s->s_flags |= SB_NODIRATIME | SB_NOATIME;
ret = set_anon_super_fc(s, fc); ret = set_anon_super_fc(s, fc);
if (ret != 0) if (ret != 0)

View File

@ -1022,6 +1022,7 @@ static inline void ceph_queue_flush_snaps(struct inode *inode)
ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
} }
extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask);
extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force); int mask, bool force);
static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
@ -1278,9 +1279,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client);
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
/* quota.c */ /* quota.c */
static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci)
enum quota_get_realm {
QUOTA_GET_MAX_FILES,
QUOTA_GET_MAX_BYTES,
QUOTA_GET_ANY
};
static inline bool __ceph_has_quota(struct ceph_inode_info *ci,
enum quota_get_realm which)
{ {
return ci->i_max_files || ci->i_max_bytes; bool has_quota = false;
switch (which) {
case QUOTA_GET_MAX_BYTES:
has_quota = !!ci->i_max_bytes;
break;
case QUOTA_GET_MAX_FILES:
has_quota = !!ci->i_max_files;
break;
default:
has_quota = !!(ci->i_max_files || ci->i_max_bytes);
}
return has_quota;
} }
extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
@ -1289,10 +1310,10 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
u64 max_bytes, u64 max_files) u64 max_bytes, u64 max_files)
{ {
bool had_quota, has_quota; bool had_quota, has_quota;
had_quota = __ceph_has_any_quota(ci); had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
ci->i_max_bytes = max_bytes; ci->i_max_bytes = max_bytes;
ci->i_max_files = max_files; ci->i_max_files = max_files;
has_quota = __ceph_has_any_quota(ci); has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
if (had_quota != has_quota) if (had_quota != has_quota)
ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);

View File

@ -366,6 +366,14 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci,
} }
#define XATTR_RSTAT_FIELD(_type, _name) \ #define XATTR_RSTAT_FIELD(_type, _name) \
XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \
{ \
.name = CEPH_XATTR_NAME(_type, _name), \
.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
.exists_cb = NULL, \
.flags = VXATTR_FLAG_RSTAT, \
}
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \ #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \ { \
.name = CEPH_XATTR_NAME2(_type, _name, _field), \ .name = CEPH_XATTR_NAME2(_type, _name, _field), \
@ -404,7 +412,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rsubdirs), XATTR_RSTAT_FIELD(dir, rsubdirs),
XATTR_RSTAT_FIELD(dir, rsnaps), XATTR_RSTAT_FIELD(dir, rsnaps),
XATTR_RSTAT_FIELD(dir, rbytes), XATTR_RSTAT_FIELD(dir, rbytes),
XATTR_RSTAT_FIELD(dir, rctime), XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime),
{ {
.name = "ceph.dir.pin", .name = "ceph.dir.pin",
.name_size = sizeof("ceph.dir.pin"), .name_size = sizeof("ceph.dir.pin"),

View File

@ -906,7 +906,6 @@ int crush_do_rule(const struct crush_map *map,
int recurse_to_leaf; int recurse_to_leaf;
int wsize = 0; int wsize = 0;
int osize; int osize;
int *tmp;
const struct crush_rule *rule; const struct crush_rule *rule;
__u32 step; __u32 step;
int i, j; int i, j;
@ -1073,9 +1072,7 @@ int crush_do_rule(const struct crush_map *map,
memcpy(o, c, osize*sizeof(*o)); memcpy(o, c, osize*sizeof(*o));
/* swap o and w arrays */ /* swap o and w arrays */
tmp = o; swap(o, w);
o = w;
w = tmp;
wsize = osize; wsize = osize;
break; break;