Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (28 commits)
  ceph: update discussion list address in MAINTAINERS
  ceph: some documentations fixes
  ceph: fix use after free on mds __unregister_request
  ceph: avoid loaded term 'OSD' in documention
  ceph: fix possible double-free of mds request reference
  ceph: fix session check on mds reply
  ceph: handle kmalloc() failure
  ceph: propagate mds session allocation failures to caller
  ceph: make write_begin wait propagate ERESTARTSYS
  ceph: fix snap rebuild condition
  ceph: avoid reopening osd connections when address hasn't changed
  ceph: rename r_sent_stamp r_stamp
  ceph: fix connection fault con_work reentrancy problem
  ceph: prevent dup stale messages to console for restarting mds
  ceph: fix pg pool decoding from incremental osdmap update
  ceph: fix mds sync() race with completing requests
  ceph: only release unused caps with mds requests
  ceph: clean up handle_cap_grant, handle_caps wrt session mutex
  ceph: fix session locking in handle_caps, ceph_check_caps
  ceph: drop unnecessary WARN_ON in caps migration
  ...
This commit is contained in:
Linus Torvalds 2010-03-29 14:42:25 -07:00
commit 9f32160372
15 changed files with 191 additions and 97 deletions

View File

@ -16,6 +16,8 @@ befs.txt
- information about the BeOS filesystem for Linux. - information about the BeOS filesystem for Linux.
bfs.txt bfs.txt
- info for the SCO UnixWare Boot Filesystem (BFS). - info for the SCO UnixWare Boot Filesystem (BFS).
ceph.txt
- info for the Ceph Distributed File System
cifs.txt cifs.txt
- description of the CIFS filesystem. - description of the CIFS filesystem.
coda.txt coda.txt

View File

@ -8,7 +8,7 @@ Basic features include:
* POSIX semantics * POSIX semantics
* Seamless scaling from 1 to many thousands of nodes * Seamless scaling from 1 to many thousands of nodes
* High availability and reliability. No single points of failure. * High availability and reliability. No single point of failure.
* N-way replication of data across storage nodes * N-way replication of data across storage nodes
* Fast recovery from node failures * Fast recovery from node failures
* Automatic rebalancing of data on node addition/removal * Automatic rebalancing of data on node addition/removal
@ -94,7 +94,7 @@ Mount Options
wsize=X wsize=X
Specify the maximum write size in bytes. By default there is no Specify the maximum write size in bytes. By default there is no
maximu. Ceph will normally size writes based on the file stripe maximum. Ceph will normally size writes based on the file stripe
size. size.
rsize=X rsize=X
@ -115,7 +115,7 @@ Mount Options
number of entries in that directory. number of entries in that directory.
nocrc nocrc
Disable CRC32C calculation for data writes. If set, the OSD Disable CRC32C calculation for data writes. If set, the storage node
must rely on TCP's error correction to detect data corruption must rely on TCP's error correction to detect data corruption
in the data payload. in the data payload.
@ -133,7 +133,8 @@ For more information on Ceph, see the home page at
http://ceph.newdream.net/ http://ceph.newdream.net/
The Linux kernel client source tree is available at The Linux kernel client source tree is available at
git://ceph.newdream.net/linux-ceph-client.git git://ceph.newdream.net/git/ceph-client.git
git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
and the source for the full system is at and the source for the full system is at
git://ceph.newdream.net/ceph.git git://ceph.newdream.net/git/ceph.git

View File

@ -1443,7 +1443,7 @@ F: arch/powerpc/platforms/cell/
CEPH DISTRIBUTED FILE SYSTEM CLIENT CEPH DISTRIBUTED FILE SYSTEM CLIENT
M: Sage Weil <sage@newdream.net> M: Sage Weil <sage@newdream.net>
L: ceph-devel@lists.sourceforge.net L: ceph-devel@vger.kernel.org
W: http://ceph.newdream.net/ W: http://ceph.newdream.net/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
S: Supported S: Supported

View File

@ -919,6 +919,10 @@ static int context_is_writeable_or_written(struct inode *inode,
/* /*
* We are only allowed to write into/dirty the page if the page is * We are only allowed to write into/dirty the page if the page is
* clean, or already dirty within the same snap context. * clean, or already dirty within the same snap context.
*
* called with page locked.
* return success with page locked,
* or any failure (incl -EAGAIN) with page unlocked.
*/ */
static int ceph_update_writeable_page(struct file *file, static int ceph_update_writeable_page(struct file *file,
loff_t pos, unsigned len, loff_t pos, unsigned len,
@ -961,9 +965,11 @@ retry_locked:
snapc = ceph_get_snap_context((void *)page->private); snapc = ceph_get_snap_context((void *)page->private);
unlock_page(page); unlock_page(page);
ceph_queue_writeback(inode); ceph_queue_writeback(inode);
wait_event_interruptible(ci->i_cap_wq, r = wait_event_interruptible(ci->i_cap_wq,
context_is_writeable_or_written(inode, snapc)); context_is_writeable_or_written(inode, snapc));
ceph_put_snap_context(snapc); ceph_put_snap_context(snapc);
if (r == -ERESTARTSYS)
return r;
return -EAGAIN; return -EAGAIN;
} }
@ -1035,7 +1041,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
int r; int r;
do { do {
/* get a page*/ /* get a page */
page = grab_cache_page_write_begin(mapping, index, 0); page = grab_cache_page_write_begin(mapping, index, 0);
if (!page) if (!page)
return -ENOMEM; return -ENOMEM;

View File

@ -28,6 +28,12 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
return (ac->want_keys & xi->have_keys) == ac->want_keys; return (ac->want_keys & xi->have_keys) == ac->want_keys;
} }
static int ceph_x_encrypt_buflen(int ilen)
{
return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
sizeof(u32);
}
static int ceph_x_encrypt(struct ceph_crypto_key *secret, static int ceph_x_encrypt(struct ceph_crypto_key *secret,
void *ibuf, int ilen, void *obuf, size_t olen) void *ibuf, int ilen, void *obuf, size_t olen)
{ {
@ -150,6 +156,11 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
struct timespec validity; struct timespec validity;
struct ceph_crypto_key old_key; struct ceph_crypto_key old_key;
void *tp, *tpend; void *tp, *tpend;
struct ceph_timespec new_validity;
struct ceph_crypto_key new_session_key;
struct ceph_buffer *new_ticket_blob;
unsigned long new_expires, new_renew_after;
u64 new_secret_id;
ceph_decode_need(&p, end, sizeof(u32) + 1, bad); ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
@ -182,16 +193,16 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
goto bad; goto bad;
memcpy(&old_key, &th->session_key, sizeof(old_key)); memcpy(&old_key, &th->session_key, sizeof(old_key));
ret = ceph_crypto_key_decode(&th->session_key, &dp, dend); ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
if (ret) if (ret)
goto out; goto out;
ceph_decode_copy(&dp, &th->validity, sizeof(th->validity)); ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
ceph_decode_timespec(&validity, &th->validity); ceph_decode_timespec(&validity, &new_validity);
th->expires = get_seconds() + validity.tv_sec; new_expires = get_seconds() + validity.tv_sec;
th->renew_after = th->expires - (validity.tv_sec / 4); new_renew_after = new_expires - (validity.tv_sec / 4);
dout(" expires=%lu renew_after=%lu\n", th->expires, dout(" expires=%lu renew_after=%lu\n", new_expires,
th->renew_after); new_renew_after);
/* ticket blob for service */ /* ticket blob for service */
ceph_decode_8_safe(&p, end, is_enc, bad); ceph_decode_8_safe(&p, end, is_enc, bad);
@ -216,10 +227,21 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
dout(" ticket blob is %d bytes\n", dlen); dout(" ticket blob is %d bytes\n", dlen);
ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
struct_v = ceph_decode_8(&tp); struct_v = ceph_decode_8(&tp);
th->secret_id = ceph_decode_64(&tp); new_secret_id = ceph_decode_64(&tp);
ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend); ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
if (ret) if (ret)
goto out; goto out;
/* all is well, update our ticket */
ceph_crypto_key_destroy(&th->session_key);
if (th->ticket_blob)
ceph_buffer_put(th->ticket_blob);
th->session_key = new_session_key;
th->ticket_blob = new_ticket_blob;
th->validity = new_validity;
th->secret_id = new_secret_id;
th->expires = new_expires;
th->renew_after = new_renew_after;
dout(" got ticket service %d (%s) secret_id %lld len %d\n", dout(" got ticket service %d (%s) secret_id %lld len %d\n",
type, ceph_entity_type_name(type), th->secret_id, type, ceph_entity_type_name(type), th->secret_id,
(int)th->ticket_blob->vec.iov_len); (int)th->ticket_blob->vec.iov_len);
@ -242,7 +264,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
struct ceph_x_ticket_handler *th, struct ceph_x_ticket_handler *th,
struct ceph_x_authorizer *au) struct ceph_x_authorizer *au)
{ {
int len; int maxlen;
struct ceph_x_authorize_a *msg_a; struct ceph_x_authorize_a *msg_a;
struct ceph_x_authorize_b msg_b; struct ceph_x_authorize_b msg_b;
void *p, *end; void *p, *end;
@ -253,15 +275,15 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
dout("build_authorizer for %s %p\n", dout("build_authorizer for %s %p\n",
ceph_entity_type_name(th->service), au); ceph_entity_type_name(th->service), au);
len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) + maxlen = sizeof(*msg_a) + sizeof(msg_b) +
ticket_blob_len + 16; ceph_x_encrypt_buflen(ticket_blob_len);
dout(" need len %d\n", len); dout(" need len %d\n", maxlen);
if (au->buf && au->buf->alloc_len < len) { if (au->buf && au->buf->alloc_len < maxlen) {
ceph_buffer_put(au->buf); ceph_buffer_put(au->buf);
au->buf = NULL; au->buf = NULL;
} }
if (!au->buf) { if (!au->buf) {
au->buf = ceph_buffer_new(len, GFP_NOFS); au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
if (!au->buf) if (!au->buf)
return -ENOMEM; return -ENOMEM;
} }
@ -296,6 +318,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
au->buf->vec.iov_len = p - au->buf->vec.iov_base; au->buf->vec.iov_len = p - au->buf->vec.iov_base;
dout(" built authorizer nonce %llx len %d\n", au->nonce, dout(" built authorizer nonce %llx len %d\n", au->nonce,
(int)au->buf->vec.iov_len); (int)au->buf->vec.iov_len);
BUG_ON(au->buf->vec.iov_len > maxlen);
return 0; return 0;
out_buf: out_buf:

View File

@ -1407,6 +1407,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
*/ */
void ceph_check_caps(struct ceph_inode_info *ci, int flags, void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session) struct ceph_mds_session *session)
__releases(session->s_mutex)
{ {
struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
struct ceph_mds_client *mdsc = &client->mdsc; struct ceph_mds_client *mdsc = &client->mdsc;
@ -1414,7 +1415,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_cap *cap; struct ceph_cap *cap;
int file_wanted, used; int file_wanted, used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int drop_session_lock = session ? 0 : 1;
int issued, implemented, want, retain, revoking, flushing = 0; int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */ to avoid an infinite loop on retry */
@ -1639,7 +1639,7 @@ ack:
if (queue_invalidate) if (queue_invalidate)
ceph_queue_invalidate(inode); ceph_queue_invalidate(inode);
if (session && drop_session_lock) if (session)
mutex_unlock(&session->s_mutex); mutex_unlock(&session->s_mutex);
if (took_snap_rwsem) if (took_snap_rwsem)
up_read(&mdsc->snap_rwsem); up_read(&mdsc->snap_rwsem);
@ -2195,18 +2195,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
* Handle a cap GRANT message from the MDS. (Note that a GRANT may * Handle a cap GRANT message from the MDS. (Note that a GRANT may
* actually be a revocation if it specifies a smaller cap set.) * actually be a revocation if it specifies a smaller cap set.)
* *
* caller holds s_mutex. * caller holds s_mutex and i_lock, we drop both.
*
* return value: * return value:
* 0 - ok * 0 - ok
* 1 - check_caps on auth cap only (writeback) * 1 - check_caps on auth cap only (writeback)
* 2 - check_caps (ack revoke) * 2 - check_caps (ack revoke)
*/ */
static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
struct ceph_mds_session *session, struct ceph_mds_session *session,
struct ceph_cap *cap, struct ceph_cap *cap,
struct ceph_buffer *xattr_buf) struct ceph_buffer *xattr_buf)
__releases(inode->i_lock) __releases(inode->i_lock)
__releases(session->s_mutex)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds; int mds = session->s_mds;
@ -2216,7 +2217,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
u64 size = le64_to_cpu(grant->size); u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size); u64 max_size = le64_to_cpu(grant->max_size);
struct timespec mtime, atime, ctime; struct timespec mtime, atime, ctime;
int reply = 0; int check_caps = 0;
int wake = 0; int wake = 0;
int writeback = 0; int writeback = 0;
int revoked_rdcache = 0; int revoked_rdcache = 0;
@ -2329,11 +2330,12 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
writeback = 1; /* will delay ack */ writeback = 1; /* will delay ack */
else if (dirty & ~newcaps) else if (dirty & ~newcaps)
reply = 1; /* initiate writeback in check_caps */ check_caps = 1; /* initiate writeback in check_caps */
else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
revoked_rdcache) revoked_rdcache)
reply = 2; /* send revoke ack in check_caps */ check_caps = 2; /* send revoke ack in check_caps */
cap->issued = newcaps; cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) { } else if (cap->issued == newcaps) {
dout("caps unchanged: %s -> %s\n", dout("caps unchanged: %s -> %s\n",
ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
@ -2346,6 +2348,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
* pending revocation */ * pending revocation */
wake = 1; wake = 1;
} }
BUG_ON(cap->issued & ~cap->implemented);
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
if (writeback) if (writeback)
@ -2359,7 +2362,14 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_queue_invalidate(inode); ceph_queue_invalidate(inode);
if (wake) if (wake)
wake_up(&ci->i_cap_wq); wake_up(&ci->i_cap_wq);
return reply;
if (check_caps == 1)
ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
session);
else if (check_caps == 2)
ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
else
mutex_unlock(&session->s_mutex);
} }
/* /*
@ -2548,9 +2558,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
ci->i_cap_exporting_issued = cap->issued; ci->i_cap_exporting_issued = cap->issued;
} }
__ceph_remove_cap(cap); __ceph_remove_cap(cap);
} else {
WARN_ON(!cap);
} }
/* else, we already released it */
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
} }
@ -2621,9 +2630,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u64 cap_id; u64 cap_id;
u64 size, max_size; u64 size, max_size;
u64 tid; u64 tid;
int check_caps = 0;
void *snaptrace; void *snaptrace;
int r;
dout("handle_caps from mds%d\n", mds); dout("handle_caps from mds%d\n", mds);
@ -2668,8 +2675,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_IMPORT: case CEPH_CAP_OP_IMPORT:
handle_cap_import(mdsc, inode, h, session, handle_cap_import(mdsc, inode, h, session,
snaptrace, le32_to_cpu(h->snap_trace_len)); snaptrace, le32_to_cpu(h->snap_trace_len));
check_caps = 1; /* we may have sent a RELEASE to the old auth */ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
goto done; session);
goto done_unlocked;
} }
/* the rest require a cap */ /* the rest require a cap */
@ -2686,16 +2694,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
switch (op) { switch (op) {
case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT: case CEPH_CAP_OP_GRANT:
r = handle_cap_grant(inode, h, session, cap, msg->middle); handle_cap_grant(inode, h, session, cap, msg->middle);
if (r == 1) goto done_unlocked;
ceph_check_caps(ceph_inode(inode),
CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
session);
else if (r == 2)
ceph_check_caps(ceph_inode(inode),
CHECK_CAPS_NODELAY,
session);
break;
case CEPH_CAP_OP_FLUSH_ACK: case CEPH_CAP_OP_FLUSH_ACK:
handle_cap_flush_ack(inode, tid, h, session, cap); handle_cap_flush_ack(inode, tid, h, session, cap);
@ -2713,9 +2713,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
done: done:
mutex_unlock(&session->s_mutex); mutex_unlock(&session->s_mutex);
done_unlocked:
if (check_caps)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
if (inode) if (inode)
iput(inode); iput(inode);
return; return;
@ -2838,11 +2836,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
struct ceph_cap *cap; struct ceph_cap *cap;
struct ceph_mds_request_release *rel = *p; struct ceph_mds_request_release *rel = *p;
int ret = 0; int ret = 0;
int used = 0;
dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
mds, ceph_cap_string(drop), ceph_cap_string(unless));
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
used = __ceph_caps_used(ci);
dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
mds, ceph_cap_string(used), ceph_cap_string(drop),
ceph_cap_string(unless));
/* only drop unused caps */
drop &= ~used;
cap = __get_cap_for_mds(ci, mds); cap = __get_cap_for_mds(ci, mds);
if (cap && __cap_is_valid(cap)) { if (cap && __cap_is_valid(cap)) {
if (force || if (force ||

View File

@ -288,8 +288,10 @@ more:
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
/* discard old result, if any */ /* discard old result, if any */
if (fi->last_readdir) if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir); ceph_mdsc_put_request(fi->last_readdir);
fi->last_readdir = NULL;
}
/* requery frag tree, as the frag topology may have changed */ /* requery frag tree, as the frag topology may have changed */
frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);

View File

@ -378,6 +378,22 @@ void ceph_destroy_inode(struct inode *inode)
ceph_queue_caps_release(inode); ceph_queue_caps_release(inode);
/*
* we may still have a snap_realm reference if there are stray
* caps in i_cap_exporting_issued or i_snap_caps.
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
&ceph_client(ci->vfs_inode.i_sb)->mdsc;
struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n", realm);
spin_lock(&realm->inodes_with_caps_lock);
list_del_init(&ci->i_snap_realm_item);
spin_unlock(&realm->inodes_with_caps_lock);
ceph_put_snap_realm(mdsc, realm);
}
kfree(ci->i_symlink); kfree(ci->i_symlink);
while ((n = rb_first(&ci->i_fragtree)) != NULL) { while ((n = rb_first(&ci->i_fragtree)) != NULL) {
frag = rb_entry(n, struct ceph_inode_frag, node); frag = rb_entry(n, struct ceph_inode_frag, node);

View File

@ -328,6 +328,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s; struct ceph_mds_session *s;
s = kzalloc(sizeof(*s), GFP_NOFS); s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s)
return ERR_PTR(-ENOMEM);
s->s_mdsc = mdsc; s->s_mdsc = mdsc;
s->s_mds = mds; s->s_mds = mds;
s->s_state = CEPH_MDS_SESSION_NEW; s->s_state = CEPH_MDS_SESSION_NEW;
@ -529,7 +531,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
{ {
dout("__unregister_request %p tid %lld\n", req, req->r_tid); dout("__unregister_request %p tid %lld\n", req, req->r_tid);
rb_erase(&req->r_node, &mdsc->request_tree); rb_erase(&req->r_node, &mdsc->request_tree);
ceph_mdsc_put_request(req); RB_CLEAR_NODE(&req->r_node);
if (req->r_unsafe_dir) { if (req->r_unsafe_dir) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@ -538,6 +540,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
list_del_init(&req->r_unsafe_dir_item); list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock); spin_unlock(&ci->i_unsafe_lock);
} }
ceph_mdsc_put_request(req);
} }
/* /*
@ -862,6 +866,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
if (time_after_eq(jiffies, session->s_cap_ttl) && if (time_after_eq(jiffies, session->s_cap_ttl) &&
time_after_eq(session->s_cap_ttl, session->s_renew_requested)) time_after_eq(session->s_cap_ttl, session->s_renew_requested))
pr_info("mds%d caps stale\n", session->s_mds); pr_info("mds%d caps stale\n", session->s_mds);
session->s_renew_requested = jiffies;
/* do not try to renew caps until a recovering mds has reconnected /* do not try to renew caps until a recovering mds has reconnected
* with its clients. */ * with its clients. */
@ -874,7 +879,6 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
dout("send_renew_caps to mds%d (%s)\n", session->s_mds, dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
ceph_mds_state_name(state)); ceph_mds_state_name(state));
session->s_renew_requested = jiffies;
msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
++session->s_renew_seq); ++session->s_renew_seq);
if (IS_ERR(msg)) if (IS_ERR(msg))
@ -1566,8 +1570,13 @@ static int __do_request(struct ceph_mds_client *mdsc,
/* get, open session */ /* get, open session */
session = __ceph_lookup_mds_session(mdsc, mds); session = __ceph_lookup_mds_session(mdsc, mds);
if (!session) if (!session) {
session = register_session(mdsc, mds); session = register_session(mdsc, mds);
if (IS_ERR(session)) {
err = PTR_ERR(session);
goto finish;
}
}
dout("do_request mds%d session %p state %s\n", mds, session, dout("do_request mds%d session %p state %s\n", mds, session,
session_state_name(session->s_state)); session_state_name(session->s_state));
if (session->s_state != CEPH_MDS_SESSION_OPEN && if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@ -1770,7 +1779,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
dout("handle_reply %p\n", req); dout("handle_reply %p\n", req);
/* correct session? */ /* correct session? */
if (!req->r_session && req->r_session != session) { if (req->r_session != session) {
pr_err("mdsc_handle_reply got %llu on session mds%d" pr_err("mdsc_handle_reply got %llu on session mds%d"
" not mds%d\n", tid, session->s_mds, " not mds%d\n", tid, session->s_mds,
req->r_session ? req->r_session->s_mds : -1); req->r_session ? req->r_session->s_mds : -1);
@ -2682,29 +2691,41 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
*/ */
static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
{ {
struct ceph_mds_request *req = NULL; struct ceph_mds_request *req = NULL, *nextreq;
struct rb_node *n; struct rb_node *n;
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
dout("wait_unsafe_requests want %lld\n", want_tid); dout("wait_unsafe_requests want %lld\n", want_tid);
restart:
req = __get_oldest_req(mdsc); req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) { while (req && req->r_tid <= want_tid) {
/* find next request */
n = rb_next(&req->r_node);
if (n)
nextreq = rb_entry(n, struct ceph_mds_request, r_node);
else
nextreq = NULL;
if ((req->r_op & CEPH_MDS_OP_WRITE)) { if ((req->r_op & CEPH_MDS_OP_WRITE)) {
/* write op */ /* write op */
ceph_mdsc_get_request(req); ceph_mdsc_get_request(req);
if (nextreq)
ceph_mdsc_get_request(nextreq);
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
dout("wait_unsafe_requests wait on %llu (want %llu)\n", dout("wait_unsafe_requests wait on %llu (want %llu)\n",
req->r_tid, want_tid); req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion); wait_for_completion(&req->r_safe_completion);
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
n = rb_next(&req->r_node);
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
} else { if (!nextreq)
n = rb_next(&req->r_node); break; /* next dne before, so we're done! */
if (RB_EMPTY_NODE(&nextreq->r_node)) {
/* next request was removed from tree */
ceph_mdsc_put_request(nextreq);
goto restart;
}
ceph_mdsc_put_request(nextreq); /* won't go away */
} }
if (!n) req = nextreq;
break;
req = rb_entry(n, struct ceph_mds_request, r_node);
} }
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
dout("wait_unsafe_requests done\n"); dout("wait_unsafe_requests done\n");

View File

@ -365,6 +365,14 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
queue_con(con); queue_con(con);
} }
/*
* return true if this connection ever successfully opened
*/
bool ceph_con_opened(struct ceph_connection *con)
{
return con->connect_seq > 0;
}
/* /*
* generic get/put * generic get/put
*/ */
@ -830,13 +838,6 @@ static void prepare_read_connect(struct ceph_connection *con)
con->in_base_pos = 0; con->in_base_pos = 0;
} }
static void prepare_read_connect_retry(struct ceph_connection *con)
{
dout("prepare_read_connect_retry %p\n", con);
con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
+ sizeof(con->peer_addr_for_me);
}
static void prepare_read_ack(struct ceph_connection *con) static void prepare_read_ack(struct ceph_connection *con)
{ {
dout("prepare_read_ack %p\n", con); dout("prepare_read_ack %p\n", con);
@ -1146,7 +1147,7 @@ static int process_connect(struct ceph_connection *con)
} }
con->auth_retry = 1; con->auth_retry = 1;
prepare_write_connect(con->msgr, con, 0); prepare_write_connect(con->msgr, con, 0);
prepare_read_connect_retry(con); prepare_read_connect(con);
break; break;
case CEPH_MSGR_TAG_RESETSESSION: case CEPH_MSGR_TAG_RESETSESSION:
@ -1843,8 +1844,6 @@ static void ceph_fault(struct ceph_connection *con)
goto out; goto out;
} }
clear_bit(BUSY, &con->state); /* to avoid an improbable race */
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
if (test_bit(CLOSED, &con->state)) if (test_bit(CLOSED, &con->state))
goto out_unlock; goto out_unlock;

View File

@ -223,6 +223,7 @@ extern void ceph_con_init(struct ceph_messenger *msgr,
struct ceph_connection *con); struct ceph_connection *con);
extern void ceph_con_open(struct ceph_connection *con, extern void ceph_con_open(struct ceph_connection *con,
struct ceph_entity_addr *addr); struct ceph_entity_addr *addr);
extern bool ceph_con_opened(struct ceph_connection *con);
extern void ceph_con_close(struct ceph_connection *con); extern void ceph_con_close(struct ceph_connection *con);
extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);

View File

@ -413,11 +413,22 @@ static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
*/ */
static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{ {
struct ceph_osd_request *req;
int ret = 0; int ret = 0;
dout("__reset_osd %p osd%d\n", osd, osd->o_osd); dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
if (list_empty(&osd->o_requests)) { if (list_empty(&osd->o_requests)) {
__remove_osd(osdc, osd); __remove_osd(osdc, osd);
} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
&osd->o_con.peer_addr,
sizeof(osd->o_con.peer_addr)) == 0 &&
!ceph_con_opened(&osd->o_con)) {
dout(" osd addr hasn't changed and connection never opened,"
" letting msgr retry");
/* touch each r_stamp for handle_timeout()'s benefit */
list_for_each_entry(req, &osd->o_requests, r_osd_item)
req->r_stamp = jiffies;
ret = -EAGAIN;
} else { } else {
ceph_con_close(&osd->o_con); ceph_con_close(&osd->o_con);
ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
@ -633,7 +644,7 @@ static int __send_request(struct ceph_osd_client *osdc,
reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
reqhead->reassert_version = req->r_reassert_version; reqhead->reassert_version = req->r_reassert_version;
req->r_sent_stamp = jiffies; req->r_stamp = jiffies;
list_move_tail(&osdc->req_lru, &req->r_req_lru_item); list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
ceph_msg_get(req->r_request); /* send consumes a ref */ ceph_msg_get(req->r_request); /* send consumes a ref */
@ -660,7 +671,7 @@ static void handle_timeout(struct work_struct *work)
unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
unsigned long keepalive = unsigned long keepalive =
osdc->client->mount_args->osd_keepalive_timeout * HZ; osdc->client->mount_args->osd_keepalive_timeout * HZ;
unsigned long last_sent = 0; unsigned long last_stamp = 0;
struct rb_node *p; struct rb_node *p;
struct list_head slow_osds; struct list_head slow_osds;
@ -697,12 +708,12 @@ static void handle_timeout(struct work_struct *work)
req = list_entry(osdc->req_lru.next, struct ceph_osd_request, req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
r_req_lru_item); r_req_lru_item);
if (time_before(jiffies, req->r_sent_stamp + timeout)) if (time_before(jiffies, req->r_stamp + timeout))
break; break;
BUG_ON(req == last_req && req->r_sent_stamp == last_sent); BUG_ON(req == last_req && req->r_stamp == last_stamp);
last_req = req; last_req = req;
last_sent = req->r_sent_stamp; last_stamp = req->r_stamp;
osd = req->r_osd; osd = req->r_osd;
BUG_ON(!osd); BUG_ON(!osd);
@ -718,7 +729,7 @@ static void handle_timeout(struct work_struct *work)
*/ */
INIT_LIST_HEAD(&slow_osds); INIT_LIST_HEAD(&slow_osds);
list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
if (time_before(jiffies, req->r_sent_stamp + keepalive)) if (time_before(jiffies, req->r_stamp + keepalive))
break; break;
osd = req->r_osd; osd = req->r_osd;
@ -862,7 +873,9 @@ static int __kick_requests(struct ceph_osd_client *osdc,
dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
if (kickosd) { if (kickosd) {
__reset_osd(osdc, kickosd); err = __reset_osd(osdc, kickosd);
if (err == -EAGAIN)
return 1;
} else { } else {
for (p = rb_first(&osdc->osds); p; p = n) { for (p = rb_first(&osdc->osds); p; p = n) {
struct ceph_osd *osd = struct ceph_osd *osd =
@ -913,7 +926,7 @@ static int __kick_requests(struct ceph_osd_client *osdc,
kick: kick:
dout("kicking %p tid %llu osd%d\n", req, req->r_tid, dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
req->r_osd->o_osd); req->r_osd ? req->r_osd->o_osd : -1);
req->r_flags |= CEPH_OSD_FLAG_RETRY; req->r_flags |= CEPH_OSD_FLAG_RETRY;
err = __send_request(osdc, req); err = __send_request(osdc, req);
if (err) { if (err) {

View File

@ -70,7 +70,7 @@ struct ceph_osd_request {
char r_oid[40]; /* object name */ char r_oid[40]; /* object name */
int r_oid_len; int r_oid_len;
unsigned long r_sent_stamp; unsigned long r_stamp; /* send OR check time */
bool r_resend; /* msg send failed, needs retry */ bool r_resend; /* msg send failed, needs retry */
struct ceph_file_layout r_file_layout; struct ceph_file_layout r_file_layout;

View File

@ -480,6 +480,14 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
return NULL; return NULL;
} }
void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
{
ceph_decode_copy(p, &pi->v, sizeof(pi->v));
calc_pg_masks(pi);
*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
}
/* /*
* decode a full map. * decode a full map.
*/ */
@ -526,12 +534,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
ev, CEPH_PG_POOL_VERSION); ev, CEPH_PG_POOL_VERSION);
goto bad; goto bad;
} }
ceph_decode_copy(p, &pi->v, sizeof(pi->v)); __decode_pool(p, pi);
__insert_pg_pool(&map->pg_pools, pi); __insert_pg_pool(&map->pg_pools, pi);
calc_pg_masks(pi);
*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
*p += le32_to_cpu(pi->v.num_removed_snap_intervals)
* sizeof(u64) * 2;
} }
ceph_decode_32_safe(p, end, map->pool_max, bad); ceph_decode_32_safe(p, end, map->pool_max, bad);
@ -714,8 +718,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
pi->id = pool; pi->id = pool;
__insert_pg_pool(&map->pg_pools, pi); __insert_pg_pool(&map->pg_pools, pi);
} }
ceph_decode_copy(p, &pi->v, sizeof(pi->v)); __decode_pool(p, pi);
calc_pg_masks(pi);
} }
/* old_pool */ /* old_pool */

View File

@ -314,9 +314,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
because we rebuild_snap_realms() works _downward_ in because we rebuild_snap_realms() works _downward_ in
hierarchy after each update.) */ hierarchy after each update.) */
if (realm->cached_context && if (realm->cached_context &&
realm->cached_context->seq <= realm->seq && realm->cached_context->seq == realm->seq &&
(!parent || (!parent ||
realm->cached_context->seq <= parent->cached_context->seq)) { realm->cached_context->seq >= parent->cached_context->seq)) {
dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
" (unchanged)\n", " (unchanged)\n",
realm->ino, realm, realm->cached_context, realm->ino, realm, realm->cached_context,
@ -818,7 +818,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
* queued (again) by ceph_update_snap_trace() * queued (again) by ceph_update_snap_trace()
* below. Queue it _now_, under the old context. * below. Queue it _now_, under the old context.
*/ */
spin_lock(&realm->inodes_with_caps_lock);
list_del_init(&ci->i_snap_realm_item); list_del_init(&ci->i_snap_realm_item);
spin_unlock(&realm->inodes_with_caps_lock);
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
ceph_queue_cap_snap(ci, ceph_queue_cap_snap(ci,